Package Bio :: Package SeqIO :: Module PhylipIO
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqIO.PhylipIO

  1  # Copyright 2006, 2007 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Bio.SeqIO support for the "phylip" (PHYLIP) file format. 
  8   
  9  You were expected to use this module via the Bio.SeqIO functions. 
 10  This module has now been replaced by Bio.AlignIO.PhylipIO, and is 
 11  deprecated.""" 
 12   
 13  import warnings 
 14  warnings.warn("Bio.SeqIO.PhylipIO is deprecated.  You can continue to read" \ 
 15                + " and write 'clustal' files with Bio.SeqIO, but this is now" \ 
 16                + " handled via Bio.AlignIO internally.", 
 17                DeprecationWarning) 
 18   
 19  from Bio.Alphabet import single_letter_alphabet 
 20  from Bio.Seq import Seq 
 21  from Bio.SeqRecord import SeqRecord 
 22  from Interfaces import SequenceWriter 
 23  from sets import Set 
 24   
 25  #This is a generator function! 
 26  #TODO - Should the default be Gapped(single_letter_alphabet) instead? 
def PhylipIterator(handle, alphabet = single_letter_alphabet) :
    """Reads a Phylip alignment file returning a SeqRecord object iterator.

    handle   - input handle; only its readline() method is used.
    alphabet - alphabet passed to every Seq object that is created.

    This is a generator function: nothing is read until iteration starts,
    and any ValueError is raised during iteration rather than at call time.

    Record identifiers are limited to at most 10 characters.

    It only copes with interlaced phylip files!  Sequential files won't work
    where the sequences are split over multiple lines.

    Raises ValueError if the first line is not two integers, if the file
    ends part way through a block, or if a sequence does not have the
    length declared on the first line.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
    """
    line = handle.readline()
    if not line: return

    #str.split() with no argument never returns empty strings, so no
    #extra filtering of the fields is required.
    parts = line.split()
    if len(parts) != 2 :
        raise ValueError("First line should have two integers")
    try :
        number_of_seqs = int(parts[0])
        length_of_seqs = int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    #With no sequences declared there is nothing to yield.  Returning here
    #also stops the block loop below from spinning forever on trailing
    #lines (it would never consume a line when there are no sequences).
    if number_of_seqs < 1 : return

    ids = []
    seqs = []

    #Expects STRICT truncation/padding to 10 characters
    #Does not require any white space between name and seq.
    for i in range(0,number_of_seqs) :
        line = handle.readline()
        if not line :
            #Report a truncated first block explicitly rather than
            #storing empty identifiers and sequences.
            raise ValueError("End of file mid-block")
        line = line.rstrip()
        ids.append(line[:10].strip()) #first ten characters
        seqs.append([line[10:].strip().replace(" ","")])

    line = ""
    while True :
        #Skip any blank lines between blocks...
        while "" == line.strip() :
            line = handle.readline()
            if not line : break #end of file
        if not line : break
        #Continuation block: one line per sequence, in the same order.
        for i in range(0,number_of_seqs) :
            seqs[i].append(line.strip().replace(" ",""))
            line = handle.readline()
            if (not line) and i+1 < number_of_seqs :
                raise ValueError("End of file mid-block")
        if not line : break #end of file

    for i in range(0,number_of_seqs) :
        seq = "".join(seqs[i])
        if len(seq) != length_of_seqs :
            raise ValueError("Sequence %i length %i, expected length %i" \
                             % (i+1, len(seq), length_of_seqs))
        yield SeqRecord(Seq(seq, alphabet), id=ids[i], name=ids[i], description="")
82
class PhylipWriter(SequenceWriter):
    """Write interlaced Phylip sequence alignments.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles

    All sequences must be the same length."""

    def __init__(self, handle, truncate=10):
        """Creates the writer object

        handle   - output handle; only its write() method is used.
        truncate - width of the name column.  Names are cut down or right
                   padded with spaces to exactly this many characters.

        Use the method write_file() to actually record your sequence records."""
        self.handle = handle
        self.truncate = truncate

    def _phylip_name(self, identifier) :
        """Returns identifier as it will appear in the name column (PRIVATE).

        Quoting the PHYLIP version 3.6 documentation:

        The name should be ten characters in length, filled out to
        the full ten characters by blanks if shorter. Any printable
        ASCII/ISO character is allowed in the name, except for
        parentheses ("(" and ")"), square brackets ("[" and "]"),
        colon (":"), semicolon (";") and comma (","). If you forget
        to extend the names to ten characters in length by blanks,
        the program [i.e. PHYLIP] will get out of synchronization
        with the contents of the data file, and an error message
        will result.

        Note that Tab characters count as only one character in the
        species names. Their inclusion can cause trouble.
        """
        name = identifier.strip()
        #Either remove the banned characters, or map them to something
        #else like an underscore "_" or pipe "|" character...
        for char in "[]()," :
            name = name.replace(char,"")
        for char in ":;" :
            name = name.replace(char,"|")
        #Now truncate and right pad to expected length.
        return name[:self.truncate].ljust(self.truncate)

    def write_file(self, records) :
        """Use this to write an entire file containing the given records.

        If records is an iterator that does not support len(records) or
        records[index] then it is converted into a list.

        Raises ValueError if there are no records, if the sequences differ
        in length or are empty, or if two records would be written with the
        same name.

        The handle is not closed.  Doing so would prevent this code from
        writing concatenated phylip files which are used in phylogenetic
        bootstrapping.
        """
        #Need length, and multiple passes - and iterator will not do.
        records = list(records)

        if len(records)==0 :
            raise ValueError("Must have at least one sequence")
        length_of_sequences = len(records[0].seq)
        for record in records :
            if length_of_sequences != len(record.seq) :
                raise ValueError("Sequences must all be the same length")
        if length_of_sequences <= 0 :
            raise ValueError("Non-empty sequences are required")

        #Check uniqueness on the names exactly as they will be written
        #(stripped, banned characters handled, truncated and padded).
        #Checking the raw truncated identifiers can both miss clashes that
        #only appear after clean up and reject names that end up distinct.
        names = []
        seen = {}
        for record in records :
            name = self._phylip_name(record.id)
            if name in seen :
                raise ValueError("Repeated identifier, possibly due to truncation")
            seen[name] = True
            names.append(name)

        #Convert each sequence to a string once, not once per chunk.
        sequences = [record.seq.tostring() for record in records]

        handle = self.handle
        indent = " "*self.truncate

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite.  The nature of the expected white space is not
        # defined, simply "These are in free format, separated by blanks"
        handle.write(" %i %s\n" % (len(records), length_of_sequences))
        block=0
        while True :
            for name, sequence in zip(names, sequences) :
                if block==0 :
                    #Write name (truncated/padded to the name column width)
                    handle.write(name)
                else :
                    #Write an indent as wide as the name column
                    handle.write(indent)
                #Write five chunks of ten letters per line...
                for chunk in range(0,5) :
                    i = block*50 + chunk*10
                    #TODO - Force any gaps to be '-' character?  Look at the alphabet...
                    #TODO - How to cope with '?' or '.' in the sequence?
                    handle.write(" %s" % sequence[i:i+10])
                    #Stop once the whole sequence is out.  Using >= avoids
                    #a trailing empty chunk when the length is an exact
                    #multiple of ten.
                    if i+10 >= length_of_sequences : break
                handle.write("\n")
            block=block+1
            #Using >= avoids a final whitespace-only block when the length
            #is an exact multiple of fifty.
            if block*50 >= length_of_sequences : break
            handle.write("\n")
#NOTE - PhylipWriter.write_file() does not close the handle.  Doing so
#would prevent this code from writing concatenated phylip files which are
#used in phylogenetic bootstrapping.

if __name__=="__main__" :
    #Self test for PhylipIterator, run when this file is executed directly.
    #The parser reads the name from the first ten columns of each line in
    #the first block, so names in the examples are padded to ten characters.
    print("Testing")

    phylip_text="""     8    286
V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG

           MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
           ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
           TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
           TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
           LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
           TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
           MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
           TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS

           LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
           LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
           LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
           LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
           LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
           LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
           LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
           LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE

           DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
           DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
           DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
           DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
           DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
           DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
           DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
           SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA

           FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
           YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
           TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
           TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
           ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
           VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
           AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
           AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE

           LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
           LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
           VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
           LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
           LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
           LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
           LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
           LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
"""

    try :
        from cStringIO import StringIO
    except ImportError :
        #cStringIO is not available everywhere; io.StringIO is equivalent
        #for this self test.
        from io import StringIO

    handle = StringIO(phylip_text)
    count=0
    for record in PhylipIterator(handle) :
        count=count+1
        print(record.id)
        #print record.seq.tostring()
    assert count == 8

    #Ungapped sequence of the last record (HISJ_E_COL)
    expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
    fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
    aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
    dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
    alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper()
    assert record.seq.tostring().replace("-","") == expected

    #From here:
    #http://atgc.lirmm.fr/phyml/usersguide.html
    phylip_text2="""5 60
Tax1        CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
Tax2        CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
Tax3        CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
Tax4        TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
Tax5        CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG

            GAAATGGTCAATATTACAAGGT
            GAAATGGTCAACATTAAAAGAT
            GAAATCGTCAATATTAAAAGGT
            GAAATGGTCAATCTTAAAAGGT
            GAAATGGTCAATATTAAAAGGT"""

    phylip_text3="""5 60
Tax1        CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
Tax2        CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
Tax3        CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
Tax4        TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
Tax5        CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""

    handle = StringIO(phylip_text2)
    list2 = list(PhylipIterator(handle))
    handle.close()
    assert len(list2)==5

    handle = StringIO(phylip_text3)
    list3 = list(PhylipIterator(handle))
    handle.close()
    assert len(list3)==5

    #The interlaced and single-block versions must parse identically.
    #(These comparisons need the assert, otherwise nothing is checked.)
    for i in range(0,5) :
        assert list2[i].id == list3[i].id
        assert list2[i].seq.tostring() == list3[i].seq.tostring()

    #From here:
    #http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    #Note the lack of any white space between names 2 and 3 and their seqs.
    phylip_text4="""  5    42
Turkey    AAGCTNGGGC ATTTCAGGGT
Salmo gairAAGCCTTGGC AGTGCAGGGT
H. SapiensACCGGTTGGC CGTTCAGGGT
Chimp     AAACCCTTGC CGTTACGCTT
Gorilla   AAACCCTTGC CGGTACGCTT

          GAGCCCGGGC AATACAGGGT AT
          GAGCCGTGGC CGGGCACGGT AT
          ACAGGTTGGC CGTTCAGGGT AA
          AAACCGAGGC CGGGACACTC AT
          AAACCATTGC CGGTACGCTT AA"""

    #From here:
    #http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    phylip_text5="""  5    42
Turkey    AAGCTNGGGC ATTTCAGGGT
          GAGCCCGGGC AATACAGGGT AT
Salmo gairAAGCCTTGGC AGTGCAGGGT
          GAGCCGTGGC CGGGCACGGT AT
H. SapiensACCGGTTGGC CGTTCAGGGT
          ACAGGTTGGC CGTTCAGGGT AA
Chimp     AAACCCTTGC CGTTACGCTT
          AAACCGAGGC CGGGACACTC AT
Gorilla   AAACCCTTGC CGGTACGCTT
          AAACCATTGC CGGTACGCTT AA"""

    phylip_text5a="""  5    42
Turkey    AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
Chimp     AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
Gorilla   AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""

    handle = StringIO(phylip_text4)
    list4 = list(PhylipIterator(handle))
    handle.close()
    assert len(list4)==5

    #Sequential (non-interlaced) multi-line input is not supported and
    #is expected to be rejected with a ValueError.
    handle = StringIO(phylip_text5)
    try :
        list5 = list(PhylipIterator(handle))
        assert len(list5)==5
        print("That should have failed...")
    except ValueError :
        print("Evil multiline non-interlaced example failed as expected")
    handle.close()

    handle = StringIO(phylip_text5a)
    list5 = list(PhylipIterator(handle))
    handle.close()
    assert len(list5)==5

    for i in range(0,5) :
        assert list4[i].id == list5[i].id
        assert list4[i].seq.tostring() == list5[i].seq.tostring()

    #Disabled manual check of the writer (writes to /tmp), kept for
    #reference:
    #
    #handle = StringIO(phylip_text)
    #out_handle=open("/tmp/test.phy","w")
    #writer = PhylipWriter(out_handle)
    #writer.write_file(PhylipIterator(handle))
    #out_handle.close()
    #
    #print "---------------------"
    #
    #print open("/tmp/test.phy").read()