1
2
3
4
5 """
6 AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools.
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 Note
12 ====
13 In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003)
14 a dot/period (".") in a sequence is interpreted as meaning the same
15 character as in the first sequence. The PHYLIP 3.6 documentation says:
16
17 "a period was also previously allowed but it is no longer allowed,
18 because it sometimes is used in different senses in other programs"
19
20 At the time of writing, we do nothing special with a dot/period.
21 """
22
23
24 try:
25 set = set
26 except NameError:
27 from sets import Set as set
28
29 from Bio.Alphabet import single_letter_alphabet
30 from Bio.Align.Generic import Alignment
31 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
32
34 """Phylip alignment writer."""
36 """Use this to write (another) single alignment to an open file.
37
38 This code will write interlaced alignments (when the sequences are
39 longer than 50 characters).
40
41 Note that record identifiers are strictly truncated at 10 characters.
42
43 For more information on the file format, please see:
44 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
45 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
46 """
47 truncate=10
48 records = alignment.get_all_seqs()
49 handle = self.handle
50
51 if len(records)==0 :
52 raise ValueError("Must have at least one sequence")
53 length_of_seqs = alignment.get_alignment_length()
54 for record in records :
55 if length_of_seqs != len(record.seq) :
56 raise ValueError("Sequences must all be the same length")
57 if length_of_seqs <= 0 :
58 raise ValueError("Non-empty sequences are required")
59
60 if len(records) > len(set([r.id[:truncate] for r in records])) :
61 raise ValueError("Repeated identifier, possibly due to truncation")
62
63
64
65
66
67
68
69 handle.write(" %i %s\n" % (len(records), length_of_seqs))
70 block=0
71 while True :
72 for record in records :
73 if block==0 :
74
75 """
76 Quoting the PHYLIP version 3.6 documentation:
77
78 The name should be ten characters in length, filled out to
79 the full ten characters by blanks if shorter. Any printable
80 ASCII/ISO character is allowed in the name, except for
81 parentheses ("(" and ")"), square brackets ("[" and "]"),
82 colon (":"), semicolon (";") and comma (","). If you forget
83 to extend the names to ten characters in length by blanks,
84 the program [i.e. PHYLIP] will get out of synchronization
85 with the contents of the data file, and an error message will
86 result.
87
88 Note that Tab characters count as only one character in the
89 species names. Their inclusion can cause trouble.
90 """
91 name = record.id.strip()
92
93
94 for char in "[]()," :
95 name = name.replace(char,"")
96 for char in ":;" :
97 name = name.replace(char,"|")
98
99
100 handle.write(name[:truncate].ljust(truncate))
101 else :
102
103 handle.write(" "*truncate)
104
105 for chunk in range(0,5) :
106 i = block*50 + chunk*10
107 seq_segment = record.seq.tostring()[i:i+10]
108
109
110 handle.write(" %s" % seq_segment)
111 if i+10 > length_of_seqs : break
112 handle.write("\n")
113 block=block+1
114 if block*50 > length_of_seqs : break
115 handle.write("\n")
116
118 """Reads a Phylip alignment file returning an Alignment object iterator.
119
120 Record identifiers are limited to at most 10 characters.
121
122 It only copes with interlaced phylip files! Sequential files won't work
123 where the sequences are split over multiple lines.
124
125 For more information on the file format, please see:
126 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
127 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
128 """
129
131 line = line.strip()
132 parts = filter(None, line.split())
133 if len(parts)!=2 :
134 return False
135 try :
136 number_of_seqs = int(parts[0])
137 length_of_seqs = int(parts[1])
138 return True
139 except ValueError:
140 return False
141
143 handle = self.handle
144
145 try :
146
147
148 line = self._header
149 del self._header
150 except AttributeError :
151 line = handle.readline()
152
153 if not line: return
154 line = line.strip()
155 parts = filter(None, line.split())
156 if len(parts)!=2 :
157 raise ValueError("First line should have two integers")
158 try :
159 number_of_seqs = int(parts[0])
160 length_of_seqs = int(parts[1])
161 except ValueError:
162 raise ValueError("First line should have two integers")
163
164 assert self._is_header(line)
165
166 if self.records_per_alignment is not None \
167 and self.records_per_alignment != number_of_seqs :
168 raise ValueError("Found %i records in this alignment, told to expect %i" \
169 % (number_of_seqs, self.records_per_alignment))
170
171 ids = []
172 seqs = []
173
174
175
176 for i in range(0,number_of_seqs) :
177 line = handle.readline().rstrip()
178 ids.append(line[:10].strip())
179 seqs.append([line[10:].strip().replace(" ","")])
180
181
182 line=""
183 while True :
184
185 while ""==line.strip():
186 line = handle.readline()
187 if not line : break
188 if not line : break
189
190 if self._is_header(line) :
191
192 self._header = line
193 break
194
195
196 for i in range(0,number_of_seqs) :
197 seqs[i].append(line.strip().replace(" ",""))
198 line = handle.readline()
199 if (not line) and i+1 < number_of_seqs :
200 raise ValueError("End of file mid-block")
201 if not line : break
202
203 alignment = Alignment(self.alphabet)
204 for i in range(0,number_of_seqs) :
205 seq = "".join(seqs[i])
206 if len(seq)!=length_of_seqs :
207 raise ValueError("Sequence %i length %i, expected length %i" \
208 % (i+1, len(seq), length_of_seqs))
209 alignment.add_sequence(ids[i], seq)
210
211 record = alignment.get_all_seqs()[-1]
212 assert ids[i] == record.id or ids[i] == record.description
213 record.id = ids[i]
214 record.name = ids[i]
215 record.description = ids[i]
216 return alignment
217
218 if __name__=="__main__" :
219 print "Running short mini-test"
220
221 phylip_text=""" 8 286
222 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
223 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
224 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
225 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
226 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
227 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
228 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
229 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG
230
231 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
232 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
233 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
234 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
235 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
236 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
237 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
238 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS
239
240 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
241 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
242 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
243 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
244 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
245 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
246 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
247 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE
248
249 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
250 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
251 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
252 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
253 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
254 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
255 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
256 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA
257
258 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
259 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
260 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
261 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
262 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
263 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
264 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
265 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE
266
267 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
268 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
269 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
270 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
271 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
272 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
273 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
274 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
275 """
276
277 from cStringIO import StringIO
278 handle = StringIO(phylip_text)
279 count=0
280 for alignment in PhylipIterator(handle) :
281 for record in alignment.get_all_seqs() :
282 count=count+1
283 print record.id
284
285 assert count == 8
286
287 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
288 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
289 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
290 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
291 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper()
292 assert record.seq.tostring().replace("-","") == expected
293
294
295
296 phylip_text2="""5 60
297 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
298 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
299 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
300 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
301 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG
302
303 GAAATGGTCAATATTACAAGGT
304 GAAATGGTCAACATTAAAAGAT
305 GAAATCGTCAATATTAAAAGGT
306 GAAATGGTCAATCTTAAAAGGT
307 GAAATGGTCAATATTAAAAGGT"""
308
309 phylip_text3="""5 60
310 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
311 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
312 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
313 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
314 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""
315
316 handle = StringIO(phylip_text2)
317 list2 = list(PhylipIterator(handle))
318 handle.close()
319 assert len(list2)==1
320 assert len(list2[0].get_all_seqs())==5
321
322 handle = StringIO(phylip_text3)
323 list3 = list(PhylipIterator(handle))
324 handle.close()
325 assert len(list3)==1
326 assert len(list3[0].get_all_seqs())==5
327
328 for i in range(0,5) :
329 list2[0].get_all_seqs()[i].id == list3[0].get_all_seqs()[i].id
330 list2[0].get_all_seqs()[i].seq.tostring() == list3[0].get_all_seqs()[i].seq.tostring()
331
332
333
334
335 phylip_text4=""" 5 42
336 Turkey AAGCTNGGGC ATTTCAGGGT
337 Salmo gairAAGCCTTGGC AGTGCAGGGT
338 H. SapiensACCGGTTGGC CGTTCAGGGT
339 Chimp AAACCCTTGC CGTTACGCTT
340 Gorilla AAACCCTTGC CGGTACGCTT
341
342 GAGCCCGGGC AATACAGGGT AT
343 GAGCCGTGGC CGGGCACGGT AT
344 ACAGGTTGGC CGTTCAGGGT AA
345 AAACCGAGGC CGGGACACTC AT
346 AAACCATTGC CGGTACGCTT AA"""
347
348
349
350 phylip_text5=""" 5 42
351 Turkey AAGCTNGGGC ATTTCAGGGT
352 GAGCCCGGGC AATACAGGGT AT
353 Salmo gairAAGCCTTGGC AGTGCAGGGT
354 GAGCCGTGGC CGGGCACGGT AT
355 H. SapiensACCGGTTGGC CGTTCAGGGT
356 ACAGGTTGGC CGTTCAGGGT AA
357 Chimp AAACCCTTGC CGTTACGCTT
358 AAACCGAGGC CGGGACACTC AT
359 Gorilla AAACCCTTGC CGGTACGCTT
360 AAACCATTGC CGGTACGCTT AA"""
361
362 phylip_text5a=""" 5 42
363 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
364 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
365 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
366 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
367 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""
368
369 handle = StringIO(phylip_text4)
370 list4 = list(PhylipIterator(handle))
371 handle.close()
372 assert len(list4)==1
373 assert len(list4[0].get_all_seqs())==5
374
375 handle = StringIO(phylip_text5)
376 try :
377 list5 = list(PhylipIterator(handle))
378 assert len(list5)==1
379 assert len(list5[0].get_all_seqs())==5
380 print "That should have failed..."
381 except ValueError :
382 print "Evil multiline non-interlaced example failed as expected"
383 handle.close()
384
385 handle = StringIO(phylip_text5a)
386 list5 = list(PhylipIterator(handle))
387 handle.close()
388 assert len(list5)==1
389 assert len(list4[0].get_all_seqs())==5
390
391 print "Concatenation"
392 handle = StringIO(phylip_text4 + "\n" + phylip_text4)
393 assert len(list(PhylipIterator(handle))) == 2
394
395 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text)
396 assert len(list(PhylipIterator(handle))) == 3
397
398 print "OK"
399
400 print "Checking write/read"
401 handle = StringIO()
402 PhylipWriter(handle).write_file(list5)
403 handle.seek(0)
404 list6 = list(PhylipIterator(handle))
405 assert len(list5) == len(list6)
406 for a1,a2 in zip(list5, list6) :
407 assert len(a1.get_all_seqs()) == len(a2.get_all_seqs())
408 for r1, r2 in zip(a1.get_all_seqs(), a2.get_all_seqs()) :
409 assert r1.id == r2.id
410 assert r1.seq.tostring() == r2.seq.tostring()
411 print "Done"
412