1
2
3
4
5 """
6 AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools.
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 Note
12 ====
13 In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003)
14 a dot/period (".") in a sequence is interpreted as meaning the same
15 character as in the first sequence. The PHYLIP 3.6 documentation says:
16
17 "a period was also previously allowed but it is no longer allowed,
18 because it sometimes is used in different senses in other programs"
19
20 At the time of writing, we do nothing special with a dot/period.
21 """
22
23 from Bio.Alphabet import single_letter_alphabet
24 from Bio.Align.Generic import Alignment
25 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
26
28 """Phylip alignment writer."""
30 """Use this to write (another) single alignment to an open file.
31
32 This code will write interlaced alignments (when the sequences are
33 longer than 50 characters).
34
35 Note that record identifiers are strictly truncated at 10 characters.
36
37 For more information on the file format, please see:
38 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
39 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
40 """
41 truncate=10
42 records = alignment.get_all_seqs()
43 handle = self.handle
44
45 if len(records)==0 :
46 raise ValueError("Must have at least one sequence")
47 length_of_seqs = alignment.get_alignment_length()
48 for record in records :
49 if length_of_seqs != len(record.seq) :
50 raise ValueError("Sequences must all be the same length")
51 if length_of_seqs <= 0 :
52 raise ValueError("Non-empty sequences are required")
53
54 if len(records) > len(set([r.id[:truncate] for r in records])) :
55 raise ValueError("Repeated identifier, possibly due to truncation")
56
57
58
59
60
61
62
63 handle.write(" %i %s\n" % (len(records), length_of_seqs))
64 block=0
65 while True :
66 for record in records :
67 if block==0 :
68
69 """
70 Quoting the PHYLIP version 3.6 documentation:
71
72 The name should be ten characters in length, filled out to
73 the full ten characters by blanks if shorter. Any printable
74 ASCII/ISO character is allowed in the name, except for
75 parentheses ("(" and ")"), square brackets ("[" and "]"),
76 colon (":"), semicolon (";") and comma (","). If you forget
77 to extend the names to ten characters in length by blanks,
78 the program [i.e. PHYLIP] will get out of synchronization
79 with the contents of the data file, and an error message will
80 result.
81
82 Note that Tab characters count as only one character in the
83 species names. Their inclusion can cause trouble.
84 """
85 name = record.id.strip()
86
87
88 for char in "[]()," :
89 name = name.replace(char,"")
90 for char in ":;" :
91 name = name.replace(char,"|")
92
93
94 handle.write(name[:truncate].ljust(truncate))
95 else :
96
97 handle.write(" "*truncate)
98
99 for chunk in range(0,5) :
100 i = block*50 + chunk*10
101 seq_segment = record.seq.tostring()[i:i+10]
102
103
104 handle.write(" %s" % seq_segment)
105 if i+10 > length_of_seqs : break
106 handle.write("\n")
107 block=block+1
108 if block*50 > length_of_seqs : break
109 handle.write("\n")
110
112 """Reads a Phylip alignment file returning an Alignment object iterator.
113
114 Record identifiers are limited to at most 10 characters.
115
116 It only copes with interlaced phylip files! Sequential files won't work
117 where the sequences are split over multiple lines.
118
119 For more information on the file format, please see:
120 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
121 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
122 """
123
125 line = line.strip()
126 parts = filter(None, line.split())
127 if len(parts)!=2 :
128 return False
129 try :
130 number_of_seqs = int(parts[0])
131 length_of_seqs = int(parts[1])
132 return True
133 except ValueError:
134 return False
135
137 handle = self.handle
138
139 try :
140
141
142 line = self._header
143 del self._header
144 except AttributeError :
145 line = handle.readline()
146
147 if not line: return
148 line = line.strip()
149 parts = filter(None, line.split())
150 if len(parts)!=2 :
151 raise ValueError("First line should have two integers")
152 try :
153 number_of_seqs = int(parts[0])
154 length_of_seqs = int(parts[1])
155 except ValueError:
156 raise ValueError("First line should have two integers")
157
158 assert self._is_header(line)
159
160 if self.records_per_alignment is not None \
161 and self.records_per_alignment != number_of_seqs :
162 raise ValueError("Found %i records in this alignment, told to expect %i" \
163 % (number_of_seqs, self.records_per_alignment))
164
165 ids = []
166 seqs = []
167
168
169
170 for i in range(0,number_of_seqs) :
171 line = handle.readline().rstrip()
172 ids.append(line[:10].strip())
173 seqs.append([line[10:].strip().replace(" ","")])
174
175
176 line=""
177 while True :
178
179 while ""==line.strip():
180 line = handle.readline()
181 if not line : break
182 if not line : break
183
184 if self._is_header(line) :
185
186 self._header = line
187 break
188
189
190 for i in range(0,number_of_seqs) :
191 seqs[i].append(line.strip().replace(" ",""))
192 line = handle.readline()
193 if (not line) and i+1 < number_of_seqs :
194 raise ValueError("End of file mid-block")
195 if not line : break
196
197 alignment = Alignment(self.alphabet)
198 for i in range(0,number_of_seqs) :
199 seq = "".join(seqs[i])
200 if len(seq)!=length_of_seqs :
201 raise ValueError("Sequence %i length %i, expected length %i" \
202 % (i+1, len(seq), length_of_seqs))
203 alignment.add_sequence(ids[i], seq)
204
205 record = alignment.get_all_seqs()[-1]
206 assert ids[i] == record.id or ids[i] == record.description
207 record.id = ids[i]
208 record.name = ids[i]
209 record.description = ids[i]
210 return alignment
211
212 if __name__=="__main__" :
213 print "Running short mini-test"
214
215 phylip_text=""" 8 286
216 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
217 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
218 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
219 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
220 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
221 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
222 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
223 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG
224
225 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
226 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
227 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
228 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
229 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
230 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
231 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
232 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS
233
234 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
235 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
236 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
237 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
238 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
239 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
240 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
241 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE
242
243 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
244 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
245 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
246 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
247 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
248 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
249 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
250 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA
251
252 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
253 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
254 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
255 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
256 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
257 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
258 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
259 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE
260
261 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
262 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
263 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
264 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
265 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
266 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
267 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
268 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
269 """
270
271 from cStringIO import StringIO
272 handle = StringIO(phylip_text)
273 count=0
274 for alignment in PhylipIterator(handle) :
275 for record in alignment.get_all_seqs() :
276 count=count+1
277 print record.id
278
279 assert count == 8
280
281 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
282 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
283 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
284 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
285 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper()
286 assert record.seq.tostring().replace("-","") == expected
287
288
289
290 phylip_text2="""5 60
291 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
292 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
293 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
294 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
295 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG
296
297 GAAATGGTCAATATTACAAGGT
298 GAAATGGTCAACATTAAAAGAT
299 GAAATCGTCAATATTAAAAGGT
300 GAAATGGTCAATCTTAAAAGGT
301 GAAATGGTCAATATTAAAAGGT"""
302
303 phylip_text3="""5 60
304 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
305 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
306 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
307 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
308 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""
309
310 handle = StringIO(phylip_text2)
311 list2 = list(PhylipIterator(handle))
312 handle.close()
313 assert len(list2)==1
314 assert len(list2[0].get_all_seqs())==5
315
316 handle = StringIO(phylip_text3)
317 list3 = list(PhylipIterator(handle))
318 handle.close()
319 assert len(list3)==1
320 assert len(list3[0].get_all_seqs())==5
321
322 for i in range(0,5) :
323 list2[0].get_all_seqs()[i].id == list3[0].get_all_seqs()[i].id
324 list2[0].get_all_seqs()[i].seq.tostring() == list3[0].get_all_seqs()[i].seq.tostring()
325
326
327
328
329 phylip_text4=""" 5 42
330 Turkey AAGCTNGGGC ATTTCAGGGT
331 Salmo gairAAGCCTTGGC AGTGCAGGGT
332 H. SapiensACCGGTTGGC CGTTCAGGGT
333 Chimp AAACCCTTGC CGTTACGCTT
334 Gorilla AAACCCTTGC CGGTACGCTT
335
336 GAGCCCGGGC AATACAGGGT AT
337 GAGCCGTGGC CGGGCACGGT AT
338 ACAGGTTGGC CGTTCAGGGT AA
339 AAACCGAGGC CGGGACACTC AT
340 AAACCATTGC CGGTACGCTT AA"""
341
342
343
344 phylip_text5=""" 5 42
345 Turkey AAGCTNGGGC ATTTCAGGGT
346 GAGCCCGGGC AATACAGGGT AT
347 Salmo gairAAGCCTTGGC AGTGCAGGGT
348 GAGCCGTGGC CGGGCACGGT AT
349 H. SapiensACCGGTTGGC CGTTCAGGGT
350 ACAGGTTGGC CGTTCAGGGT AA
351 Chimp AAACCCTTGC CGTTACGCTT
352 AAACCGAGGC CGGGACACTC AT
353 Gorilla AAACCCTTGC CGGTACGCTT
354 AAACCATTGC CGGTACGCTT AA"""
355
356 phylip_text5a=""" 5 42
357 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
358 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
359 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
360 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
361 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""
362
363 handle = StringIO(phylip_text4)
364 list4 = list(PhylipIterator(handle))
365 handle.close()
366 assert len(list4)==1
367 assert len(list4[0].get_all_seqs())==5
368
369 handle = StringIO(phylip_text5)
370 try :
371 list5 = list(PhylipIterator(handle))
372 assert len(list5)==1
373 assert len(list5[0].get_all_seqs())==5
374 print "That should have failed..."
375 except ValueError :
376 print "Evil multiline non-interlaced example failed as expected"
377 handle.close()
378
379 handle = StringIO(phylip_text5a)
380 list5 = list(PhylipIterator(handle))
381 handle.close()
382 assert len(list5)==1
383 assert len(list4[0].get_all_seqs())==5
384
385 print "Concatenation"
386 handle = StringIO(phylip_text4 + "\n" + phylip_text4)
387 assert len(list(PhylipIterator(handle))) == 2
388
389 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text)
390 assert len(list(PhylipIterator(handle))) == 3
391
392 print "OK"
393
394 print "Checking write/read"
395 handle = StringIO()
396 PhylipWriter(handle).write_file(list5)
397 handle.seek(0)
398 list6 = list(PhylipIterator(handle))
399 assert len(list5) == len(list6)
400 for a1,a2 in zip(list5, list6) :
401 assert len(a1.get_all_seqs()) == len(a2.get_all_seqs())
402 for r1, r2 in zip(a1.get_all_seqs(), a2.get_all_seqs()) :
403 assert r1.id == r2.id
404 assert r1.seq.tostring() == r2.seq.tostring()
405 print "Done"
406