1
2
3
4
5
6
7 """Bio.SeqIO support for the "phylip" (PHYLIP) file format.
8
9 You were expected to use this module via the Bio.SeqIO functions.
10 This module has now been replaced by Bio.AlignIO.PhylipIO, and is
11 deprecated."""
12
13 import warnings
# Emitted once, when this module is first imported, to steer callers to
# Bio.AlignIO.  The message names the 'phylip' format handled by this module.
warnings.warn("Bio.SeqIO.PhylipIO is deprecated. You can continue to read"
              " and write 'phylip' files with Bio.SeqIO, but this is now"
              " handled via Bio.AlignIO internally.",
              DeprecationWarning)
18
19 from Bio.Alphabet import single_letter_alphabet
20 from Bio.Seq import Seq
21 from Bio.SeqRecord import SeqRecord
22 from Interfaces import SequenceWriter
23 from sets import Set
24
25
26
def PhylipIterator(handle, alphabet=single_letter_alphabet):
    """Reads a Phylip alignment file returning a SeqRecord object iterator.

    Arguments:
     - handle   - input handle; only its readline() method is used.
     - alphabet - alphabet given to every Seq object that is created.

    Record identifiers are limited to at most 10 characters (the first ten
    columns of each line in the first block).

    It only copes with interlaced phylip files!  Sequential files won't work
    where the sequences are split over multiple lines.

    This is a generator, so problems with the file are reported as a
    ValueError when the iterator is consumed: a malformed header line, a
    file that ends part way through a block, or a sequence whose length
    differs from the length declared in the header.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
    """
    line = handle.readline()
    if not line:
        # Completely empty input: no records.
        return

    # Header line: number of sequences, then the alignment length.
    parts = line.split()
    if len(parts) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(parts[0])
        length_of_seqs = int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    if number_of_seqs < 1:
        # Nothing to yield.  Returning here also stops the block loop below
        # from spinning forever on trailing data it would never consume.
        return

    ids = []
    seqs = []

    # First block: each line is a 10 column name followed by sequence data.
    for i in range(number_of_seqs):
        line = handle.readline()
        if not line:
            raise ValueError("End of file mid-block")
        line = line.rstrip()
        ids.append(line[:10].strip())
        seqs.append([line[10:].strip().replace(" ", "")])

    # Any further blocks hold sequence data only, in the same record order.
    line = ""
    while True:
        # Skip the blank line(s) separating blocks.
        while line.strip() == "":
            line = handle.readline()
            if not line:
                break
        if not line:
            break  # end of file

        # 'line' is now the first line of a continuation block.
        for i in range(number_of_seqs):
            seqs[i].append(line.strip().replace(" ", ""))
            line = handle.readline()
            if not line and i + 1 < number_of_seqs:
                raise ValueError("End of file mid-block")
        if not line:
            break  # end of file

    for index, (name, chunks) in enumerate(zip(ids, seqs)):
        seq = "".join(chunks)
        if len(seq) != length_of_seqs:
            raise ValueError("Sequence %i length %i, expected length %i"
                             % (index + 1, len(seq), length_of_seqs))
        yield SeqRecord(Seq(seq, alphabet), id=name, name=name, description="")
82
class PhylipWriter(SequenceWriter):
    """Write interlaced Phylip sequence alignments.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles

    All sequences must be the same length.
    """

    def __init__(self, handle, truncate=10):
        """Creates the writer object.

        Arguments:
         - handle   - output handle; only its write() method is used.
         - truncate - width of the name column.  Names are cut and padded
                      to exactly this many characters.

        Use the method write_file() to actually record your sequence records.
        """
        self.handle = handle
        self.truncate = truncate

    def _clean_name(self, identifier):
        """Return the identifier as it will be written (not yet padded).

        Quoting the PHYLIP version 3.6 documentation:

        The name should be ten characters in length, filled out to
        the full ten characters by blanks if shorter. Any printable
        ASCII/ISO character is allowed in the name, except for
        parentheses ("(" and ")"), square brackets ("[" and "]"),
        colon (":"), semicolon (";") and comma (","). If you forget
        to extend the names to ten characters in length by blanks,
        the program [i.e. PHYLIP] will get out of synchronization
        with the contents of the data file, and an error message
        will result.

        Note that Tab characters count as only one character in the
        species names. Their inclusion can cause trouble.
        """
        name = identifier.strip()
        # Brackets and commas are dropped; colons/semicolons become "|".
        for char in "[]().,".replace(".", ""):
            name = name.replace(char, "")
        for char in ":;":
            name = name.replace(char, "|")
        return name[:self.truncate]

    def write_file(self, records):
        """Use this to write an entire file containing the given records.

        If records is an iterator that does not support len(records) or
        records[index] then it is converted into a list.

        Each record must provide an 'id' string and a 'seq' supporting
        len() and str().

        Raises ValueError if there are no records, if the sequences are
        empty or differ in length, or if two records would be written with
        the same name (after stripping, removing the characters PHYLIP
        forbids, and truncation).
        """
        records = list(records)

        if not records:
            raise ValueError("Must have at least one sequence")
        length_of_sequences = len(records[0].seq)
        for record in records:
            if length_of_sequences != len(record.seq):
                raise ValueError("Sequences must all be the same length")
        if length_of_sequences <= 0:
            raise ValueError("Non-empty sequences are required")

        # Check uniqueness on the names as they will appear in the file,
        # since cleaning and truncation can make distinct ids collide.
        names = [self._clean_name(record.id) for record in records]
        if len(records) > len(set(names)):
            raise ValueError("Repeated identifier, possibly due to truncation")

        # Convert each sequence to a plain string once, not once per chunk.
        sequences = [str(record.seq) for record in records]

        handle = self.handle
        width = self.truncate

        handle.write(" %i %s\n" % (len(records), length_of_sequences))

        # Interlaced layout: blocks of 50 columns, five chunks of ten letters
        # per line, with a blank line between blocks.  Iterating over the
        # real block starts means no empty trailing block or chunk is written
        # when the length is an exact multiple of 50 or 10.
        for start in range(0, length_of_sequences, 50):
            if start:
                handle.write("\n")
            stop = min(start + 50, length_of_sequences)
            for name, sequence in zip(names, sequences):
                if start == 0:
                    # First block carries the name, padded to the full width.
                    line = name.ljust(width)
                else:
                    # Later blocks are indented by the width of the name.
                    line = " " * width
                for i in range(start, stop, 10):
                    line += " %s" % sequence[i:i + 10]
                handle.write(line + "\n")
171
172
173
174
175
if __name__ == "__main__":
    # Minimal self-test of PhylipIterator, run when the module is executed
    # directly.  In every example the names occupy the first ten columns of
    # the first block, which is what the parser expects.
    print("Testing")

    phylip_text = """ 8 286
V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG

MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS

LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE

DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA

FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE

LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
"""

    try:
        from cStringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO

    handle = StringIO(phylip_text)
    count = 0
    for record in PhylipIterator(handle):
        count = count + 1
        print(record.id)
    assert count == 8

    # Ungapped sequence expected for the last record (HISJ_E_COL).
    expected = """mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
alnkafaemradgtyeklak kyfdfdvygg""".replace(" ", "").replace("\n", "").upper()
    assert str(record.seq).replace("-", "") == expected

    # The same alignment written as two blocks (text2) and one block (text3).
    phylip_text2 = """5 60
Tax1      CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
Tax2      CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
Tax3      CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
Tax4      TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
Tax5      CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG

GAAATGGTCAATATTACAAGGT
GAAATGGTCAACATTAAAAGAT
GAAATCGTCAATATTAAAAGGT
GAAATGGTCAATCTTAAAAGGT
GAAATGGTCAATATTAAAAGGT"""

    phylip_text3 = """5 60
Tax1      CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
Tax2      CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
Tax3      CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
Tax4      TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
Tax5      CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""

    handle = StringIO(phylip_text2)
    list2 = list(PhylipIterator(handle))
    handle.close()
    assert len(list2) == 5

    handle = StringIO(phylip_text3)
    list3 = list(PhylipIterator(handle))
    handle.close()
    assert len(list3) == 5

    # Both layouts must give identical records.
    for i in range(0, 5):
        assert list2[i].id == list3[i].id
        assert str(list2[i].seq) == str(list3[i].seq)

    # Interlaced example.
    phylip_text4 = """ 5 42
Turkey    AAGCTNGGGC ATTTCAGGGT
Salmo gairAAGCCTTGGC AGTGCAGGGT
H. SapiensACCGGTTGGC CGTTCAGGGT
Chimp     AAACCCTTGC CGTTACGCTT
Gorilla   AAACCCTTGC CGGTACGCTT

GAGCCCGGGC AATACAGGGT AT
GAGCCGTGGC CGGGCACGGT AT
ACAGGTTGGC CGTTCAGGGT AA
AAACCGAGGC CGGGACACTC AT
AAACCATTGC CGGTACGCTT AA"""

    # Sequential (non-interlaced) example split over several lines; the
    # parser does not support this layout and is expected to reject it.
    phylip_text5 = """ 5 42
Turkey    AAGCTNGGGC ATTTCAGGGT
GAGCCCGGGC AATACAGGGT AT
Salmo gairAAGCCTTGGC AGTGCAGGGT
GAGCCGTGGC CGGGCACGGT AT
H. SapiensACCGGTTGGC CGTTCAGGGT
ACAGGTTGGC CGTTCAGGGT AA
Chimp     AAACCCTTGC CGTTACGCTT
AAACCGAGGC CGGGACACTC AT
Gorilla   AAACCCTTGC CGGTACGCTT
AAACCATTGC CGGTACGCTT AA"""

    # Sequential example with each sequence on a single line; this parses
    # the same way as a one-block interlaced file.
    phylip_text5a = """ 5 42
Turkey    AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
Chimp     AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
Gorilla   AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""

    handle = StringIO(phylip_text4)
    list4 = list(PhylipIterator(handle))
    handle.close()
    assert len(list4) == 5

    handle = StringIO(phylip_text5)
    try:
        list5 = list(PhylipIterator(handle))
        assert len(list5) == 5
        print("That should have failed...")
    except ValueError:
        print("Evil multiline non-interlaced example failed as expected")
    handle.close()

    handle = StringIO(phylip_text5a)
    list5 = list(PhylipIterator(handle))
    handle.close()
    assert len(list5) == 5

    # Both layouts must give identical records.
    for i in range(0, 5):
        assert list4[i].id == list5[i].id
        assert str(list4[i].seq) == str(list5[i].seq)

    # Disabled writer round trip, kept for reference.
    # NOTE(review): phylip_text contains the name "B_subtilis" twice, so the
    # writer's repeated-identifier check would presumably reject it - confirm
    # before re-enabling.
    #
    # handle = StringIO(phylip_text)
    # out_handle=open("/tmp/test.phy","w")
    # writer = PhylipWriter(out_handle)
    # writer.write_file(PhylipIterator(handle))
    # out_handle.close()
    #
    # print "---------------------"
    #
    # print open("/tmp/test.phy").read()
360