1
2
3
4
5
6 """Multiple sequence alignment input/output as Alignment objects.
7
8 The Bio.AlignIO interface is deliberately very similar to Bio.SeqIO, and in
9 fact the two are connected internally. Both modules use the same set of file
10 format names (lower case strings). From the user's perspective, you can read
11 in a PHYLIP file containing one or more alignments using Bio.AlignIO, or you
12 can read in the sequences within these alignmenta using Bio.SeqIO.
13
14 Bio.AlignIO is also documented at U{http://biopython.org/wiki/AlignIO} and by
15 a whole chapter in our tutorial:
16 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
17 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
18
19 Input
20 =====
21 For the typical special case when your file or handle contains one and only
22 one alignment, use the function Bio.AlignIO.read(). This takes an input file
23 handle, format string and optional number of sequences per alignment. It will
24 return a single Alignment object (or raise an exception if there isn't just
25 one alignment):
26
27 >>> from Bio import AlignIO
28 >>> handle = open("Phylip/interlaced.phy", "rU")
29 >>> align = AlignIO.read(handle, "phylip")
30 >>> handle.close()
31 >>> print align
32 SingleLetterAlphabet() alignment with 3 rows and 384 columns
33 -----MKVILLFVLAVFTVFVSS---------------RGIPPE...I-- CYS1_DICDI
34 MAHARVLLLALAVLATAAVAVASSSSFADSNPIRPVTDRAASTL...VAA ALEU_HORVU
35 ------MWATLPLLCAGAWLLGV--------PVCGAAELSVNSL...PLV CATH_HUMAN
36
37 For the general case, when the handle could contain any number of alignments,
38 use the function Bio.AlignIO.parse(...) which takes the same arguments, but
39 returns an iterator giving Alignment objects (typically used in a for loop).
40 If you want random access to the alignments by number, turn this into a list:
41
42 >>> from Bio import AlignIO
43 >>> handle = open("Emboss/needle.txt", "rU")
44 >>> alignments = list(AlignIO.parse(handle, "emboss"))
45 >>> print alignments[2]
46 SingleLetterAlphabet() alignment with 2 rows and 120 columns
47 -KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKER...--- ref_rec
48 LHIVVVDDDPGTCVYIESVFAELGHTCKSFVRPEAAEEYILTHP...HKE gi|94967506|receiver
49
50 Most alignment file formats can be concatenated so as to hold as many
51 different multiple sequence alignments as possible. One common example
52 is the output of the tool seqboot in the PHLYIP suite. Sometimes there
53 can be a file header and footer, as seen in the EMBOSS alignment output.
54
55 Output
56 ======
57 Use the function Bio.AlignIO.write(...), which takes a complete set of
58 Alignment objects (either as a list, or an iterator), an output file handle
59 and of course the file format::
60
61 from Bio import AlignIO
62 alignments = ...
63 handle = open("example.faa", "w")
64 alignment = SeqIO.write(alignments, handle, "fasta")
65 handle.close()
66
67 In general, you are expected to call this function once (with all your
68 alignments) and then close the file handle. However, for file formats
69 like PHYLIP where multiple alignments are stored sequentially (with no file
70 header and footer), then multiple calls to the write function should work as
71 expected.
72
73 File Formats
74 ============
75 When specifying the file format, use lowercase strings. The same format
76 names are also used in Bio.SeqIO and include the following:
77
78 - clustal - Ouput from Clustal W or X, see also the module Bio.Clustalw
79 which can be used to run the command line tool from Biopython.
80 - emboss - EMBOSS tools' "pairs" and "simple" alignment formats.
81 - fasta - The generic sequence file format where each record starts with
82 an identifer line starting with a ">" character, followed by
83 lines of sequence.
84 - fasta-m10 - For the pairswise alignments output by Bill Pearson's FASTA
85 tools when used with the -m 10 command line option for machine
86 readable output.
87 - ig - The IntelliGenetics file format, apparently the same as the
88 MASE alignment format.
89 - nexus - Output from NEXUS, see also the module Bio.Nexus which can also
90 read any phylogenetic trees in these files.
91 - phylip - Used by the PHLIP tools.
92 - stockholm - A richly annotated alignment file format used by PFAM.
93
94 Note that while Bio.AlignIO can read all the above file formats, it cannot
95 write to all of them.
96
97 You can also use any file format supported by Bio.SeqIO, such as "fasta" or
98 "ig" (which are listed above), PROVIDED the sequences in your file are all the
99 same length.
100 """
101 __docformat__ = "epytext en"
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117 import os
118
119 from StringIO import StringIO
120 from Bio.Seq import Seq
121 from Bio.SeqRecord import SeqRecord
122 from Bio.Align.Generic import Alignment
123 from Bio.Alphabet import Alphabet, AlphabetEncoder, _get_base_alphabet
124
125 import StockholmIO
126 import ClustalIO
127 import NexusIO
128 import PhylipIO
129 import EmbossIO
130 import FastaIO
131
132
133
134
135 _FormatToIterator ={
136 "clustal" : ClustalIO.ClustalIterator,
137 "emboss" : EmbossIO.EmbossIterator,
138 "fasta-m10" : FastaIO.FastaM10Iterator,
139 "nexus" : NexusIO.NexusIterator,
140 "phylip" : PhylipIO.PhylipIterator,
141 "stockholm" : StockholmIO.StockholmIterator,
142 }
143
144 _FormatToWriter ={
145
146 "nexus" : NexusIO.NexusWriter,
147 "phylip" : PhylipIO.PhylipWriter,
148 "stockholm" : StockholmIO.StockholmWriter,
149 "clustal" : ClustalIO.ClustalWriter,
150 }
151
152 -def write(alignments, handle, format) :
153 """Write complete set of alignments to a file.
154
155 Arguments:
156 - sequences - A list (or iterator) of Alignment objects
157 - handle - File handle object to write to
158 - format - lower case string describing the file format to write.
159
160 You should close the handle after calling this function.
161
162 Returns the number of alignments written (as an integer).
163 """
164 from Bio import SeqIO
165
166
167 if isinstance(handle, basestring) :
168 raise TypeError("Need a file handle, not a string (i.e. not a filename)")
169 if not isinstance(format, basestring) :
170 raise TypeError("Need a string for the file format (lower case)")
171 if not format :
172 raise ValueError("Format required (lower case string)")
173 if format != format.lower() :
174 raise ValueError("Format string '%s' should be lower case" % format)
175 if isinstance(alignments, Alignment) :
176 raise TypeError("Need an Alignment list/iterator, not just a single Alignment")
177
178
179 if format in _FormatToIterator :
180 writer_class = _FormatToWriter[format]
181 count = writer_class(handle).write_file(alignments)
182 elif format in SeqIO._FormatToWriter :
183
184
185 count = 0
186 for alignment in alignments :
187 if not isinstance(alignment, Alignment) :
188 raise TypeError("Expect a list or iterator of Alignment objects.")
189 SeqIO.write(alignment, handle, format)
190 count += 1
191 elif format in _FormatToIterator or format in SeqIO._FormatToIterator :
192 raise ValueError("Reading format '%s' is supported, but not writing" \
193 % format)
194 else :
195 raise ValueError("Unknown format '%s'" % format)
196
197 assert isinstance(count, int), "Internal error - the underlying writer " \
198 + " should have returned the alignment count, not %s" % repr(count)
199 return count
200
201
203 """Uses Bio.SeqIO to create an Alignment iterator (PRIVATE).
204
205 Arguments:
206 - handle - handle to the file.
207 - format - string describing the file format.
208 - alphabet - optional Alphabet object, useful when the sequence type
209 cannot be automatically inferred from the file itself
210 (e.g. fasta, phylip, clustal)
211 - seq_count - Optional integer, number of sequences expected in each
212 alignment. Recommended for fasta format files.
213
214 If count is omitted (default) then all the sequences in
215 the file are combined into a single Alignment.
216 """
217 from Bio import SeqIO
218 assert format in SeqIO._FormatToIterator
219
220 if seq_count :
221
222 seq_record_iterator = SeqIO.parse(handle, format, alphabet)
223
224 records = []
225 for record in seq_record_iterator :
226 records.append(record)
227 if len(records) == seq_count :
228 yield SeqIO.to_alignment(records)
229 records = []
230 if len(records) > 0 :
231 raise ValueError("Check seq_count argument, not enough sequences?")
232 else :
233
234
235 records = list(SeqIO.parse(handle, format, alphabet))
236 if records :
237 yield SeqIO.to_alignment(records)
238 else :
239
240 pass
241
261
262 -def parse(handle, format, seq_count=None, alphabet=None) :
263 """Turns a sequence file into an iterator returning Alignment objects.
264
265 Arguments:
266 - handle - handle to the file.
267 - format - string describing the file format.
268 - alphabet - optional Alphabet object, useful when the sequence type
269 cannot be automatically inferred from the file itself
270 (e.g. fasta, phylip, clustal)
271 - seq_count - Optional integer, number of sequences expected in each
272 alignment. Recommended for fasta format files.
273
274 If you have the file name in a string 'filename', use:
275
276 >>> from Bio import AlignIO
277 >>> filename = "Emboss/needle.txt"
278 >>> format = "emboss"
279 >>> for alignment in AlignIO.parse(open(filename,"rU"), format) :
280 ... print "Alignment of length", alignment.get_alignment_length()
281 Alignment of length 124
282 Alignment of length 119
283 Alignment of length 120
284 Alignment of length 118
285 Alignment of length 125
286
287 If you have a string 'data' containing the file contents, use:
288
289 from Bio import AlignIO
290 from StringIO import StringIO
291 my_iterator = AlignIO.parse(StringIO(data), format)
292
293 Use the Bio.AlignIO.read() function when you expect a single record only.
294 """
295 from Bio import SeqIO
296
297
298 if isinstance(handle, basestring) :
299 raise TypeError("Need a file handle, not a string (i.e. not a filename)")
300 if not isinstance(format, basestring) :
301 raise TypeError("Need a string for the file format (lower case)")
302 if not format :
303 raise ValueError("Format required (lower case string)")
304 if format != format.lower() :
305 raise ValueError("Format string '%s' should be lower case" % format)
306 if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
307 isinstance(alphabet, AlphabetEncoder)) :
308 raise ValueError("Invalid alphabet, %s" % repr(alphabet))
309
310
311 if format in _FormatToIterator :
312 iterator_generator = _FormatToIterator[format]
313 if alphabet is None :
314 return iterator_generator(handle, seq_count)
315 try :
316
317 return iterator_generator(handle, seq_count, alphabet=alphabet)
318 except TypeError :
319
320 return _force_alphabet(iterator_generator(handle, seq_count), alphabet)
321
322 elif format in SeqIO._FormatToIterator :
323
324 return _SeqIO_to_alignment_iterator(handle, format,
325 alphabet=alphabet,
326 seq_count=seq_count)
327 else :
328 raise ValueError("Unknown format '%s'" % format)
329
330 -def read(handle, format, seq_count=None, alphabet=None) :
331 """Turns an alignment file into a single Alignment object.
332
333 Arguments:
334 - handle - handle to the file.
335 - format - string describing the file format.
336 - alphabet - optional Alphabet object, useful when the sequence type
337 cannot be automatically inferred from the file itself
338 (e.g. fasta, phylip, clustal)
339 - seq_count - Optional integer, number of sequences expected in each
340 alignment. Recommended for fasta format files.
341
342 If the handle contains no alignments, or more than one alignment,
343 an exception is raised. For example, using a PFAM/Stockholm file
344 containing one alignment:
345
346 >>> from Bio import AlignIO
347 >>> filename = "Clustalw/protein.aln"
348 >>> format = "clustal"
349 >>> alignment = AlignIO.read(open(filename, "rU"), format)
350 >>> print "Alignment of length", alignment.get_alignment_length()
351 Alignment of length 411
352
353 If however you want the first alignment from a file containing
354 multiple alignments this function would raise an exception.
355
356 >>> from Bio import AlignIO
357 >>> filename = "Emboss/needle.txt"
358 >>> format = "emboss"
359 >>> alignment = AlignIO.read(open(filename, "rU"), format)
360 Traceback (most recent call last):
361 ...
362 ValueError: More than one record found in handle
363
364 Instead use:
365
366 >>> from Bio import AlignIO
367 >>> filename = "Emboss/needle.txt"
368 >>> format = "emboss"
369 >>> alignment = AlignIO.parse(open(filename, "rU"), format).next()
370 >>> print "First alignment has length", alignment.get_alignment_length()
371 First alignment has length 124
372
373 You must use the Bio.AlignIO.parse() function if you want to read multiple
374 records from the handle.
375 """
376 iterator = parse(handle, format, seq_count, alphabet)
377 try :
378 first = iterator.next()
379 except StopIteration :
380 first = None
381 if first is None :
382 raise ValueError("No records found in handle")
383 try :
384 second = iterator.next()
385 except StopIteration :
386 second = None
387 if second is not None :
388 raise ValueError("More than one record found in handle")
389 if seq_count :
390 assert len(first.get_all_seqs())==seq_count
391 return first
392
394 """Run the Bio.AlignIO module's doctests.
395
396 This will try and locate the unit tests directory, and run the doctests
397 from there in order that the relative paths used in the examples work.
398 """
399 import doctest
400 import os
401 if os.path.isdir(os.path.join("..","..","Tests")) :
402 print "Runing doctests..."
403 cur_dir = os.path.abspath(os.curdir)
404 os.chdir(os.path.join("..","..","Tests"))
405 doctest.testmod()
406 os.chdir(cur_dir)
407 del cur_dir
408 print "Done"
409
410 if __name__ == "__main__" :
411 _test()
412