1
2
3
4
5
6 """
7 This module contains a parser for the pairwise alignments produced by Bill
8 Pearson's FASTA tool, for use from the Bio.AlignIO interface where it is
9 refered to as the "fasta-m10" file format (as we only support the machine
10 readable output format selected with the -m 10 command line option).
11
12 This module does NOT cover the generic "fasta" file format originally
13 developed as an input format to the FASTA tools. The Bio.AlignIO and
14 Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files,
15 which can also be used to store a multiple sequence alignments.
16 """
17
18 from Bio.Align.Generic import Alignment
19 from Interfaces import AlignmentIterator
20
22 """Alignment iterator for the FASTA tool's pairwise alignment output.
23
24 This is for reading the pairwise alignments output by Bill Pearson's
25 FASTA program when called with the -m 10 command line option for machine
26 readable output. For more details about the FASTA tools, see the website
27 http://fasta.bioch.virginia.edu/ and the paper:
28
29 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
30
31 This class is intended to be used via the Bio.AlignIO.parse() function
32 by specifying the format as "fasta-m10" as shown in the following code:
33
34 from Bio import AlignIO
35 handle = ...
36 for a in AlignIO.parse(handle, "fasta-m10") :
37 assert len(a.get_all_seqs()) == 2, "Should be pairwise!"
38 print "Alignment length %i" % a.get_alignment_length()
39 for record in a :
40 print record.seq, record.name, record.id
41
42 Note that this is not a full blown parser for all the information
43 in the FASTA output - for example, most of the header and all of the
44 footer is ignored. Also, the alignments are not batched according to
45 the input queries.
46
47 Also note that there can be up to about 30 letters of flanking region
48 included in the raw FASTA output as contextual information. This is NOT
49 part of the alignment itself, and is not included in the resulting
50 Alignment objects returned.
51 """
52
54 """Reads from the handle to construct and return the next alignment.
55
56 This returns the pairwise alignment of query and match/library
57 sequences as an Alignment object containing two rows."""
58 handle = self.handle
59
60 try :
61
62
63 line = self._header
64 del self._header
65 except AttributeError:
66 line = handle.readline()
67 if not line:
68 return None
69
70 if line.startswith("#") :
71
72 line = self._skip_file_header(line)
73 if ">>>" in line and not line.startswith(">>>") :
74
75 self._query_descr = ""
76 self._query_header_annotation = {}
77
78 line = self._parse_query_header(line)
79 if not line :
80
81 return None
82 if ">>><<<" in line :
83
84 return None
85
86
87 assert line.startswith(">>") and not line.startswith(">>>"), line
88
89 query_seq_parts, match_seq_parts = [], []
90 query_annotation, match_annotation = {}, {}
91 match_descr = ""
92 alignment_annotation = {}
93
94
95
96 """
97 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
98 ; fa_frame: f
99 ; fa_initn: 52
100 ; fa_init1: 52
101 ; fa_opt: 70
102 ; fa_z-score: 105.5
103 ; fa_bits: 27.5
104 ; fa_expect: 0.082
105 ; sw_score: 70
106 ; sw_ident: 0.279
107 ; sw_sim: 0.651
108 ; sw_overlap: 43
109 """
110 if not line.startswith(">>") and not line.startswith(">>>") :
111 raise ValueError("Expected target line starting '>>'")
112 match_descr = line[2:].strip()
113
114 line = handle.readline()
115 line = self._parse_tag_section(line, alignment_annotation)
116 assert not line.startswith("; ")
117
118
119 """
120 >gi|10955265| ..
121 ; sq_len: 346
122 ; sq_offset: 1
123 ; sq_type: p
124 ; al_start: 197
125 ; al_stop: 238
126 ; al_display_start: 167
127 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
128 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
129 GEYFTENKPKYIIREIHQET
130 """
131 if not (line.startswith(">") and line.strip().endswith("..")):
132 raise ValueError("Expected line starting '>' and ending '..'")
133 assert self._query_descr.startswith(line[1:].split()[0])
134
135
136 line = handle.readline()
137 line = self._parse_tag_section(line, query_annotation)
138 assert not line.startswith("; ")
139
140
141 while not line.startswith(">") :
142 query_seq_parts.append(line.strip())
143 line = handle.readline()
144
145
146 """
147 >gi|152973545|ref|YP_001338596.1| ..
148 ; sq_len: 242
149 ; sq_type: p
150 ; al_start: 52
151 ; al_stop: 94
152 ; al_display_start: 22
153 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
154 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
155 QDFAFTRKMRREARQVEQSW
156 """
157
158 if not (line.startswith(">") and line.strip().endswith("..")):
159 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
160 assert match_descr.startswith(line[1:].split()[0])
161
162
163 line = handle.readline()
164 line = self._parse_tag_section(line, match_annotation)
165 assert not line.startswith("; ")
166
167
168 while not (line.startswith(">") or ">>>" in line):
169 match_seq_parts.append(line.strip())
170 line = handle.readline()
171 self._header = line
172
173
174
175 query_seq = "".join(query_seq_parts)
176 match_seq = "".join(match_seq_parts)
177 del query_seq_parts, match_seq_parts
178
179
180
181
182 query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
183 match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
184
185
186
187
188 if ("sq_offset" in query_annotation and query_annotation["sq_offset"]<>"1") \
189 or ("sq_offset" in match_annotation and match_annotation["sq_offset"]<>"1") :
190
191
192
193
194
195 pass
196
197 if len(query_align_seq) <> len(match_align_seq) :
198 raise ValueError("Problem parsing the alignment sequence coordinates")
199 if "sw_overlap" in alignment_annotation :
200 if int(alignment_annotation["sw_overlap"]) <> len(query_align_seq) :
201 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
202 % (alignment_annotation["sw_overlap"],
203 len(query_align_seq)))
204
205
206 alignment = Alignment(self.alphabet)
207
208
209
210 alignment._annotations = {}
211
212
213 for key, value in self._query_header_annotation.iteritems() :
214 alignment._annotations[key] = value
215 for key, value in alignment_annotation.iteritems() :
216 alignment._annotations[key] = value
217
218
219
220
221 alignment.add_sequence(self._query_descr, query_align_seq)
222 record = alignment.get_all_seqs()[-1]
223 assert record.id == self._query_descr or record.description == self._query_descr
224 assert record.seq.tostring() == query_align_seq
225 record.id = self._query_descr.split()[0].strip(",")
226 record.name = "query"
227 record.annotations["original_length"] = int(query_annotation["sq_len"])
228
229 alignment.add_sequence(match_descr, match_align_seq)
230 record = alignment.get_all_seqs()[-1]
231 assert record.id == match_descr or record.description == match_descr
232 assert record.seq.tostring() == match_align_seq
233 record.id = match_descr.split()[0].strip(",")
234 record.name = "match"
235 record.annotations["original_length"] = int(match_annotation["sq_len"])
236
237 return alignment
238
240 """Helper function for the main parsing code.
241
242 Skips over the file header region.
243 """
244
245 """
246 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa
247 FASTA searches a protein or DNA sequence data bank
248 version 35.03 Feb. 18, 2008
249 Please cite:
250 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
251
252 Query: NC_002127.faa
253 """
254
255
256
257 while ">>>" not in line :
258 line = self.handle.readline()
259 return line
260
262 """Helper function for the main parsing code.
263
264 Skips over the free format query header, extracting the tagged parameters.
265
266 If there are no hits for the current query, it is skipped entirely."""
267
268 """
269 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
270 Library: NC_009649.faa 45119 residues in 180 sequences
271
272 45119 residues in 180 sequences
273 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508
274 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32
275 Lambda= 0.191631
276 Algorithm: FASTA (3.5 Sept 2006) [optimized]
277 Parameters: BL50 matrix (15:-5) ktup: 2
278 join: 36, opt: 24, open/ext: -10/-2, width: 16
279 Scan time: 0.040
280
281 The best scores are: opt bits E(180)
282 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22
283 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93
284 """
285
286 """
287 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa
288 vs NC_002127.faa library
289
290 579 residues in 3 sequences
291 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100
292
293 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
294 join: 36, opt: 24, open/ext: -10/-2, width: 16
295 Scan time: 0.000
296 !! No library sequences with E() < 0.5
297 """
298
299 self._query_header_annotation = {}
300 self._query_descr = ""
301
302 assert ">>>" in line and not line.startswith(">>>")
303
304
305 line = self.handle.readline()
306
307 while not line.startswith(">>>") :
308
309 line = self.handle.readline()
310 if not line :
311 raise ValueError("Premature end of file!")
312 if ">>><<<" in line :
313
314
315 return line
316
317
318 """
319 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
320 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35
321 ; pg_ver: 35.03
322 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa
323 ; pg_name: FASTA
324 ; pg_ver: 3.5 Sept 2006
325 ; pg_matrix: BL50 (15:-5)
326 ; pg_open-ext: -10 -2
327 ; pg_ktup: 2
328 ; pg_optcut: 24
329 ; pg_cgap: 36
330 ; mp_extrap: 60000 500
331 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631
332 ; mp_KS: -0.0000 (N=1066338402) at 20
333 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized]
334 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16
335 """
336
337 assert line.startswith(">>>"), line
338 self._query_descr = line[3:].strip()
339
340
341 line = self.handle.readline()
342 line = self._parse_tag_section(line, self._query_header_annotation)
343 assert not line.startswith("; "), line
344 assert line.startswith(">>"), line
345 return line
346
347
349 """Helper function for the main parsing code.
350
351 To get the actual pairwise alignment sequences, we must first
352 translate the un-gapped sequence based coordinates into positions
353 in the gapped sequence (which may have a flanking region shown
354 using leading - characters). To date, I have never seen any
355 trailing flanking region shown in the m10 file, but the
356 following code should also cope with that.
357
358 Note that this code seems to work fine even when the "sq_offset"
359 entries are prsent as a result of using the -X command line option.
360 """
361 align_stripped = alignment_seq_with_flanking.strip("-")
362 start = int(annotation['al_start']) \
363 - int(annotation['al_display_start'])
364 end = int(annotation['al_stop']) \
365 - int(annotation['al_display_start']) \
366 + align_stripped.count("-") + 1
367 return align_stripped[start:end]
368
369
371 """Helper function for the main parsing code.
372
373 line - supply line just read from the handle containing the start of
374 the tagged section.
375 dictionary - where to record the tagged values
376
377 Returns a string, the first line following the tagged section."""
378 if not line.startswith("; ") :
379 raise ValueError("Expected line starting '; '")
380 while line.startswith("; ") :
381 tag, value = line[2:].strip().split(": ",1)
382
383
384
385
386
387 dictionary[tag] = value
388 line = self.handle.readline()
389 return line
390
391 if __name__ == "__main__" :
392 print "Running a quick self-test"
393
394
395 simple_example = \
396 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
397 FASTA searches a protein or DNA sequence data bank
398 version 34.26 January 12, 2007
399 Please cite:
400 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
401
402 Query library NC_002127.faa vs NC_009649.faa library
403 searching NC_009649.faa library
404
405 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa
406 vs NC_009649.faa library
407
408 45119 residues in 180 sequences
409 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273
410 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25
411 Lambda= 0.175043
412
413 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
414 join: 36, opt: 24, open/ext: -10/-2, width: 16
415 Scan time: 0.000
416 The best scores are: opt bits E(180)
417 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58
418 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99
419
420 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library
421 ; pg_name: /opt/fasta/fasta34
422 ; pg_ver: 34.26
423 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
424 ; pg_name: FASTA
425 ; pg_ver: 3.5 Sept 2006
426 ; pg_matrix: BL50 (15:-5)
427 ; pg_open-ext: -10 -2
428 ; pg_ktup: 2
429 ; pg_optcut: 24
430 ; pg_cgap: 36
431 ; mp_extrap: 60000 180
432 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043
433 ; mp_KS: -0.0000 (N=0) at 8159228
434 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
435 ; fa_frame: f
436 ; fa_initn: 65
437 ; fa_init1: 43
438 ; fa_opt: 71
439 ; fa_z-score: 90.3
440 ; fa_bits: 24.9
441 ; fa_expect: 0.58
442 ; sw_score: 71
443 ; sw_ident: 0.250
444 ; sw_sim: 0.574
445 ; sw_overlap: 108
446 >gi|10955263| ..
447 ; sq_len: 107
448 ; sq_offset: 1
449 ; sq_type: p
450 ; al_start: 5
451 ; al_stop: 103
452 ; al_display_start: 1
453 --------------------------MTKRSGSNT-RRRAISRPVRLTAE
454 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM-----
455 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD
456 >gi|152973457|ref|YP_001338508.1| ..
457 ; sq_len: 931
458 ; sq_type: p
459 ; al_start: 96
460 ; al_stop: 195
461 ; al_display_start: 66
462 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ
463 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI
464 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY
465 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ
466 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
467 ; fa_frame: f
468 ; fa_initn: 33
469 ; fa_init1: 33
470 ; fa_opt: 63
471 ; fa_z-score: 86.1
472 ; fa_bits: 23.1
473 ; fa_expect: 0.99
474 ; sw_score: 63
475 ; sw_ident: 0.266
476 ; sw_sim: 0.656
477 ; sw_overlap: 64
478 >gi|10955263| ..
479 ; sq_len: 107
480 ; sq_offset: 1
481 ; sq_type: p
482 ; al_start: 32
483 ; al_stop: 94
484 ; al_display_start: 2
485 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK
486 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL
487 LSRLMAD
488 >gi|152973588|ref|YP_001338639.1| ..
489 ; sq_len: 459
490 ; sq_type: p
491 ; al_start: 191
492 ; al_stop: 248
493 ; al_display_start: 161
494 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK
495 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG
496 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT
497 SNKALKSQISALLSSIQNKAVADEKLTDQE
498 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
499 vs NC_009649.faa library
500
501 45119 residues in 180 sequences
502 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313
503 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25
504 Lambda= 0.179384
505
506 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
507 join: 36, opt: 24, open/ext: -10/-2, width: 16
508 Scan time: 0.000
509 The best scores are: opt bits E(180)
510 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29
511
512 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
513 ; pg_name: /opt/fasta/fasta34
514 ; pg_ver: 34.26
515 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
516 ; pg_name: FASTA
517 ; pg_ver: 3.5 Sept 2006
518 ; pg_matrix: BL50 (15:-5)
519 ; pg_open-ext: -10 -2
520 ; pg_ktup: 2
521 ; pg_optcut: 24
522 ; pg_cgap: 36
523 ; mp_extrap: 60000 180
524 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384
525 ; mp_KS: -0.0000 (N=0) at 8159228
526 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
527 ; fa_frame: f
528 ; fa_initn: 50
529 ; fa_init1: 50
530 ; fa_opt: 58
531 ; fa_z-score: 95.8
532 ; fa_bits: 22.9
533 ; fa_expect: 0.29
534 ; sw_score: 58
535 ; sw_ident: 0.289
536 ; sw_sim: 0.632
537 ; sw_overlap: 38
538 >gi|10955264| ..
539 ; sq_len: 126
540 ; sq_offset: 1
541 ; sq_type: p
542 ; al_start: 1
543 ; al_stop: 38
544 ; al_display_start: 1
545 ------------------------------MKKDKKYQIEAIKNKDKTLF
546 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF
547 NGEKFSSYTLNKVTKTDEYN
548 >gi|152973462|ref|YP_001338513.1| ..
549 ; sq_len: 101
550 ; sq_type: p
551 ; al_start: 44
552 ; al_stop: 81
553 ; al_display_start: 14
554 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI
555 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA
556 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa
557 vs NC_009649.faa library
558
559 45119 residues in 180 sequences
560 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461
561 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25
562 Lambda= 0.210386
563
564 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
565 join: 37, opt: 25, open/ext: -10/-2, width: 16
566 Scan time: 0.020
567 The best scores are: opt bits E(180)
568 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082
569
570 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library
571 ; pg_name: /opt/fasta/fasta34
572 ; pg_ver: 34.26
573 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
574 ; pg_name: FASTA
575 ; pg_ver: 3.5 Sept 2006
576 ; pg_matrix: BL50 (15:-5)
577 ; pg_open-ext: -10 -2
578 ; pg_ktup: 2
579 ; pg_optcut: 25
580 ; pg_cgap: 37
581 ; mp_extrap: 60000 180
582 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386
583 ; mp_KS: -0.0000 (N=0) at 8159228
584 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
585 ; fa_frame: f
586 ; fa_initn: 52
587 ; fa_init1: 52
588 ; fa_opt: 70
589 ; fa_z-score: 105.5
590 ; fa_bits: 27.5
591 ; fa_expect: 0.082
592 ; sw_score: 70
593 ; sw_ident: 0.279
594 ; sw_sim: 0.651
595 ; sw_overlap: 43
596 >gi|10955265| ..
597 ; sq_len: 346
598 ; sq_offset: 1
599 ; sq_type: p
600 ; al_start: 197
601 ; al_stop: 238
602 ; al_display_start: 167
603 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
604 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
605 GEYFTENKPKYIIREIHQET
606 >gi|152973545|ref|YP_001338596.1| ..
607 ; sq_len: 242
608 ; sq_type: p
609 ; al_start: 52
610 ; al_stop: 94
611 ; al_display_start: 22
612 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
613 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
614 QDFAFTRKMRREARQVEQSW
615 >>><<<
616
617
618 579 residues in 3 query sequences
619 45119 residues in 180 library sequences
620 Scomplib [34.26]
621 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008
622 Total Scan time: 0.020 Total Display time: 0.010
623
624 Function used was FASTA [version 34.26 January 12, 2007]
625
626 """
627
628
629 from StringIO import StringIO
630
631 alignments = list(FastaM10Iterator(StringIO(simple_example)))
632 assert len(alignments) == 4, len(alignments)
633 assert len(alignments[0].get_all_seqs()) == 2
634 for a in alignments :
635 print "Alignment %i sequences of length %i" \
636 % (len(a.get_all_seqs()), a.get_alignment_length())
637 for r in a :
638 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"])
639
640 print "Done"
641
642 import os
643 path = "../../Tests/Fasta/"
644 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"]
645 files.sort()
646 for filename in files :
647 if os.path.splitext(filename)[-1] == ".m10" :
648 print
649 print filename
650 print "="*len(filename)
651 for a in FastaM10Iterator(open(os.path.join(path,filename))):
652 print a
653