1
2
3
4
5
6 """
7 Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools.
8
9 You are expected to use this module via the Bio.AlignIO functions (or the
10 Bio.SeqIO functions if you want to work directly with the gapped sequences).
11
12 This module contains a parser for the pairwise alignments produced by Bill
13 Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is
14 refered to as the "fasta-m10" file format (as we only support the machine
15 readable output format selected with the -m 10 command line option).
16
17 This module does NOT cover the generic "fasta" file format originally
18 developed as an input format to the FASTA tools. The Bio.AlignIO and
19 Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files,
20 which can also be used to store a multiple sequence alignments.
21 """
22
23 from Bio.Align.Generic import Alignment
24 from Interfaces import AlignmentIterator
25 from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein
26 from Bio.Alphabet import Gapped
27
28
30 """Alignment iterator for the FASTA tool's pairwise alignment output.
31
32 This is for reading the pairwise alignments output by Bill Pearson's
33 FASTA program when called with the -m 10 command line option for machine
34 readable output. For more details about the FASTA tools, see the website
35 http://fasta.bioch.virginia.edu/ and the paper:
36
37 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
38
39 This class is intended to be used via the Bio.AlignIO.parse() function
40 by specifying the format as "fasta-m10" as shown in the following code:
41
42 from Bio import AlignIO
43 handle = ...
44 for a in AlignIO.parse(handle, "fasta-m10") :
45 assert len(a.get_all_seqs()) == 2, "Should be pairwise!"
46 print "Alignment length %i" % a.get_alignment_length()
47 for record in a :
48 print record.seq, record.name, record.id
49
50 Note that this is not a full blown parser for all the information
51 in the FASTA output - for example, most of the header and all of the
52 footer is ignored. Also, the alignments are not batched according to
53 the input queries.
54
55 Also note that there can be up to about 30 letters of flanking region
56 included in the raw FASTA output as contextual information. This is NOT
57 part of the alignment itself, and is not included in the resulting
58 Alignment objects returned.
59 """
60
62 """Reads from the handle to construct and return the next alignment.
63
64 This returns the pairwise alignment of query and match/library
65 sequences as an Alignment object containing two rows."""
66 handle = self.handle
67
68 try :
69
70
71 line = self._header
72 del self._header
73 except AttributeError:
74 line = handle.readline()
75 if not line:
76 return None
77
78 if line.startswith("#") :
79
80 line = self._skip_file_header(line)
81 while ">>>" in line and not line.startswith(">>>") :
82
83 self._query_descr = ""
84 self._query_header_annotation = {}
85
86 line = self._parse_query_header(line)
87
88 if not line :
89
90 return None
91 if ">>><<<" in line :
92
93 return None
94
95
96
97 assert line[0:2] == ">>" and not line[2] == ">", line
98
99 query_seq_parts, match_seq_parts = [], []
100 query_annotation, match_annotation = {}, {}
101 match_descr = ""
102 alignment_annotation = {}
103
104
105
106 """
107 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
108 ; fa_frame: f
109 ; fa_initn: 52
110 ; fa_init1: 52
111 ; fa_opt: 70
112 ; fa_z-score: 105.5
113 ; fa_bits: 27.5
114 ; fa_expect: 0.082
115 ; sw_score: 70
116 ; sw_ident: 0.279
117 ; sw_sim: 0.651
118 ; sw_overlap: 43
119 """
120 if (not line[0:2] == ">>") or line[0:3] == ">>>" :
121 raise ValueError("Expected target line starting '>>'")
122 match_descr = line[2:].strip()
123
124 line = handle.readline()
125 line = self._parse_tag_section(line, alignment_annotation)
126 assert not line[0:2] == "; "
127
128
129 """
130 >gi|10955265| ..
131 ; sq_len: 346
132 ; sq_offset: 1
133 ; sq_type: p
134 ; al_start: 197
135 ; al_stop: 238
136 ; al_display_start: 167
137 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
138 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
139 GEYFTENKPKYIIREIHQET
140 """
141 if not (line[0] == ">" and line.strip().endswith("..")):
142 raise ValueError("Expected line starting '>' and ending '..'")
143 assert self._query_descr.startswith(line[1:].split(None,1)[0])
144
145
146 line = handle.readline()
147 line = self._parse_tag_section(line, query_annotation)
148 assert not line[0:2] == "; "
149
150
151 while not line[0] == ">" :
152 query_seq_parts.append(line.strip())
153 line = handle.readline()
154
155
156 """
157 >gi|152973545|ref|YP_001338596.1| ..
158 ; sq_len: 242
159 ; sq_type: p
160 ; al_start: 52
161 ; al_stop: 94
162 ; al_display_start: 22
163 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
164 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
165 QDFAFTRKMRREARQVEQSW
166 """
167
168 if not (line[0] == ">" and line.strip().endswith("..")):
169 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
170 assert match_descr.startswith(line[1:].split(None,1)[0])
171
172
173 line = handle.readline()
174 line = self._parse_tag_section(line, match_annotation)
175 assert not line[0:2] == "; "
176
177
178
179 """
180 ; al_cons:
181 .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
182 etc
183 """
184 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
185 match_seq_parts.append(line.strip())
186 line = handle.readline()
187 if line[0:2] == "; " :
188 assert line.strip() == "; al_cons:"
189 align_consensus_parts = []
190 line = handle.readline()
191 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
192 align_consensus_parts.append(line.strip())
193 line = handle.readline()
194
195 align_consensus = "".join(align_consensus_parts)
196 del align_consensus_parts
197 assert not line[0:2] == "; "
198 else :
199 align_consensus = None
200 assert (line[0] == ">" or ">>>" in line)
201 self._header = line
202
203
204
205 query_seq = "".join(query_seq_parts)
206 match_seq = "".join(match_seq_parts)
207 del query_seq_parts, match_seq_parts
208
209
210
211
212 query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
213 match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
214
215
216
217
218
219 if len(query_align_seq) != len(match_align_seq) :
220 raise ValueError("Problem parsing the alignment sequence coordinates, "
221 "following should be the same length but are not:\n"
222 "%s - len %i\n%s - len %i" % (query_align_seq,
223 len(query_align_seq),
224 match_align_seq,
225 len(match_align_seq)))
226 if "sw_overlap" in alignment_annotation :
227 if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
228 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
229 % (alignment_annotation["sw_overlap"],
230 len(query_align_seq)))
231
232
233 alphabet = self.alphabet
234 alignment = Alignment(alphabet)
235
236
237
238 alignment._annotations = {}
239
240
241 for key, value in self._query_header_annotation.iteritems() :
242 alignment._annotations[key] = value
243 for key, value in alignment_annotation.iteritems() :
244 alignment._annotations[key] = value
245
246
247
248
249 alignment.add_sequence(self._query_descr, query_align_seq)
250 record = alignment.get_all_seqs()[-1]
251 assert record.id == self._query_descr or record.description == self._query_descr
252
253 record.id = self._query_descr.split(None,1)[0].strip(",")
254 record.name = "query"
255 record.annotations["original_length"] = int(query_annotation["sq_len"])
256
257
258
259
260 if alphabet == single_letter_alphabet and "sq_type" in query_annotation :
261 if query_annotation["sq_type"] == "D" :
262 record.seq.alphabet = generic_dna
263 elif query_annotation["sq_type"] == "p" :
264 record.seq.alphabet = generic_protein
265 if "-" in query_align_seq :
266 if not hasattr(record.seq.alphabet,"gap_char") :
267 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
268
269 alignment.add_sequence(match_descr, match_align_seq)
270 record = alignment.get_all_seqs()[-1]
271 assert record.id == match_descr or record.description == match_descr
272
273 record.id = match_descr.split(None,1)[0].strip(",")
274 record.name = "match"
275 record.annotations["original_length"] = int(match_annotation["sq_len"])
276
277
278 if alphabet == single_letter_alphabet and "sq_type" in match_annotation :
279 if match_annotation["sq_type"] == "D" :
280 record.seq.alphabet = generic_dna
281 elif match_annotation["sq_type"] == "p" :
282 record.seq.alphabet = generic_protein
283 if "-" in match_align_seq :
284 if not hasattr(record.seq.alphabet,"gap_char") :
285 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
286
287 return alignment
288
290 """Helper function for the main parsing code.
291
292 Skips over the file header region.
293 """
294
295 """
296 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa
297 FASTA searches a protein or DNA sequence data bank
298 version 35.03 Feb. 18, 2008
299 Please cite:
300 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
301
302 Query: NC_002127.faa
303 """
304
305
306
307 while ">>>" not in line :
308 line = self.handle.readline()
309 return line
310
312 """Helper function for the main parsing code.
313
314 Skips over the free format query header, extracting the tagged parameters.
315
316 If there are no hits for the current query, it is skipped entirely."""
317
318 """
319 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
320 Library: NC_009649.faa 45119 residues in 180 sequences
321
322 45119 residues in 180 sequences
323 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508
324 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32
325 Lambda= 0.191631
326 Algorithm: FASTA (3.5 Sept 2006) [optimized]
327 Parameters: BL50 matrix (15:-5) ktup: 2
328 join: 36, opt: 24, open/ext: -10/-2, width: 16
329 Scan time: 0.040
330
331 The best scores are: opt bits E(180)
332 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22
333 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93
334 """
335
336
337 """
338 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa
339 vs NC_002127.faa library
340
341 579 residues in 3 sequences
342 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100
343
344 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
345 join: 36, opt: 24, open/ext: -10/-2, width: 16
346 Scan time: 0.000
347 !! No library sequences with E() < 0.5
348 """
349
350 self._query_header_annotation = {}
351 self._query_descr = ""
352
353 assert ">>>" in line and not line[0:3] == ">>>"
354
355
356 line = self.handle.readline()
357
358 while not line[0:3] == ">>>" :
359
360 line = self.handle.readline()
361 if not line :
362 raise ValueError("Premature end of file!")
363 if ">>><<<" in line :
364
365
366 return line
367
368
369 """
370 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
371 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35
372 ; pg_ver: 35.03
373 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa
374 ; pg_name: FASTA
375 ; pg_ver: 3.5 Sept 2006
376 ; pg_matrix: BL50 (15:-5)
377 ; pg_open-ext: -10 -2
378 ; pg_ktup: 2
379 ; pg_optcut: 24
380 ; pg_cgap: 36
381 ; mp_extrap: 60000 500
382 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631
383 ; mp_KS: -0.0000 (N=1066338402) at 20
384 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized]
385 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16
386 """
387
388 assert line[0:3] == ">>>", line
389 self._query_descr = line[3:].strip()
390
391
392 line = self.handle.readline()
393 line = self._parse_tag_section(line, self._query_header_annotation)
394 assert not line[0:2] == "; ", line
395 assert line[0:2] == ">>" or ">>>" in line, line
396 return line
397
398
400 """Helper function for the main parsing code.
401
402 To get the actual pairwise alignment sequences, we must first
403 translate the un-gapped sequence based coordinates into positions
404 in the gapped sequence (which may have a flanking region shown
405 using leading - characters). To date, I have never seen any
406 trailing flanking region shown in the m10 file, but the
407 following code should also cope with that.
408
409 Note that this code seems to work fine even when the "sq_offset"
410 entries are prsent as a result of using the -X command line option.
411 """
412 align_stripped = alignment_seq_with_flanking.strip("-")
413 display_start = int(annotation['al_display_start'])
414 if int(annotation['al_start']) <= int(annotation['al_stop']) :
415 start = int(annotation['al_start']) \
416 - display_start
417 end = int(annotation['al_stop']) \
418 - display_start \
419 + align_stripped.count("-") + 1
420 else :
421
422 start = display_start \
423 - int(annotation['al_start'])
424 end = display_start \
425 - int(annotation['al_stop']) \
426 + align_stripped.count("-") + 1
427 assert 0 <= start and start < end and end <= len(align_stripped), \
428 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \
429 % (alignment_seq_with_flanking, start, end, annotation)
430 return align_stripped[start:end]
431
432
434 """Helper function for the main parsing code.
435
436 line - supply line just read from the handle containing the start of
437 the tagged section.
438 dictionary - where to record the tagged values
439
440 Returns a string, the first line following the tagged section."""
441 if not line[0:2] == "; " :
442 raise ValueError("Expected line starting '; '")
443 while line[0:2] == "; " :
444 tag, value = line[2:].strip().split(": ",1)
445
446
447
448
449
450
451 dictionary[tag] = value
452 line = self.handle.readline()
453 return line
454
455 if __name__ == "__main__" :
456 print "Running a quick self-test"
457
458
459 simple_example = \
460 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
461 FASTA searches a protein or DNA sequence data bank
462 version 34.26 January 12, 2007
463 Please cite:
464 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
465
466 Query library NC_002127.faa vs NC_009649.faa library
467 searching NC_009649.faa library
468
469 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa
470 vs NC_009649.faa library
471
472 45119 residues in 180 sequences
473 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273
474 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25
475 Lambda= 0.175043
476
477 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
478 join: 36, opt: 24, open/ext: -10/-2, width: 16
479 Scan time: 0.000
480 The best scores are: opt bits E(180)
481 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58
482 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99
483
484 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library
485 ; pg_name: /opt/fasta/fasta34
486 ; pg_ver: 34.26
487 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
488 ; pg_name: FASTA
489 ; pg_ver: 3.5 Sept 2006
490 ; pg_matrix: BL50 (15:-5)
491 ; pg_open-ext: -10 -2
492 ; pg_ktup: 2
493 ; pg_optcut: 24
494 ; pg_cgap: 36
495 ; mp_extrap: 60000 180
496 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043
497 ; mp_KS: -0.0000 (N=0) at 8159228
498 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
499 ; fa_frame: f
500 ; fa_initn: 65
501 ; fa_init1: 43
502 ; fa_opt: 71
503 ; fa_z-score: 90.3
504 ; fa_bits: 24.9
505 ; fa_expect: 0.58
506 ; sw_score: 71
507 ; sw_ident: 0.250
508 ; sw_sim: 0.574
509 ; sw_overlap: 108
510 >gi|10955263| ..
511 ; sq_len: 107
512 ; sq_offset: 1
513 ; sq_type: p
514 ; al_start: 5
515 ; al_stop: 103
516 ; al_display_start: 1
517 --------------------------MTKRSGSNT-RRRAISRPVRLTAE
518 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM-----
519 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD
520 >gi|152973457|ref|YP_001338508.1| ..
521 ; sq_len: 931
522 ; sq_type: p
523 ; al_start: 96
524 ; al_stop: 195
525 ; al_display_start: 66
526 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ
527 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI
528 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY
529 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ
530 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
531 ; fa_frame: f
532 ; fa_initn: 33
533 ; fa_init1: 33
534 ; fa_opt: 63
535 ; fa_z-score: 86.1
536 ; fa_bits: 23.1
537 ; fa_expect: 0.99
538 ; sw_score: 63
539 ; sw_ident: 0.266
540 ; sw_sim: 0.656
541 ; sw_overlap: 64
542 >gi|10955263| ..
543 ; sq_len: 107
544 ; sq_offset: 1
545 ; sq_type: p
546 ; al_start: 32
547 ; al_stop: 94
548 ; al_display_start: 2
549 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK
550 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL
551 LSRLMAD
552 >gi|152973588|ref|YP_001338639.1| ..
553 ; sq_len: 459
554 ; sq_type: p
555 ; al_start: 191
556 ; al_stop: 248
557 ; al_display_start: 161
558 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK
559 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG
560 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT
561 SNKALKSQISALLSSIQNKAVADEKLTDQE
562 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
563 vs NC_009649.faa library
564
565 45119 residues in 180 sequences
566 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313
567 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25
568 Lambda= 0.179384
569
570 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
571 join: 36, opt: 24, open/ext: -10/-2, width: 16
572 Scan time: 0.000
573 The best scores are: opt bits E(180)
574 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29
575
576 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
577 ; pg_name: /opt/fasta/fasta34
578 ; pg_ver: 34.26
579 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
580 ; pg_name: FASTA
581 ; pg_ver: 3.5 Sept 2006
582 ; pg_matrix: BL50 (15:-5)
583 ; pg_open-ext: -10 -2
584 ; pg_ktup: 2
585 ; pg_optcut: 24
586 ; pg_cgap: 36
587 ; mp_extrap: 60000 180
588 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384
589 ; mp_KS: -0.0000 (N=0) at 8159228
590 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
591 ; fa_frame: f
592 ; fa_initn: 50
593 ; fa_init1: 50
594 ; fa_opt: 58
595 ; fa_z-score: 95.8
596 ; fa_bits: 22.9
597 ; fa_expect: 0.29
598 ; sw_score: 58
599 ; sw_ident: 0.289
600 ; sw_sim: 0.632
601 ; sw_overlap: 38
602 >gi|10955264| ..
603 ; sq_len: 126
604 ; sq_offset: 1
605 ; sq_type: p
606 ; al_start: 1
607 ; al_stop: 38
608 ; al_display_start: 1
609 ------------------------------MKKDKKYQIEAIKNKDKTLF
610 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF
611 NGEKFSSYTLNKVTKTDEYN
612 >gi|152973462|ref|YP_001338513.1| ..
613 ; sq_len: 101
614 ; sq_type: p
615 ; al_start: 44
616 ; al_stop: 81
617 ; al_display_start: 14
618 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI
619 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA
620 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa
621 vs NC_009649.faa library
622
623 45119 residues in 180 sequences
624 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461
625 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25
626 Lambda= 0.210386
627
628 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
629 join: 37, opt: 25, open/ext: -10/-2, width: 16
630 Scan time: 0.020
631 The best scores are: opt bits E(180)
632 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082
633
634 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library
635 ; pg_name: /opt/fasta/fasta34
636 ; pg_ver: 34.26
637 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
638 ; pg_name: FASTA
639 ; pg_ver: 3.5 Sept 2006
640 ; pg_matrix: BL50 (15:-5)
641 ; pg_open-ext: -10 -2
642 ; pg_ktup: 2
643 ; pg_optcut: 25
644 ; pg_cgap: 37
645 ; mp_extrap: 60000 180
646 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386
647 ; mp_KS: -0.0000 (N=0) at 8159228
648 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
649 ; fa_frame: f
650 ; fa_initn: 52
651 ; fa_init1: 52
652 ; fa_opt: 70
653 ; fa_z-score: 105.5
654 ; fa_bits: 27.5
655 ; fa_expect: 0.082
656 ; sw_score: 70
657 ; sw_ident: 0.279
658 ; sw_sim: 0.651
659 ; sw_overlap: 43
660 >gi|10955265| ..
661 ; sq_len: 346
662 ; sq_offset: 1
663 ; sq_type: p
664 ; al_start: 197
665 ; al_stop: 238
666 ; al_display_start: 167
667 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
668 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
669 GEYFTENKPKYIIREIHQET
670 >gi|152973545|ref|YP_001338596.1| ..
671 ; sq_len: 242
672 ; sq_type: p
673 ; al_start: 52
674 ; al_stop: 94
675 ; al_display_start: 22
676 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
677 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
678 QDFAFTRKMRREARQVEQSW
679 >>><<<
680
681
682 579 residues in 3 query sequences
683 45119 residues in 180 library sequences
684 Scomplib [34.26]
685 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008
686 Total Scan time: 0.020 Total Display time: 0.010
687
688 Function used was FASTA [version 34.26 January 12, 2007]
689
690 """
691
692
693 from StringIO import StringIO
694
695 alignments = list(FastaM10Iterator(StringIO(simple_example)))
696 assert len(alignments) == 4, len(alignments)
697 assert len(alignments[0].get_all_seqs()) == 2
698 for a in alignments :
699 print "Alignment %i sequences of length %i" \
700 % (len(a.get_all_seqs()), a.get_alignment_length())
701 for r in a :
702 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"])
703
704 print "Done"
705
706 import os
707 path = "../../Tests/Fasta/"
708 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"]
709 files.sort()
710 for filename in files :
711 if os.path.splitext(filename)[-1] == ".m10" :
712 print
713 print filename
714 print "="*len(filename)
715 for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))):
716 print "#%i, %s" % (i+1,a)
717 for r in a :
718 if "-" in r.seq :
719 assert r.seq.alphabet.gap_char == "-"
720 else :
721 assert not hasattr(r.seq.alphabet, "gap_char")
722