1
2
3
4
5
6
7
8 """
9 This module provides code to work with the standalone version of
10 BLAST, either blastall or blastpgp, provided by the NCBI.
11 http://www.ncbi.nlm.nih.gov/BLAST/
12
13 Classes:
14 LowQualityBlastError Except that indicates low quality query sequences.
15 BlastParser Parses output from blast.
16 BlastErrorParser Parses output and tries to diagnose possible errors.
17 PSIBlastParser Parses output from psi-blast.
18 Iterator Iterates over a file of blast results.
19
20 _Scanner Scans output from standalone BLAST.
21 _BlastConsumer Consumes output from blast.
22 _PSIBlastConsumer Consumes output from psi-blast.
23 _HeaderConsumer Consumes header information.
24 _DescriptionConsumer Consumes description information.
25 _AlignmentConsumer Consumes alignment information.
26 _HSPConsumer Consumes hsp information.
27 _DatabaseReportConsumer Consumes database report information.
28 _ParametersConsumer Consumes parameters information.
29
30 Functions:
31 blastall Execute blastall.
32 blastpgp Execute blastpgp.
33 rpsblast Execute rpsblast.
34
35 """
36
37 import os
38 import re
39
40 from Bio import File
41 from Bio.ParserSupport import *
42 from Bio.Blast import Record
43
44
46 """Error caused by running a low quality sequence through BLAST.
47
48 When low quality sequences (like GenBank entries containing only
49 stretches of a single nucleotide) are BLASTed, they will result in
50 BLAST generating an error and not being able to perform the BLAST.
51 search. This error should be raised for the BLAST reports produced
52 in this case.
53 """
54 pass
55
57 """Error caused by running a short query sequence through BLAST.
58
59 If the query sequence is too short, BLAST outputs warnings and errors:
60 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed.
61 [blastall] ERROR: [000.000] AT1G08320: Blast:
62 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize
63 done
64
65 This exception is raised when that condition is detected.
66
67 """
68 pass
69
70
72 """Scan BLAST output from blastall or blastpgp.
73
74 Tested with blastall and blastpgp v2.0.10, v2.0.11
75
76 Methods:
77 feed Feed data into the scanner.
78
79 """
80 - def feed(self, handle, consumer):
100
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145 consumer.start_header()
146
147 read_and_call(uhandle, consumer.version, contains='BLAST')
148 read_and_call_while(uhandle, consumer.noevent, blank=1)
149
150
151 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
152
153
154 while attempt_read_and_call(uhandle,
155 consumer.reference, start='Reference') :
156
157
158 while 1:
159 line = uhandle.readline()
160 if is_blank_line(line) :
161 consumer.noevent(line)
162 break
163 elif line.startswith("RID"):
164 break
165 else :
166
167 consumer.reference(line)
168
169
170 read_and_call_while(uhandle, consumer.noevent, blank=1)
171 attempt_read_and_call(uhandle, consumer.reference, start="RID:")
172 read_and_call_while(uhandle, consumer.noevent, blank=1)
173
174
175
176 if attempt_read_and_call(
177 uhandle, consumer.reference, start="Reference"):
178 read_and_call_until(uhandle, consumer.reference, blank=1)
179 read_and_call_while(uhandle, consumer.noevent, blank=1)
180
181
182 if attempt_read_and_call(
183 uhandle, consumer.reference, start="Reference"):
184 read_and_call_until(uhandle, consumer.reference, blank=1)
185 read_and_call_while(uhandle, consumer.noevent, blank=1)
186
187 line = uhandle.peekline()
188 assert line.strip() != ""
189 assert not line.startswith("RID:")
190 if line.startswith("Query=") :
191
192
193
194 read_and_call(uhandle, consumer.query_info, start='Query=')
195 read_and_call_until(uhandle, consumer.query_info, blank=1)
196 read_and_call_while(uhandle, consumer.noevent, blank=1)
197
198
199 read_and_call_until(uhandle, consumer.database_info, end='total letters')
200 read_and_call(uhandle, consumer.database_info, contains='sequences')
201 read_and_call_while(uhandle, consumer.noevent, blank=1)
202 elif line.startswith("Database:") :
203
204 read_and_call_until(uhandle, consumer.database_info, end='total letters')
205 read_and_call(uhandle, consumer.database_info, contains='sequences')
206 read_and_call_while(uhandle, consumer.noevent, blank=1)
207
208
209 read_and_call(uhandle, consumer.query_info, start='Query=')
210 read_and_call_until(uhandle, consumer.query_info, blank=1)
211 read_and_call_while(uhandle, consumer.noevent, blank=1)
212 else :
213 raise ValueError("Invalid header?")
214
215 consumer.end_header()
216
235
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259 consumer.start_descriptions()
260
261
262
263 attempt_read_and_call(uhandle, consumer.noevent, start='Searching')
264
265
266
267 if not uhandle.peekline():
268 raise ValueError("Unexpected end of blast report. " + \
269 "Looks suspiciously like a PSI-BLAST crash.")
270
271
272
273
274
275
276
277
278
279 line = uhandle.peekline()
280 if line.find("ERROR:") != -1 or line.startswith("done"):
281 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
282 read_and_call(uhandle, consumer.noevent, start="done")
283
284
285
286
287
288
289
290
291
292
293
294
295
296 read_and_call_while(uhandle, consumer.noevent, blank=1)
297
298 if attempt_read_and_call(uhandle, consumer.round, start='Results'):
299 read_and_call_while(uhandle, consumer.noevent, blank=1)
300
301
302
303
304
305
306
307
308 if not attempt_read_and_call(
309 uhandle, consumer.description_header,
310 has_re=re.compile(r'Score +E')):
311
312 attempt_read_and_call(uhandle, consumer.no_hits,
313 contains='No hits found')
314 read_and_call_while(uhandle, consumer.noevent, blank=1)
315
316 consumer.end_descriptions()
317
318 return
319
320
321 read_and_call(uhandle, consumer.description_header,
322 start='Sequences producing')
323
324
325 attempt_read_and_call(uhandle, consumer.model_sequences,
326 start='Sequences used in model')
327 read_and_call_while(uhandle, consumer.noevent, blank=1)
328
329
330
331 if not uhandle.peekline().startswith('Sequences not found'):
332 read_and_call_until(uhandle, consumer.description, blank=1)
333 read_and_call_while(uhandle, consumer.noevent, blank=1)
334
335
336
337
338
339 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences,
340 start='Sequences not found'):
341
342 read_and_call_while(uhandle, consumer.noevent, blank=1)
343 l = safe_peekline(uhandle)
344
345
346
347
348 if not l.startswith('CONVERGED') and l[0] != '>' \
349 and not l.startswith('QUERY'):
350 read_and_call_until(uhandle, consumer.description, blank=1)
351 read_and_call_while(uhandle, consumer.noevent, blank=1)
352
353 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED')
354 read_and_call_while(uhandle, consumer.noevent, blank=1)
355
356 consumer.end_descriptions()
357
372
379
392
421
427
429
430
431
432
433
434
435 read_and_call(uhandle, consumer.score, start=' Score')
436 read_and_call(uhandle, consumer.identities, start=' Identities')
437
438 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
439
440 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
441 read_and_call(uhandle, consumer.noevent, blank=1)
442
444
445
446
447
448
449
450
451
452
453 while 1:
454
455 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
456 read_and_call(uhandle, consumer.query, start='Query')
457 read_and_call(uhandle, consumer.align, start=' ')
458 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
459 read_and_call_while(uhandle, consumer.noevent, blank=1)
460 line = safe_peekline(uhandle)
461
462 if not (line.startswith('Query') or line.startswith(' ')):
463 break
464
487
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516 consumer.start_database_report()
517
518
519
520
521 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"):
522 read_and_call(uhandle, consumer.noevent, contains="letters")
523 read_and_call(uhandle, consumer.noevent, contains="sequences")
524 read_and_call(uhandle, consumer.noevent, start=" ")
525
526
527
528 while attempt_read_and_call(uhandle, consumer.database,
529 start=' Database'):
530
531
532
533 if not uhandle.peekline():
534 consumer.end_database_report()
535 return
536
537
538 read_and_call_until(uhandle, consumer.database, start=' Posted')
539 read_and_call(uhandle, consumer.posted_date, start=' Posted')
540 read_and_call(uhandle, consumer.num_letters_in_database,
541 start=' Number of letters')
542 read_and_call(uhandle, consumer.num_sequences_in_database,
543 start=' Number of sequences')
544
545 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
546
547 line = safe_readline(uhandle)
548 uhandle.saveline(line)
549 if line.find('Lambda') != -1:
550 break
551
552 read_and_call(uhandle, consumer.noevent, start='Lambda')
553 read_and_call(uhandle, consumer.ka_params)
554
555
556 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
557
558
559 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
560
561 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
562 read_and_call(uhandle, consumer.ka_params_gap)
563
564
565
566
567 try:
568 read_and_call_while(uhandle, consumer.noevent, blank=1)
569 except ValueError, x:
570 if str(x) != "Unexpected end of stream.":
571 raise
572 consumer.end_database_report()
573
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631 if not uhandle.peekline():
632 return
633
634
635
636 consumer.start_parameters()
637
638
639 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
640
641 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
642
643 attempt_read_and_call(uhandle, consumer.num_sequences,
644 start='Number of Sequences')
645 read_and_call(uhandle, consumer.num_hits,
646 start='Number of Hits')
647 attempt_read_and_call(uhandle, consumer.num_sequences,
648 start='Number of Sequences')
649 read_and_call(uhandle, consumer.num_extends,
650 start='Number of extensions')
651 read_and_call(uhandle, consumer.num_good_extends,
652 start='Number of successful')
653
654 read_and_call(uhandle, consumer.num_seqs_better_e,
655 start='Number of sequences')
656
657
658 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
659 start="Number of HSP's better"):
660
661 if attempt_read_and_call(uhandle, consumer.noevent,
662 start="Number of HSP's gapped:"):
663 read_and_call(uhandle, consumer.noevent,
664 start="Number of HSP's successfully")
665
666 attempt_read_and_call(uhandle, consumer.noevent,
667 start="Number of extra gapped extensions")
668 else:
669 read_and_call(uhandle, consumer.hsps_prelim_gapped,
670 start="Number of HSP's successfully")
671 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
672 start="Number of HSP's that")
673 read_and_call(uhandle, consumer.hsps_gapped,
674 start="Number of HSP's gapped")
675
676 elif attempt_read_and_call(uhandle, consumer.noevent,
677 start="Number of HSP's gapped"):
678 read_and_call(uhandle, consumer.noevent,
679 start="Number of HSP's successfully")
680
681
682 attempt_read_and_call(uhandle, consumer.query_length,
683 has_re=re.compile(r"[Ll]ength of query"))
684 read_and_call(uhandle, consumer.database_length,
685 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"))
686
687
688 attempt_read_and_call(uhandle, consumer.noevent,
689 start="Length adjustment")
690 attempt_read_and_call(uhandle, consumer.effective_hsp_length,
691 start='effective HSP')
692
693 attempt_read_and_call(
694 uhandle, consumer.effective_query_length,
695 has_re=re.compile(r'[Ee]ffective length of query'))
696
697
698 attempt_read_and_call(
699 uhandle, consumer.effective_database_length,
700 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase'))
701
702
703 attempt_read_and_call(
704 uhandle, consumer.effective_search_space,
705 has_re=re.compile(r'[Ee]ffective search space:'))
706
707 attempt_read_and_call(
708 uhandle, consumer.effective_search_space_used,
709 has_re=re.compile(r'[Ee]ffective search space used'))
710
711
712 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
713
714
715 attempt_read_and_call(uhandle, consumer.threshold, start='T')
716
717 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold')
718
719
720 attempt_read_and_call(uhandle, consumer.window_size, start='A')
721
722 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits')
723
724 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
725 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
726
727
728 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
729 start='X3')
730
731 read_and_call(uhandle, consumer.gap_trigger, start='S1')
732
733
734
735 if not is_blank_line(uhandle.peekline(), allow_spaces=1):
736 read_and_call(uhandle, consumer.blast_cutoff, start='S2')
737
738 consumer.end_parameters()
739
741 """Parses BLAST data into a Record.Blast object.
742
743 """
748
749 - def parse(self, handle):
750 """parse(self, handle)"""
751 self._scanner.feed(handle, self._consumer)
752 return self._consumer.data
753
755 """Parses BLAST data into a Record.PSIBlast object.
756
757 """
762
763 - def parse(self, handle):
764 """parse(self, handle)"""
765 self._scanner.feed(handle, self._consumer)
766 return self._consumer.data
767
771
773 c = line.split()
774 self._header.application = c[0]
775 self._header.version = c[1]
776 self._header.date = c[2][1:-1]
777
783
785 if line.startswith('Query= '):
786 self._header.query = line[7:]
787 elif not line.startswith(' '):
788 self._header.query = "%s%s" % (self._header.query, line)
789 else:
790 letters, = _re_search(
791 r"([0-9,]+) letters", line,
792 "I could not find the number of letters in line\n%s" % line)
793 self._header.query_letters = _safe_int(letters)
794
807
812
815 self._descriptions = []
816 self._model_sequences = []
817 self._nonmodel_sequences = []
818 self._converged = 0
819 self._type = None
820 self._roundnum = None
821
822 self.__has_n = 0
823
825 if line.startswith('Sequences producing'):
826 cols = line.split()
827 if cols[-1] == 'N':
828 self.__has_n = 1
829
831 dh = self._parse(line)
832 if self._type == 'model':
833 self._model_sequences.append(dh)
834 elif self._type == 'nonmodel':
835 self._nonmodel_sequences.append(dh)
836 else:
837 self._descriptions.append(dh)
838
841
843 self._type = 'nonmodel'
844
847
850
852 if not line.startswith('Results from round'):
853 raise ValueError("I didn't understand the round line\n%s" % line)
854 self._roundnum = _safe_int(line[18:].strip())
855
858
859 - def _parse(self, description_line):
860 line = description_line
861 dh = Record.Description()
862
863
864
865
866
867
868
869
870 cols = line.split()
871 if len(cols) < 3:
872 raise ValueError( \
873 "Line does not appear to contain description:\n%s" % line)
874 if self.__has_n:
875 i = line.rfind(cols[-1])
876 i = line.rfind(cols[-2], 0, i)
877 i = line.rfind(cols[-3], 0, i)
878 else:
879 i = line.rfind(cols[-1])
880 i = line.rfind(cols[-2], 0, i)
881 if self.__has_n:
882 dh.title, dh.score, dh.e, dh.num_alignments = \
883 line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
884 else:
885 dh.title, dh.score, dh.e, dh.num_alignments = \
886 line[:i].rstrip(), cols[-2], cols[-1], 1
887 dh.num_alignments = _safe_int(dh.num_alignments)
888 dh.score = _safe_int(dh.score)
889 dh.e = _safe_float(dh.e)
890 return dh
891
893
894
895
896
900
904
906
907 parts = line.replace(" ","").split("=")
908 assert len(parts)==2, "Unrecognised format length line"
909 self._alignment.length = parts[1]
910 self._alignment.length = _safe_int(self._alignment.length)
911
913
914 if line.startswith('QUERY') or line.startswith('blast_tmp'):
915
916
917
918
919
920 try:
921 name, start, seq, end = line.split()
922 except ValueError:
923 raise ValueError("I do not understand the line\n%s" % line)
924 self._start_index = line.index(start, len(name))
925 self._seq_index = line.index(seq,
926 self._start_index+len(start))
927
928 self._name_length = self._start_index - 1
929 self._start_length = self._seq_index - self._start_index - 1
930 self._seq_length = line.rfind(end) - self._seq_index - 1
931
932
933
934
935
936
937
938
939
940 name = line[:self._name_length]
941 name = name.rstrip()
942 start = line[self._start_index:self._start_index+self._start_length]
943 start = start.rstrip()
944 if start:
945 start = _safe_int(start)
946 end = line[self._seq_index+self._seq_length:].rstrip()
947 if end:
948 end = _safe_int(end)
949 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
950
951 if len(seq) < self._seq_length:
952 seq = seq + ' '*(self._seq_length-len(seq))
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971 align = self._multiple_alignment.alignment
972 align.append((name, start, seq, end))
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1022
1023 if self._alignment:
1024 self._alignment.title = self._alignment.title.rstrip()
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046 try:
1047 del self._seq_index
1048 del self._seq_length
1049 del self._start_index
1050 del self._start_length
1051 del self._name_length
1052 except AttributeError:
1053 pass
1054
1058
1060 self._hsp.bits, self._hsp.score = _re_search(
1061 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line,
1062 "I could not find the score in line\n%s" % line)
1063 self._hsp.score = _safe_float(self._hsp.score)
1064 self._hsp.bits = _safe_float(self._hsp.bits)
1065
1066 x, y = _re_search(
1067 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line,
1068 "I could not find the expect in line\n%s" % line)
1069 if x:
1070 self._hsp.num_alignments = _safe_int(x)
1071 else:
1072 self._hsp.num_alignments = 1
1073 self._hsp.expect = _safe_float(y)
1074
1076 x, y = _re_search(
1077 r"Identities = (\d+)\/(\d+)", line,
1078 "I could not find the identities in line\n%s" % line)
1079 self._hsp.identities = _safe_int(x), _safe_int(y)
1080 self._hsp.align_length = _safe_int(y)
1081
1082 if line.find('Positives') != -1:
1083 x, y = _re_search(
1084 r"Positives = (\d+)\/(\d+)", line,
1085 "I could not find the positives in line\n%s" % line)
1086 self._hsp.positives = _safe_int(x), _safe_int(y)
1087 assert self._hsp.align_length == _safe_int(y)
1088
1089 if line.find('Gaps') != -1:
1090 x, y = _re_search(
1091 r"Gaps = (\d+)\/(\d+)", line,
1092 "I could not find the gaps in line\n%s" % line)
1093 self._hsp.gaps = _safe_int(x), _safe_int(y)
1094 assert self._hsp.align_length == _safe_int(y)
1095
1096
1098 self._hsp.strand = _re_search(
1099 r"Strand = (\w+) / (\w+)", line,
1100 "I could not find the strand in line\n%s" % line)
1101
1103
1104
1105
1106 if line.find('/') != -1:
1107 self._hsp.frame = _re_search(
1108 r"Frame = ([-+][123]) / ([-+][123])", line,
1109 "I could not find the frame in line\n%s" % line)
1110 else:
1111 self._hsp.frame = _re_search(
1112 r"Frame = ([-+][123])", line,
1113 "I could not find the frame in line\n%s" % line)
1114
1115
1116
1117
1118
1119
1120 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1122 m = self._query_re.search(line)
1123 if m is None:
1124 raise ValueError("I could not find the query in line\n%s" % line)
1125
1126
1127
1128 colon, start, seq, end = m.groups()
1129 self._hsp.query = self._hsp.query + seq
1130 if self._hsp.query_start is None:
1131 self._hsp.query_start = _safe_int(start)
1132
1133
1134
1135 self._hsp.query_end = _safe_int(end)
1136
1137
1138 self._query_start_index = m.start(3)
1139 self._query_len = len(seq)
1140
1142 seq = line[self._query_start_index:].rstrip()
1143 if len(seq) < self._query_len:
1144
1145 seq = seq + ' ' * (self._query_len-len(seq))
1146 elif len(seq) < self._query_len:
1147 raise ValueError("Match is longer than the query in line\n%s" \
1148 % line)
1149 self._hsp.match = self._hsp.match + seq
1150
1151
1152
1153 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1155 m = self._sbjct_re.search(line)
1156 if m is None:
1157 raise ValueError("I could not find the sbjct in line\n%s" % line)
1158 colon, start, seq, end = m.groups()
1159
1160
1161
1162
1163 if not seq.strip():
1164 seq = ' ' * self._query_len
1165 self._hsp.sbjct = self._hsp.sbjct + seq
1166 if self._hsp.sbjct_start is None:
1167 self._hsp.sbjct_start = _safe_int(start)
1168
1169 self._hsp.sbjct_end = _safe_int(end)
1170 if len(seq) != self._query_len:
1171 raise ValueError( \
1172 "QUERY and SBJCT sequence lengths don't match in line\n%s" \
1173 % line)
1174
1175 del self._query_start_index
1176 del self._query_len
1177
1180
1182
1185
1194
1195 - def posted_date(self, line):
1196 self._dr.posted_date.append(_re_search(
1197 r"Posted date:\s*(.+)$", line,
1198 "I could not find the posted date in line\n%s" % line))
1199
1204
1209
1213
1216
1220
1223
1227
1230
1235
1237 if line.find('1st pass') != -1:
1238 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
1239 self._params.num_hits = _safe_int(x)
1240 else:
1241 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"})
1242 self._params.num_hits = _safe_int(x)
1243
1245 if line.find('1st pass') != -1:
1246 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
1247 self._params.num_sequences = _safe_int(x)
1248 else:
1249 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"})
1250 self._params.num_sequences = _safe_int(x)
1251
1253 if line.find('1st pass') != -1:
1254 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"})
1255 self._params.num_extends = _safe_int(x)
1256 else:
1257 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"})
1258 self._params.num_extends = _safe_int(x)
1259
1261 if line.find('1st pass') != -1:
1262 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"})
1263 self._params.num_good_extends = _safe_int(x)
1264 else:
1265 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"})
1266 self._params.num_good_extends = _safe_int(x)
1267
1273
1278
1284
1290
1295
1300
1305
1311
1317
1323
1329
1335
1339
1341 if line[:2] == "T:" :
1342
1343 self._params.threshold, = _get_cols(
1344 line, (1,), ncols=2, expected={0:"T:"})
1345 elif line[:28] == "Neighboring words threshold:" :
1346 self._params.threshold, = _get_cols(
1347 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"})
1348 else :
1349 raise ValueError("Unrecognised threshold line:\n%s" % line)
1350 self._params.threshold = _safe_int(self._params.threshold)
1351
1353 if line[:2] == "A:" :
1354 self._params.window_size, = _get_cols(
1355 line, (1,), ncols=2, expected={0:"A:"})
1356 elif line[:25] == "Window for multiple hits:" :
1357 self._params.window_size, = _get_cols(
1358 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"})
1359 else :
1360 raise ValueError("Unrecognised window size line:\n%s" % line)
1361 self._params.window_size = _safe_int(self._params.window_size)
1362
1368
1374
1380
1386
1392
1395
1396
1397 -class _BlastConsumer(AbstractConsumer,
1398 _HeaderConsumer,
1399 _DescriptionConsumer,
1400 _AlignmentConsumer,
1401 _HSPConsumer,
1402 _DatabaseReportConsumer,
1403 _ParametersConsumer
1404 ):
1405
1406
1407
1408
1409
1410
1411
1412
1413
1416
1418
1419 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1420
1424
1428
1430 self.data.descriptions = self._descriptions
1431
1433 _AlignmentConsumer.end_alignment(self)
1434 if self._alignment.hsps:
1435 self.data.alignments.append(self._alignment)
1436 if self._multiple_alignment.alignment:
1437 self.data.multiple_alignment = self._multiple_alignment
1438
1440 _HSPConsumer.end_hsp(self)
1441 try:
1442 self._alignment.hsps.append(self._hsp)
1443 except AttributeError:
1444 raise ValueError("Found an HSP before an alignment")
1445
1449
1453
1454 -class _PSIBlastConsumer(AbstractConsumer,
1455 _HeaderConsumer,
1456 _DescriptionConsumer,
1457 _AlignmentConsumer,
1458 _HSPConsumer,
1459 _DatabaseReportConsumer,
1460 _ParametersConsumer
1461 ):
1464
1468
1472
1477
1479 _DescriptionConsumer.end_descriptions(self)
1480 self._round.number = self._roundnum
1481 if self._descriptions:
1482 self._round.new_seqs.extend(self._descriptions)
1483 self._round.reused_seqs.extend(self._model_sequences)
1484 self._round.new_seqs.extend(self._nonmodel_sequences)
1485 if self._converged:
1486 self.data.converged = 1
1487
1489 _AlignmentConsumer.end_alignment(self)
1490 if self._alignment.hsps:
1491 self._round.alignments.append(self._alignment)
1492 if self._multiple_alignment:
1493 self._round.multiple_alignment = self._multiple_alignment
1494
1496 _HSPConsumer.end_hsp(self)
1497 try:
1498 self._alignment.hsps.append(self._hsp)
1499 except AttributeError:
1500 raise ValueError("Found an HSP before an alignment")
1501
1505
1509
1511 """Iterates over a file of multiple BLAST results.
1512
1513 Methods:
1514 next Return the next record from the stream, or None.
1515
1516 """
1517 - def __init__(self, handle, parser=None):
1518 """__init__(self, handle, parser=None)
1519
1520 Create a new iterator. handle is a file-like object. parser
1521 is an optional Parser object to change the results into another form.
1522 If set to None, then the raw contents of the file will be returned.
1523
1524 """
1525 try:
1526 handle.readline
1527 except AttributeError:
1528 raise ValueError(
1529 "I expected a file handle or file-like object, got %s"
1530 % type(handle))
1531 self._uhandle = File.UndoHandle(handle)
1532 self._parser = parser
1533
1535 """next(self) -> object
1536
1537 Return the next Blast record from the file. If no more records,
1538 return None.
1539
1540 """
1541 lines = []
1542 while 1:
1543 line = self._uhandle.readline()
1544 if not line:
1545 break
1546
1547 if lines and (line.startswith('BLAST')
1548 or line.startswith('BLAST', 1)
1549 or line.startswith('<?xml ')):
1550 self._uhandle.saveline(line)
1551 break
1552 lines.append(line)
1553
1554 if not lines:
1555 return None
1556
1557 data = ''.join(lines)
1558 if self._parser is not None:
1559 return self._parser.parse(File.StringHandle(data))
1560 return data
1561
1563 return iter(self.next, None)
1564
1565 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1566 """Execute and retrieve data from standalone BLASTPALL as handles.
1567
1568 Execute and retrieve data from blastall. blastcmd is the command
1569 used to launch the 'blastall' executable. program is the blast program
1570 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database
1571 to search against. infile is the path to the file containing
1572 the sequence to search with.
1573
1574 The return values are two handles, for standard output and standard error.
1575
1576 You may pass more parameters to **keywds to change the behavior of
1577 the search. Otherwise, optional values will be chosen by blastall.
1578 The Blast output is by default in XML format. Use the align_view keyword
1579 for output in a different format.
1580
1581 Scoring
1582 matrix Matrix to use.
1583 gap_open Gap open penalty.
1584 gap_extend Gap extension penalty.
1585 nuc_match Nucleotide match reward. (BLASTN)
1586 nuc_mismatch Nucleotide mismatch penalty. (BLASTN)
1587 query_genetic_code Genetic code for Query.
1588 db_genetic_code Genetic code for database. (TBLAST[NX])
1589
1590 Algorithm
1591 gapped Whether to do a gapped alignment. T/F (not for TBLASTX)
1592 expectation Expectation value cutoff.
1593 wordsize Word size.
1594 strands Query strands to search against database.([T]BLAST[NX])
1595 keep_hits Number of best hits from a region to keep.
1596 xdrop Dropoff value (bits) for gapped alignments.
1597 hit_extend Threshold for extending hits.
1598 region_length Length of region used to judge hits.
1599 db_length Effective database length.
1600 search_length Effective length of search space.
1601
1602 Processing
1603 filter Filter query sequence for low complexity (with SEG)? T/F
1604 believe_query Believe the query defline. T/F
1605 restrict_gi Restrict search to these GI's.
1606 nprocessors Number of processors to use.
1607 oldengine Force use of old engine T/F
1608
1609 Formatting
1610 html Produce HTML output? T/F
1611 descriptions Number of one-line descriptions.
1612 alignments Number of alignments.
1613 align_view Alignment view. Integer 0-11,
1614 passed as a string or integer.
1615 show_gi Show GI's in deflines? T/F
1616 seqalign_file seqalign file to output.
1617 outfile Output file for report. Filename to write to, if
1618 ommitted standard output is used (which you can access
1619 from the returned handles).
1620 """
1621
1622 _security_check_parameters(keywds)
1623
1624 att2param = {
1625 'matrix' : '-M',
1626 'gap_open' : '-G',
1627 'gap_extend' : '-E',
1628 'nuc_match' : '-r',
1629 'nuc_mismatch' : '-q',
1630 'query_genetic_code' : '-Q',
1631 'db_genetic_code' : '-D',
1632
1633 'gapped' : '-g',
1634 'expectation' : '-e',
1635 'wordsize' : '-W',
1636 'strands' : '-S',
1637 'keep_hits' : '-K',
1638 'xdrop' : '-X',
1639 'hit_extend' : '-f',
1640 'region_length' : '-L',
1641 'db_length' : '-z',
1642 'search_length' : '-Y',
1643
1644 'program' : '-p',
1645 'database' : '-d',
1646 'infile' : '-i',
1647 'filter' : '-F',
1648 'believe_query' : '-J',
1649 'restrict_gi' : '-l',
1650 'nprocessors' : '-a',
1651 'oldengine' : '-V',
1652
1653 'html' : '-T',
1654 'descriptions' : '-v',
1655 'alignments' : '-b',
1656 'align_view' : '-m',
1657 'show_gi' : '-I',
1658 'seqalign_file' : '-O',
1659 'outfile' : '-o',
1660 }
1661
1662 params = []
1663 params.extend([att2param['program'], program])
1664 params.extend([att2param['database'], database])
1665 params.extend([att2param['infile'], _escape_filename(infile)])
1666 params.extend([att2param['align_view'], str(align_view)])
1667
1668 for attr in keywds.keys():
1669 params.extend([att2param[attr], str(keywds[attr])])
1670
1671 return _invoke_blast(blastcmd, params)
1672
1673 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1674 """Execute and retrieve data from standalone BLASTPGP as handles.
1675
1676 Execute and retrieve data from blastpgp. blastcmd is the command
1677 used to launch the 'blastpgp' executable. database is the path to the
1678 database to search against. infile is the path to the file containing
1679 the sequence to search with.
1680
1681 The return values are two handles, for standard output and standard error.
1682
1683 You may pass more parameters to **keywds to change the behavior of
1684 the search. Otherwise, optional values will be chosen by blastpgp.
1685 The Blast output is by default in XML format. Use the align_view keyword
1686 for output in a different format.
1687
1688 Scoring
1689 matrix Matrix to use.
1690 gap_open Gap open penalty.
1691 gap_extend Gap extension penalty.
1692 window_size Multiple hits window size.
1693 npasses Number of passes.
1694 passes Hits/passes. Integer 0-2.
1695
1696 Algorithm
1697 gapped Whether to do a gapped alignment. T/F
1698 expectation Expectation value cutoff.
1699 wordsize Word size.
1700 keep_hits Number of beset hits from a region to keep.
1701 xdrop Dropoff value (bits) for gapped alignments.
1702 hit_extend Threshold for extending hits.
1703 region_length Length of region used to judge hits.
1704 db_length Effective database length.
1705 search_length Effective length of search space.
1706 nbits_gapping Number of bits to trigger gapping.
1707 pseudocounts Pseudocounts constants for multiple passes.
1708 xdrop_final X dropoff for final gapped alignment.
1709 xdrop_extension Dropoff for blast extensions.
1710 model_threshold E-value threshold to include in multipass model.
1711 required_start Start of required region in query.
1712 required_end End of required region in query.
1713
1714 Processing
1715 XXX should document default values
1716 program The blast program to use. (PHI-BLAST)
1717 filter Filter query sequence for low complexity (with SEG)? T/F
1718 believe_query Believe the query defline? T/F
1719 nprocessors Number of processors to use.
1720
1721 Formatting
1722 html Produce HTML output? T/F
1723 descriptions Number of one-line descriptions.
1724 alignments Number of alignments.
1725 align_view Alignment view. Integer 0-11,
1726 passed as a string or integer.
1727 show_gi Show GI's in deflines? T/F
1728 seqalign_file seqalign file to output.
1729 align_outfile Output file for alignment.
1730 checkpoint_outfile Output file for PSI-BLAST checkpointing.
1731 restart_infile Input file for PSI-BLAST restart.
1732 hit_infile Hit file for PHI-BLAST.
1733 matrix_outfile Output file for PSI-BLAST matrix in ASCII.
1734 align_outfile Output file for alignment. Filename to write to, if
1735 ommitted standard output is used (which you can access
1736 from the returned handles).
1737
1738 align_infile Input alignment file for PSI-BLAST restart.
1739
1740 """
1741
1742 _security_check_parameters(keywds)
1743
1744 att2param = {
1745 'matrix' : '-M',
1746 'gap_open' : '-G',
1747 'gap_extend' : '-E',
1748 'window_size' : '-A',
1749 'npasses' : '-j',
1750 'passes' : '-P',
1751
1752 'gapped' : '-g',
1753 'expectation' : '-e',
1754 'wordsize' : '-W',
1755 'keep_hits' : '-K',
1756 'xdrop' : '-X',
1757 'hit_extend' : '-f',
1758 'region_length' : '-L',
1759 'db_length' : '-Z',
1760 'search_length' : '-Y',
1761 'nbits_gapping' : '-N',
1762 'pseudocounts' : '-c',
1763 'xdrop_final' : '-Z',
1764 'xdrop_extension' : '-y',
1765 'model_threshold' : '-h',
1766 'required_start' : '-S',
1767 'required_end' : '-H',
1768
1769 'program' : '-p',
1770 'database' : '-d',
1771 'infile' : '-i',
1772 'filter' : '-F',
1773 'believe_query' : '-J',
1774 'nprocessors' : '-a',
1775
1776 'html' : '-T',
1777 'descriptions' : '-v',
1778 'alignments' : '-b',
1779 'align_view' : '-m',
1780 'show_gi' : '-I',
1781 'seqalign_file' : '-O',
1782 'align_outfile' : '-o',
1783 'checkpoint_outfile' : '-C',
1784 'restart_infile' : '-R',
1785 'hit_infile' : '-k',
1786 'matrix_outfile' : '-Q',
1787 'align_infile' : '-B',
1788 }
1789
1790 params = []
1791 params.extend([att2param['database'], database])
1792 params.extend([att2param['infile'], _escape_filename(infile)])
1793 params.extend([att2param['align_view'], str(align_view)])
1794
1795 for attr in keywds.keys():
1796 params.extend([att2param[attr], str(keywds[attr])])
1797
1798 return _invoke_blast(blastcmd, params)
1799
1800 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1801 """Execute and retrieve data from standalone RPS-BLAST as handles.
1802
1803 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the
1804 command used to launch the 'rpsblast' executable. database is the path
1805 to the database to search against. infile is the path to the file
1806 containing the sequence to search with.
1807
1808 The return values are two handles, for standard output and standard error.
1809
1810 You may pass more parameters to **keywds to change the behavior of
1811 the search. Otherwise, optional values will be chosen by rpsblast.
1812
1813 Please note that this function will give XML output by default, by
1814 setting align_view to seven (i.e. command line option -m 7).
1815 You should use the NCBIXML.parse() function to read the resulting output.
1816 This is because NCBIStandalone.BlastParser() does not understand the
1817 plain text output format from rpsblast.
1818
1819 WARNING - The following text and associated parameter handling has not
1820 received extensive testing. Please report any errors we might have made...
1821
1822 Algorithm/Scoring
1823 gapped Whether to do a gapped alignment. T/F
1824 multihit 0 for multiple hit (default), 1 for single hit
1825 expectation Expectation value cutoff.
1826 range_restriction Range restriction on query sequence (Format: start,stop) blastp only
1827 0 in 'start' refers to the beginning of the sequence
1828 0 in 'stop' refers to the end of the sequence
1829 Default = 0,0
1830 xdrop Dropoff value (bits) for gapped alignments.
1831 xdrop_final X dropoff for final gapped alignment (in bits).
1832 xdrop_extension Dropoff for blast extensions (in bits).
1833 search_length Effective length of search space.
1834 nbits_gapping Number of bits to trigger gapping.
1835 protein Query sequence is protein. T/F
1836 db_length Effective database length.
1837
1838 Processing
1839 filter Filter query sequence for low complexity? T/F
1840 case_filter Use lower case filtering of FASTA sequence T/F, default F
1841 believe_query Believe the query defline. T/F
1842 nprocessors Number of processors to use.
1843 logfile Name of log file to use, default rpsblast.log
1844
1845 Formatting
1846 html Produce HTML output? T/F
1847 descriptions Number of one-line descriptions.
1848 alignments Number of alignments.
1849 align_view Alignment view. Integer 0-11,
1850 passed as a string or integer.
1851 show_gi Show GI's in deflines? T/F
1852 seqalign_file seqalign file to output.
1853 align_outfile Output file for alignment. Filename to write to, if
1854 ommitted standard output is used (which you can access
1855 from the returned handles).
1856 """
1857
1858 _security_check_parameters(keywds)
1859
1860 att2param = {
1861 'multihit' : '-P',
1862 'gapped' : '-g',
1863 'expectation' : '-e',
1864 'range_restriction' : '-L',
1865 'xdrop' : '-X',
1866 'xdrop_final' : '-Z',
1867 'xdrop_extension' : '-y',
1868 'search_length' : '-Y',
1869 'nbits_gapping' : '-N',
1870 'protein' : '-p',
1871 'db_length' : '-z',
1872
1873 'database' : '-d',
1874 'infile' : '-i',
1875 'filter' : '-F',
1876 'case_filter' : '-U',
1877 'believe_query' : '-J',
1878 'nprocessors' : '-a',
1879 'logfile' : '-l',
1880
1881 'html' : '-T',
1882 'descriptions' : '-v',
1883 'alignments' : '-b',
1884 'align_view' : '-m',
1885 'show_gi' : '-I',
1886 'seqalign_file' : '-O',
1887 'align_outfile' : '-o',
1888 }
1889
1890 params = []
1891
1892 params.extend([att2param['database'], database])
1893 params.extend([att2param['infile'], _escape_filename(infile)])
1894 params.extend([att2param['align_view'], str(align_view)])
1895
1896 for attr in keywds.keys():
1897 params.extend([att2param[attr], str(keywds[attr])])
1898
1899 return _invoke_blast(blastcmd, params)
1900
1902 m = re.search(regex, line)
1903 if not m:
1904 raise ValueError(error_msg)
1905 return m.groups()
1906
1907 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1908 cols = line.split()
1909
1910
1911 if ncols is not None and len(cols) != ncols:
1912 raise ValueError("I expected %d columns (got %d) in line\n%s" \
1913 % (ncols, len(cols), line))
1914
1915
1916 for k in expected.keys():
1917 if cols[k] != expected[k]:
1918 raise ValueError("I expected '%s' in column %d in line\n%s" \
1919 % (expected[k], k, line))
1920
1921
1922 results = []
1923 for c in cols_to_get:
1924 results.append(cols[c])
1925 return tuple(results)
1926
1928 try:
1929 return int(str)
1930 except ValueError:
1931
1932
1933 str = str.replace(',', '')
1934 try:
1935
1936 return int(str)
1937 except ValueError:
1938 pass
1939
1940
1941 return long(float(str))
1942
1944
1945
1946
1947
1948
1949 if str and str[0] in ['E', 'e']:
1950 str = '1' + str
1951 try:
1952 return float(str)
1953 except ValueError:
1954
1955 str = str.replace(',', '')
1956
1957 return float(str)
1958
1960 """Escape filenames with spaces (PRIVATE)."""
1961 if " " not in filename :
1962 return filename
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978 if filename.startswith('"') and filename.endswith('"') :
1979
1980 return filename
1981 else :
1982 return '"%s"' % filename
1983
1985 """Start BLAST and returns handles for stdout and stderr (PRIVATE).
1986
1987 Tries to deal with spaces in the BLAST executable path.
1988 """
1989 if not os.path.exists(blast_cmd):
1990 raise ValueError("BLAST executable does not exist at %s" % blast_cmd)
1991
1992 cmd_string = " ".join([_escape_filename(blast_cmd)] + params)
1993
1994
1995 try :
1996 import subprocess, sys
1997
1998
1999
2000
2001 blast_process = subprocess.Popen(cmd_string,
2002 stdin=subprocess.PIPE,
2003 stdout=subprocess.PIPE,
2004 stderr=subprocess.PIPE,
2005 shell=(sys.platform!="win32"))
2006 blast_process.stdin.close()
2007 return blast_process.stdout, blast_process.stderr
2008 except ImportError :
2009
2010
2011 write_handle, result_handle, error_handle \
2012 = os.popen3(cmd_string)
2013 write_handle.close()
2014 return result_handle, error_handle
2015
2017 """Look for any attempt to insert a command into a parameter.
2018
2019 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd')
2020
2021 Looks for ";" or "&&" in the strings (Unix and Windows syntax
2022 for appending a command line), or ">", "<" or "|" (redirection)
2023 and if any are found raises an exception.
2024 """
2025 for key, value in param_dict.iteritems() :
2026 str_value = str(value)
2027 for bad_str in [";", "&&", ">", "<", "|"] :
2028 if bad_str in str_value :
2029 raise ValueError("Rejecting suspicious argument for %s" % key)
2030
2041
2043 """Attempt to catch and diagnose BLAST errors while parsing.
2044
2045 This utilizes the BlastParser module but adds an additional layer
2046 of complexity on top of it by attempting to diagnose ValueErrors
2047 that may actually indicate problems during BLAST parsing.
2048
2049 Current BLAST problems this detects are:
2050 o LowQualityBlastError - When BLASTing really low quality sequences
2051 (ie. some GenBank entries which are just short streches of a single
2052 nucleotide), BLAST will report an error with the sequence and be
2053 unable to search with this. This will lead to a badly formatted
2054 BLAST report that the parsers choke on. The parser will convert the
2055 ValueError to a LowQualityBlastError and attempt to provide useful
2056 information.
2057
2058 """
2059 - def __init__(self, bad_report_handle = None):
2060 """Initialize a parser that tries to catch BlastErrors.
2061
2062 Arguments:
2063 o bad_report_handle - An optional argument specifying a handle
2064 where bad reports should be sent. This would allow you to save
2065 all of the bad reports to a file, for instance. If no handle
2066 is specified, the bad reports will not be saved.
2067 """
2068 self._bad_report_handle = bad_report_handle
2069
2070
2071 self._scanner = _Scanner()
2072 self._consumer = _BlastErrorConsumer()
2073
2074 - def parse(self, handle):
2075 """Parse a handle, attempting to diagnose errors.
2076 """
2077 results = handle.read()
2078
2079 try:
2080 self._scanner.feed(File.StringHandle(results), self._consumer)
2081 except ValueError, msg:
2082
2083 if self._bad_report_handle:
2084
2085 self._bad_report_handle.write(results)
2086
2087
2088 self._diagnose_error(
2089 File.StringHandle(results), self._consumer.data)
2090
2091
2092
2093 raise
2094 return self._consumer.data
2095
2097 """Attempt to diagnose an error in the passed handle.
2098
2099 Arguments:
2100 o handle - The handle potentially containing the error
2101 o data_record - The data record partially created by the consumer.
2102 """
2103 line = handle.readline()
2104
2105 while line:
2106
2107
2108
2109 if line.startswith('Searchingdone'):
2110 raise LowQualityBlastError("Blast failure occured on query: ",
2111 data_record.query)
2112 line = handle.readline()
2113