1
2
3
4
5
6
7
8
9 """
10 This module provides code to work with the WWW version of BLAST
11 provided by the NCBI.
12 http://www.ncbi.nlm.nih.gov/BLAST/
13
14 Classes:
15 BlastParser Parses output from WWW blast.
16 _Scanner Scans output from NCBI's BLAST WWW server.
17
18 Functions:
19 qblast Do a BLAST search using the QBLAST API.
20
21 """
22 import re
23
24 try:
25 import cStringIO as StringIO
26 except ImportError:
27 import StringIO
28
29 from Bio.ParserSupport import *
30
32 """Parses WWW BLAST data into a Record.Blast object.
33
34 """
40
42 """parse(self, handle)"""
43 self._scanner.feed(handle, self._consumer)
44 return self._consumer.data
45
47 """Scan BLAST output from NCBI's web server at:
48 http://www.ncbi.nlm.nih.gov/BLAST/
49
50 Tested with BLAST v2.0.10
51
52 Methods:
53 feed Feed data into the scanner.
54
55 """
56 - def feed(self, handle, consumer):
104
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139 consumer.start_header()
140
141
142 read_and_call(uhandle, consumer.version, contains='BLAST')
143 read_and_call_while(uhandle, consumer.noevent, blank=1)
144
145
146
147 while 1:
148 line = uhandle.readline()
149 if line[:3] == '<p>' or not line.strip():
150 consumer.noevent(line)
151 break
152 consumer.reference(line)
153
154
155 attempt_read_and_call(uhandle, consumer.noevent, start='RID')
156
157
158
159 attempt_read_and_call(uhandle, consumer.noevent)
160
161
162
163
164 if uhandle.peekline().find("Query=") >= 0:
165 self._scan_query_info(uhandle, consumer)
166 self._scan_database_info(uhandle, consumer)
167 else:
168 self._scan_database_info(uhandle, consumer)
169 self._scan_query_info(uhandle, consumer)
170 read_and_call_while(uhandle, consumer.noevent, blank=1)
171 consumer.end_header()
172
183
185 attempt_read_and_call(uhandle, consumer.noevent, start='<p>')
186 read_and_call(uhandle, consumer.database_info, contains='Database')
187
188
189
190 read_and_call_until(uhandle, consumer.database_info,
191 contains='sequences;')
192 read_and_call(uhandle, consumer.database_info, contains='sequences;')
193 read_and_call(uhandle, consumer.noevent, blank=1)
194 attempt_read_and_call(uhandle, consumer.noevent,
195 contains='problems or questions')
196 self._scan_blastform(uhandle, consumer)
197
198 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
199 if attempt_read_and_call(uhandle, consumer.noevent,
200 start="<table border=0 width=600"):
201 read_and_call_until(uhandle, consumer.noevent,
202 contains="</table>")
203 consumer.noevent(uhandle.readline())
204 read_and_call(uhandle, consumer.noevent, blank=1)
205
206 attempt_read_and_call(uhandle, consumer.noevent, start="<p>")
207
208 if attempt_read_and_call(uhandle, consumer.noevent,
209 contains="Taxonomy reports"):
210 read_and_call(uhandle, consumer.noevent, start="<BR>")
211 attempt_read_and_call(uhandle, consumer.noevent, start="<PRE>")
212
213
214
215
216
217
218
219
220 if attempt_read_and_call(uhandle, consumer.noevent, start="</PRE>"):
221 read_and_call_until(uhandle, consumer.noevent, start="<PRE>")
222 while 1:
223 line = uhandle.peekline()
224 if not line[:5] == "<PRE>" or line.find("Query=") >= 0:
225 break
226 read_and_call(uhandle, consumer.noevent, start="<PRE>")
227
228 read_and_call_while(uhandle, consumer.noevent, blank=1)
229
231
232 read_and_call(uhandle, consumer.query_info, contains='Query=')
233 read_and_call_until(uhandle, consumer.query_info, blank=1)
234 read_and_call_while(uhandle, consumer.noevent, blank=1)
235 if attempt_read_and_call(uhandle, consumer.noevent, start="<PRE>"):
236 read_and_call_while(uhandle, consumer.noevent, blank=1)
237 self._scan_blastform(uhandle, consumer)
238
242
244 consumer.start_descriptions()
245
246
247
248
249
250 if not attempt_read_and_call(
251 uhandle, consumer.description_header,
252 has_re=re.compile(r"Score {4,5}E")):
253
254 attempt_read_and_call(uhandle, consumer.no_hits,
255 contains='No significant similarity')
256 read_and_call_while(uhandle, consumer.noevent, blank=1)
257 consumer.end_descriptions()
258
259 return
260
261
262
263
264
265
266 read_and_call(uhandle, consumer.description_header,
267 start='Sequences producing')
268 read_and_call(uhandle, consumer.noevent, blank=1)
269
270
271
272
273 read_and_call_while(uhandle, consumer.description,
274 blank=0, contains='<a')
275
276
277 if not attempt_read_and_call(uhandle, consumer.noevent,
278 contains='</PRE>'):
279 read_and_call_while(uhandle, consumer.noevent, blank=1)
280
281 consumer.end_descriptions()
282
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309 line1 = safe_readline(uhandle)
310 line2 = safe_readline(uhandle)
311 uhandle.saveline(line2)
312 uhandle.saveline(line1)
313
314 is_pairwise = is_masterslave = 0
315 if 'Alignments' in line2:
316 is_pairwise = 1
317 elif line2.startswith(' Database'):
318 pass
319 elif line2.startswith('Lambda K H'):
320 pass
321 elif line2.startswith('blast_tmp'):
322 is_masterslave = 1
323 elif line1.startswith('<PRE>'):
324 is_pairwise = 1
325 else:
326 raise ValueError, "Cannot resolve location at lines:\n%s\n%s" % (line1, line2)
327
328 if is_pairwise:
329 self._scan_pairwise_alignments(uhandle, consumer)
330 elif is_masterslave:
331 self._scan_masterslave_alignment(uhandle, consumer)
332
355
371
411
429
435
437
438
439
440
441
442
443
444
445
446 attempt_read_and_call(uhandle, consumer.noevent, start='<PRE>')
447 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
448 read_and_call(uhandle, consumer.score,
449 has_re=re.compile(r'^ (<a[^>]*></a>)*Score'))
450 read_and_call(uhandle, consumer.identities, start=' Identities')
451
452 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
453
454 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
455 read_and_call(uhandle, consumer.noevent, blank=1)
456
458
459
460
461
462
463
464
465
466
467
468
469 while 1:
470
471 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
472 read_and_call(uhandle, consumer.query, start='Query')
473 read_and_call(uhandle, consumer.align, start=' ')
474 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
475 if not attempt_read_and_call(uhandle, consumer.noevent, blank=1):
476 break
477 read_and_call(uhandle, consumer.noevent, start='</PRE>')
478 read_and_call_while(uhandle, consumer.noevent, blank=1)
479
494
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513 consumer.start_database_report()
514
515
516
517 line = uhandle.peekline()
518
519
520 if line.find("Database") < 0:
521 read_and_call(uhandle, consumer.noevent, start='<PRE>')
522 line2 = uhandle.peekline()
523 if line2.find("Lambda K H") < 0:
524 read_and_call(uhandle, consumer.database, contains=' Database')
525 read_and_call_until(uhandle, consumer.database, contains="Posted")
526 read_and_call(uhandle, consumer.posted_date, start=' Posted')
527 read_and_call(uhandle, consumer.num_letters_in_database,
528 start=' Number of letters')
529 read_and_call(uhandle, consumer.num_sequences_in_database,
530 start=' Number of sequences')
531 read_and_call(uhandle, consumer.noevent, start=' ')
532
533 read_and_call(uhandle, consumer.noevent, start='Lambda')
534 read_and_call(uhandle, consumer.ka_params)
535 read_and_call(uhandle, consumer.noevent, blank=1)
536
537
538 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
539
540 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
541 read_and_call(uhandle, consumer.ka_params_gap)
542 read_and_call_while(uhandle, consumer.noevent, blank=1)
543
544 consumer.end_database_report()
545
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573 consumer.start_parameters()
574
575
576 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
577
578 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
579
580
581
582 if attempt_read_and_call(uhandle, consumer.num_hits,
583 start='Number of Hits'):
584 read_and_call(uhandle, consumer.num_sequences,
585 start='Number of Sequences')
586 else:
587 read_and_call(uhandle, consumer.num_sequences,
588 start='Number of Sequences')
589 read_and_call(uhandle, consumer.num_hits,
590 start='Number of Hits')
591
592 read_and_call(uhandle, consumer.num_extends,
593 start='Number of extensions')
594 read_and_call(uhandle, consumer.num_good_extends,
595 start='Number of successful')
596
597 read_and_call(uhandle, consumer.num_seqs_better_e,
598 start='Number of sequences')
599
600
601 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
602 start="Number of HSP's better"):
603
604 if attempt_read_and_call(uhandle, consumer.hsps_prelim_gapped,
605 start="Number of HSP's successfully"):
606 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
607 start="Number of HSP's that")
608 read_and_call(uhandle, consumer.hsps_gapped,
609 start="Number of HSP's gapped")
610 else:
611 read_and_call(uhandle, consumer.no_event,
612 start="Number of HSP's gapped")
613 read_and_call(uhandle, consumer.no_event,
614 start="Number of HSP's successfully")
615 read_and_call(uhandle, consumer.no_event,
616 start="Number of extra gapped")
617
618
619 if attempt_read_and_call(uhandle, consumer.query_length,
620 start='Length of query'):
621 read_and_call(uhandle, consumer.database_length,
622 start='Length of database')
623 read_and_call(uhandle, consumer.no_event,
624 start='Length adjustment')
625 attempt_read_and_call(uhandle, consumer.effective_query_length,
626 start='Effective length of query')
627 read_and_call(uhandle, consumer.effective_database_length,
628 start='Effective length of database')
629 attempt_read_and_call(uhandle, consumer.effective_search_space,
630 start='Effective search space:')
631 attempt_read_and_call(uhandle, consumer.effective_search_space_used,
632 start='Effective search space used')
633
634 else:
635 attempt_read_and_call(uhandle, consumer.query_length,
636 start='length of query')
637 read_and_call(uhandle, consumer.database_length,
638 start='length of database')
639 read_and_call(uhandle, consumer.effective_hsp_length,
640 start='effective HSP')
641 attempt_read_and_call(uhandle, consumer.effective_query_length,
642 start='effective length of query')
643 read_and_call(uhandle, consumer.effective_database_length,
644 start='effective length of database')
645 attempt_read_and_call(uhandle, consumer.effective_search_space,
646 start='effective search space:')
647 attempt_read_and_call(uhandle, consumer.effective_search_space_used,
648 start='effective search space used')
649
650
651 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
652 attempt_read_and_call(uhandle, consumer.threshold, start='T')
653 read_and_call(uhandle, consumer.window_size, start='A')
654 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
655 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
656
657 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
658 start='X3')
659 read_and_call(uhandle, consumer.gap_trigger, start='S1')
660 attempt_read_and_call(uhandle, consumer.blast_cutoff, start='S2')
661
662 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
663 attempt_read_and_call(uhandle, consumer.noevent, start="</PRE>")
664 attempt_read_and_call(uhandle, consumer.noevent, start="</form>")
665
666 consumer.end_parameters()
667
668 -def qblast(program, database, sequence,
669 auto_format=None,composition_based_statistics=None,
670 db_genetic_code=None,endpoints=None,entrez_query='(none)',
671 expect=10.0,filter=None,gapcosts=None,genetic_code=None,
672 hitlist_size=50,i_thresh=None,layout=None,lcase_mask=None,
673 matrix_name=None,nucl_penalty=None,nucl_reward=None,
674 other_advanced=None,perc_ident=None,phi_pattern=None,
675 query_file=None,query_believe_defline=None,query_from=None,
676 query_to=None,searchsp_eff=None,service=None,threshold=None,
677 ungapped_alignment=None,word_size=None,
678 alignments=500,alignment_view=None,descriptions=500,
679 entrez_links_new_window=None,expect_low=None,expect_high=None,
680 format_entrez_query=None,format_object=None,format_type='XML',
681 ncbi_gi=None,results_file=None,show_overview=None
682 ):
683 """Do a BLAST search using the QBLAST server at NCBI.
684
685 Supports all parameters of the qblast API for Put and Get.
686 Some useful parameters:
687 program blastn, blastp, blastx, tblastn, or tblastx (lower case)
688 database Which database to search against (e.g. "nr").
689 sequence The sequence to search.
690 ncbi_gi TRUE/FALSE whether to give 'gi' identifier.
691 descriptions Number of descriptions to show. Def 500.
692 alignments Number of alignments to show. Def 500.
693 expect An expect value cutoff. Def 10.0.
694 matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
695 filter "none" turns off filtering. Default no filtering
696 format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML".
697 entrez_query Entrez query to limit Blast search
698 hitlist_size Number of hits to return. Default 50
699
700 This function does no checking of the validity of the parameters
701 and passes the values to the server as is. More help is available at:
702 http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html
703
704 """
705 import urllib, urllib2
706 from Bio.WWW import RequestLimiter
707
708 assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
709
710
711
712 parameters = [
713 ('AUTO_FORMAT',auto_format),
714 ('COMPOSITION_BASED_STATISTICS',composition_based_statistics),
715 ('DATABASE',database),
716 ('DB_GENETIC_CODE',db_genetic_code),
717 ('ENDPOINTS',endpoints),
718 ('ENTREZ_QUERY',entrez_query),
719 ('EXPECT',expect),
720 ('FILTER',filter),
721 ('GAPCOSTS',gapcosts),
722 ('GENETIC_CODE',genetic_code),
723 ('HITLIST_SIZE',hitlist_size),
724 ('I_THRESH',i_thresh),
725 ('LAYOUT',layout),
726 ('LCASE_MASK',lcase_mask),
727 ('MATRIX_NAME',matrix_name),
728 ('NUCL_PENALTY',nucl_penalty),
729 ('NUCL_REWARD',nucl_reward),
730 ('OTHER_ADVANCED',other_advanced),
731 ('PERC_IDENT',perc_ident),
732 ('PHI_PATTERN',phi_pattern),
733 ('PROGRAM',program),
734 ('QUERY',sequence),
735 ('QUERY_FILE',query_file),
736 ('QUERY_BELIEVE_DEFLINE',query_believe_defline),
737 ('QUERY_FROM',query_from),
738 ('QUERY_TO',query_to),
739 ('SEARCHSP_EFF',searchsp_eff),
740 ('SERVICE',service),
741 ('THRESHOLD',threshold),
742 ('UNGAPPED_ALIGNMENT',ungapped_alignment),
743 ('WORD_SIZE',word_size),
744 ('CMD', 'Put'),
745 ]
746 query = [x for x in parameters if x[1] is not None]
747 message = urllib.urlencode(query)
748
749
750 request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
751 message,
752 {"User-Agent":"BiopythonClient"})
753 handle = urllib2.urlopen(request)
754
755
756
757 rid, rtoe = _parse_qblast_ref_page(handle)
758 parameters = [
759 ('ALIGNMENTS',alignments),
760 ('ALIGNMENT_VIEW',alignment_view),
761 ('DESCRIPTIONS',descriptions),
762 ('ENTREZ_LINKS_NEW_WINDOW',entrez_links_new_window),
763 ('EXPECT_LOW',expect_low),
764 ('EXPECT_HIGH',expect_high),
765 ('FORMAT_ENTREZ_QUERY',format_entrez_query),
766 ('FORMAT_OBJECT',format_object),
767 ('FORMAT_TYPE',format_type),
768 ('NCBI_GI',ncbi_gi),
769 ('RID',rid),
770 ('RESULTS_FILE',results_file),
771 ('SERVICE',service),
772 ('SHOW_OVERVIEW',show_overview),
773 ('CMD', 'Get'),
774 ]
775 query = [x for x in parameters if x[1] is not None]
776 message = urllib.urlencode(query)
777
778
779 limiter = RequestLimiter(3)
780 while 1:
781 limiter.wait()
782 request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
783 message,
784 {"User-Agent":"BiopythonClient"})
785 handle = urllib2.urlopen(request)
786 results = handle.read()
787
788 if results.find("Status=") < 0:
789 break
790 i = results.index("Status=")
791 j = results.index("\n", i)
792 status = results[i+len("Status="):j].strip()
793 if status.upper() == "READY":
794 break
795
796 return StringIO.StringIO(results)
797
799 """Return tuple of RID, RTOE."""
800 s = handle.read()
801 i = s.find("RID =")
802 j = s.find("\n", i)
803 rid = s[i+len("RID ="):j].strip()
804
805 i = s.find("RTOE =")
806 j = s.find("\n", i)
807 rtoe = s[i+len("RTOE ="):j].strip()
808 return rid, int(rtoe)
809