Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8   
   9  """ 
  10  This module provides code to work with the standalone version of 
  11  BLAST, either blastall or blastpgp, provided by the NCBI. 
  12  http://www.ncbi.nlm.nih.gov/BLAST/ 
  13   
  14  Classes: 
  15  LowQualityBlastError     Except that indicates low quality query sequences. 
  16  BlastParser              Parses output from blast. 
  17  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  18  PSIBlastParser           Parses output from psi-blast. 
  19  Iterator                 Iterates over a file of blast results. 
  20   
  21  _Scanner                 Scans output from standalone BLAST. 
  22  _BlastConsumer           Consumes output from blast. 
  23  _PSIBlastConsumer        Consumes output from psi-blast. 
  24  _HeaderConsumer          Consumes header information. 
  25  _DescriptionConsumer     Consumes description information. 
  26  _AlignmentConsumer       Consumes alignment information. 
  27  _HSPConsumer             Consumes hsp information. 
  28  _DatabaseReportConsumer  Consumes database report information. 
  29  _ParametersConsumer      Consumes parameters information. 
  30   
  31  Functions: 
  32  blastall        Execute blastall. 
  33  blastpgp        Execute blastpgp. 
  34  rpsblast        Execute rpsblast. 
  35   
  36  """ 
  37   
  38  from __future__ import generators 
  39  import os 
  40  import re 
  41   
  42  from Bio import File 
  43  from Bio.ParserSupport import * 
  44  from Bio.Blast import Record 
  45   
  46   
47 -class LowQualityBlastError(Exception):
48 """Error caused by running a low quality sequence through BLAST. 49 50 When low quality sequences (like GenBank entries containing only 51 stretches of a single nucleotide) are BLASTed, they will result in 52 BLAST generating an error and not being able to perform the BLAST. 53 search. This error should be raised for the BLAST reports produced 54 in this case. 55 """ 56 pass
57
58 -class ShortQueryBlastError(Exception):
59 """Error caused by running a short query sequence through BLAST. 60 61 If the query sequence is too short, BLAST outputs warnings and errors: 62 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 63 [blastall] ERROR: [000.000] AT1G08320: Blast: 64 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 65 done 66 67 This exception is raised when that condition is detected. 68 69 """ 70 pass
71 72
73 -class _Scanner:
74 """Scan BLAST output from blastall or blastpgp. 75 76 Tested with blastall and blastpgp v2.0.10, v2.0.11 77 78 Methods: 79 feed Feed data into the scanner. 80 81 """
82 - def feed(self, handle, consumer):
83 """S.feed(handle, consumer) 84 85 Feed in a BLAST report for scanning. handle is a file-like 86 object that contains the BLAST report. consumer is a Consumer 87 object that will receive events as the report is scanned. 88 89 """ 90 if isinstance(handle, File.UndoHandle): 91 uhandle = handle 92 else: 93 uhandle = File.UndoHandle(handle) 94 95 # Try to fast-forward to the beginning of the blast report. 96 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 97 # Now scan the BLAST report. 98 self._scan_header(uhandle, consumer) 99 self._scan_rounds(uhandle, consumer) 100 self._scan_database_report(uhandle, consumer) 101 self._scan_parameters(uhandle, consumer)
102
103 - def _scan_header(self, uhandle, consumer):
104 # BLASTP 2.0.10 [Aug-26-1999] 105 # 106 # 107 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 108 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 109 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 110 # programs", Nucleic Acids Res. 25:3389-3402. 111 # 112 # Query= test 113 # (140 letters) 114 # 115 # Database: sdqib40-1.35.seg.fa 116 # 1323 sequences; 223,339 total letters 117 # 118 # ======================================================== 119 # This next example is from the online version of Blast, 120 # note there are TWO references, an RID line, and also 121 # the database is BEFORE the query line. 122 # Note there possibleuse of non-ASCII in the author names. 123 # ======================================================== 124 # 125 # BLASTP 2.2.15 [Oct-15-2006] 126 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 127 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 128 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 129 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 130 # 131 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 132 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 133 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 134 # protein database searches with composition-based statistics 135 # and other refinements", Nucleic Acids Res. 29:2994-3005. 136 # 137 # RID: 1166022616-19998-65316425856.BLASTQ1 138 # 139 # 140 # Database: All non-redundant GenBank CDS 141 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 142 # 4,254,166 sequences; 1,462,033,012 total letters 143 # Query= gi:16127998 144 # Length=428 145 # 146 147 consumer.start_header() 148 149 read_and_call(uhandle, consumer.version, contains='BLAST') 150 read_and_call_while(uhandle, consumer.noevent, blank=1) 151 152 # There might be a <pre> line, for qblast output. 153 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 154 155 # Read the reference(s) 156 while attempt_read_and_call(uhandle, 157 consumer.reference, start='Reference') : 158 # References are normally multiline terminated by a blank line 159 # (or, based on the old code, the RID line) 160 while 1: 161 line = uhandle.readline() 162 if is_blank_line(line) : 163 consumer.noevent(line) 164 break 165 elif line.startswith("RID"): 166 break 167 else : 168 #More of the reference 169 consumer.reference(line) 170 171 #Deal with the optional RID: ... 172 read_and_call_while(uhandle, consumer.noevent, blank=1) 173 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 174 read_and_call_while(uhandle, consumer.noevent, blank=1) 175 176 # blastpgp may have a reference for compositional score matrix 177 # adjustment (see Bug 2502): 178 if attempt_read_and_call( 179 uhandle, consumer.reference, start="Reference"): 180 read_and_call_until(uhandle, consumer.reference, blank=1) 181 read_and_call_while(uhandle, consumer.noevent, blank=1) 182 183 # blastpgp has a Reference for composition-based statistics. 184 if attempt_read_and_call( 185 uhandle, consumer.reference, start="Reference"): 186 read_and_call_until(uhandle, consumer.reference, blank=1) 187 read_and_call_while(uhandle, consumer.noevent, blank=1) 188 189 line = uhandle.peekline() 190 assert line.strip() <> "" 191 assert not line.startswith("RID:") 192 if line.startswith("Query=") : 193 #This is an old style query then database... 194 195 # Read the Query lines and the following blank line. 196 read_and_call(uhandle, consumer.query_info, start='Query=') 197 read_and_call_until(uhandle, consumer.query_info, blank=1) 198 read_and_call_while(uhandle, consumer.noevent, blank=1) 199 200 # Read the database lines and the following blank line. 201 read_and_call_until(uhandle, consumer.database_info, end='total letters') 202 read_and_call(uhandle, consumer.database_info, contains='sequences') 203 read_and_call_while(uhandle, consumer.noevent, blank=1) 204 elif line.startswith("Database:") : 205 #This is a new style database then query... 206 read_and_call_until(uhandle, consumer.database_info, end='total letters') 207 read_and_call(uhandle, consumer.database_info, contains='sequences') 208 read_and_call_while(uhandle, consumer.noevent, blank=1) 209 210 # Read the Query lines and the following blank line. 211 read_and_call(uhandle, consumer.query_info, start='Query=') 212 read_and_call_until(uhandle, consumer.query_info, blank=1) 213 read_and_call_while(uhandle, consumer.noevent, blank=1) 214 else : 215 raise ValueError("Invalid header?") 216 217 consumer.end_header()
218
219 - def _scan_rounds(self, uhandle, consumer):
220 # Scan a bunch of rounds. 221 # Each round begins with either a "Searching......" line 222 # or a 'Score E' line followed by descriptions and alignments. 223 # The email server doesn't give the "Searching....." line. 224 # If there is no 'Searching.....' line then you'll first see a 225 # 'Results from round' line 226 227 while 1: 228 line = safe_peekline(uhandle) 229 if (not line.startswith('Searching') and 230 not line.startswith('Results from round') and 231 re.search(r"Score +E", line) is None and 232 line.find('No hits found') == -1): 233 break 234 235 self._scan_descriptions(uhandle, consumer) 236 self._scan_alignments(uhandle, consumer)
237
238 - def _scan_descriptions(self, uhandle, consumer):
239 # Searching..................................................done 240 # Results from round 2 241 # 242 # 243 # Sc 244 # Sequences producing significant alignments: (b 245 # Sequences used in model and found again: 246 # 247 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 248 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 249 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 250 # 251 # Sequences not found previously or not previously below threshold: 252 # 253 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 254 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 255 # 256 257 # If PSI-BLAST, may also have: 258 # 259 # CONVERGED! 260 261 consumer.start_descriptions() 262 263 # Read 'Searching' 264 # This line seems to be missing in BLASTN 2.1.2 (others?) 265 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 266 267 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 268 # If this happens, the handle will yield no more information. 269 if not uhandle.peekline(): 270 raise ValueError, "Unexpected end of blast report. " + \ 271 "Looks suspiciously like a PSI-BLAST crash." 272 273 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 274 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 275 # [blastall] ERROR: [000.000] AT1G08320: Blast: 276 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 277 # done 278 # Reported by David Weisman. 279 # Check for these error lines and ignore them for now. Let 280 # the BlastErrorParser deal with them. 281 line = uhandle.peekline() 282 if line.find("ERROR:") != -1 or line.startswith("done"): 283 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 284 read_and_call(uhandle, consumer.noevent, start="done") 285 286 # Check to see if this is PSI-BLAST. 287 # If it is, the 'Searching' line will be followed by: 288 # (version 2.0.10) 289 # Searching............................. 290 # Results from round 2 291 # or (version 2.0.11) 292 # Searching............................. 293 # 294 # 295 # Results from round 2 296 297 # Skip a bunch of blank lines. 298 read_and_call_while(uhandle, consumer.noevent, blank=1) 299 # Check for the results line if it's there. 300 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 301 read_and_call_while(uhandle, consumer.noevent, blank=1) 302 303 # Three things can happen here: 304 # 1. line contains 'Score E' 305 # 2. line contains "No hits found" 306 # 3. no descriptions 307 # The first one begins a bunch of descriptions. The last two 308 # indicates that no descriptions follow, and we should go straight 309 # to the alignments. 310 if not attempt_read_and_call( 311 uhandle, consumer.description_header, 312 has_re=re.compile(r'Score +E')): 313 # Either case 2 or 3. Look for "No hits found". 314 attempt_read_and_call(uhandle, consumer.no_hits, 315 contains='No hits found') 316 read_and_call_while(uhandle, consumer.noevent, blank=1) 317 318 consumer.end_descriptions() 319 # Stop processing. 320 return 321 322 # Read the score header lines 323 read_and_call(uhandle, consumer.description_header, 324 start='Sequences producing') 325 326 # If PSI-BLAST, read the 'Sequences used in model' line. 327 attempt_read_and_call(uhandle, consumer.model_sequences, 328 start='Sequences used in model') 329 read_and_call_while(uhandle, consumer.noevent, blank=1) 330 331 # Read the descriptions and the following blank lines, making 332 # sure that there are descriptions. 333 if not uhandle.peekline().startswith('Sequences not found'): 334 read_and_call_until(uhandle, consumer.description, blank=1) 335 read_and_call_while(uhandle, consumer.noevent, blank=1) 336 337 # If PSI-BLAST, read the 'Sequences not found' line followed 338 # by more descriptions. However, I need to watch out for the 339 # case where there were no sequences not found previously, in 340 # which case there will be no more descriptions. 341 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 342 start='Sequences not found'): 343 # Read the descriptions and the following blank lines. 344 read_and_call_while(uhandle, consumer.noevent, blank=1) 345 l = safe_peekline(uhandle) 346 # Brad -- added check for QUERY. On some PSI-BLAST outputs 347 # there will be a 'Sequences not found' line followed by no 348 # descriptions. Check for this case since the first thing you'll 349 # get is a blank line and then 'QUERY' 350 if not l.startswith('CONVERGED') and l[0] != '>' \ 351 and not l.startswith('QUERY'): 352 read_and_call_until(uhandle, consumer.description, blank=1) 353 read_and_call_while(uhandle, consumer.noevent, blank=1) 354 355 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 356 read_and_call_while(uhandle, consumer.noevent, blank=1) 357 358 consumer.end_descriptions()
359
360 - def _scan_alignments(self, uhandle, consumer):
361 # qblast inserts a helpful line here. 362 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 363 364 # First, check to see if I'm at the database report. 365 line = safe_peekline(uhandle) 366 if line.startswith(' Database'): 367 return 368 elif line[0] == '>': 369 # XXX make a better check here between pairwise and masterslave 370 self._scan_pairwise_alignments(uhandle, consumer) 371 else: 372 # XXX put in a check to make sure I'm in a masterslave alignment 373 self._scan_masterslave_alignment(uhandle, consumer)
374
375 - def _scan_pairwise_alignments(self, uhandle, consumer):
376 while 1: 377 line = safe_peekline(uhandle) 378 if line[0] != '>': 379 break 380 self._scan_one_pairwise_alignment(uhandle, consumer)
381
382 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
383 consumer.start_alignment() 384 385 self._scan_alignment_header(uhandle, consumer) 386 387 # Scan a bunch of score/alignment pairs. 388 while 1: 389 line = safe_peekline(uhandle) 390 if not line.startswith(' Score'): 391 break 392 self._scan_hsp(uhandle, consumer) 393 consumer.end_alignment()
394
395 - def _scan_alignment_header(self, uhandle, consumer):
396 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 397 # stearothermophilus] 398 # Length = 81 399 # 400 # Or, more recently with different white space: 401 # 402 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 403 # gi|15829258|ref|NP_308031.1| threonine synthase 404 # ... 405 # Length=428 406 read_and_call(uhandle, consumer.title, start='>') 407 while 1: 408 line = safe_readline(uhandle) 409 if line.lstrip().startswith('Length =') \ 410 or line.lstrip().startswith('Length='): 411 consumer.length(line) 412 break 413 elif is_blank_line(line): 414 # Check to make sure I haven't missed the Length line 415 raise ValueError, "I missed the Length in an alignment header" 416 consumer.title(line) 417 418 # Older versions of BLAST will have a line with some spaces. 419 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 420 if not attempt_read_and_call(uhandle, consumer.noevent, 421 start=' '): 422 read_and_call(uhandle, consumer.noevent, blank=1)
423
424 - def _scan_hsp(self, uhandle, consumer):
425 consumer.start_hsp() 426 self._scan_hsp_header(uhandle, consumer) 427 self._scan_hsp_alignment(uhandle, consumer) 428 consumer.end_hsp()
429
430 - def _scan_hsp_header(self, uhandle, consumer):
431 # Score = 22.7 bits (47), Expect = 2.5 432 # Identities = 10/36 (27%), Positives = 18/36 (49%) 433 # Strand = Plus / Plus 434 # Frame = +3 435 # 436 437 read_and_call(uhandle, consumer.score, start=' Score') 438 read_and_call(uhandle, consumer.identities, start=' Identities') 439 # BLASTN 440 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 441 # BLASTX, TBLASTN, TBLASTX 442 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 443 read_and_call(uhandle, consumer.noevent, blank=1)
444
445 - def _scan_hsp_alignment(self, uhandle, consumer):
446 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 447 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 448 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 449 # 450 # Query: 64 AEKILIKR 71 451 # I +K 452 # Sbjct: 70 PNIIQLKD 77 453 # 454 455 while 1: 456 # Blastn adds an extra line filled with spaces before Query 457 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 458 read_and_call(uhandle, consumer.query, start='Query') 459 read_and_call(uhandle, consumer.align, start=' ') 460 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 461 read_and_call_while(uhandle, consumer.noevent, blank=1) 462 line = safe_peekline(uhandle) 463 # Alignment continues if I see a 'Query' or the spaces for Blastn. 464 if not (line.startswith('Query') or line.startswith(' ')): 465 break
466
467 - def _scan_masterslave_alignment(self, uhandle, consumer):
468 consumer.start_alignment() 469 while 1: 470 line = safe_readline(uhandle) 471 # Check to see whether I'm finished reading the alignment. 472 # This is indicated by 1) database section, 2) next psi-blast 473 # round, which can also be a 'Results from round' if no 474 # searching line is present 475 # patch by chapmanb 476 if line.startswith('Searching') or \ 477 line.startswith('Results from round'): 478 uhandle.saveline(line) 479 break 480 elif line.startswith(' Database'): 481 uhandle.saveline(line) 482 break 483 elif is_blank_line(line): 484 consumer.noevent(line) 485 else: 486 consumer.multalign(line) 487 read_and_call_while(uhandle, consumer.noevent, blank=1) 488 consumer.end_alignment()
489
490 - def _scan_database_report(self, uhandle, consumer):
491 # Database: sdqib40-1.35.seg.fa 492 # Posted date: Nov 1, 1999 4:25 PM 493 # Number of letters in database: 223,339 494 # Number of sequences in database: 1323 495 # 496 # Lambda K H 497 # 0.322 0.133 0.369 498 # 499 # Gapped 500 # Lambda K H 501 # 0.270 0.0470 0.230 502 # 503 ########################################## 504 # Or, more recently Blast 2.2.15 gives less blank lines 505 ########################################## 506 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 507 # environmental samples 508 # Posted date: Dec 12, 2006 5:51 PM 509 # Number of letters in database: 667,088,753 510 # Number of sequences in database: 2,094,974 511 # Lambda K H 512 # 0.319 0.136 0.395 513 # Gapped 514 # Lambda K H 515 # 0.267 0.0410 0.140 516 517 518 consumer.start_database_report() 519 520 # Subset of the database(s) listed below 521 # Number of letters searched: 562,618,960 522 # Number of sequences searched: 228,924 523 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 524 read_and_call(uhandle, consumer.noevent, contains="letters") 525 read_and_call(uhandle, consumer.noevent, contains="sequences") 526 read_and_call(uhandle, consumer.noevent, start=" ") 527 528 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 529 # was missing the "Database" stanza completely. 530 while attempt_read_and_call(uhandle, consumer.database, 531 start=' Database'): 532 # BLAT output ends abruptly here, without any of the other 533 # information. Check to see if this is the case. If so, 534 # then end the database report here gracefully. 535 if not uhandle.peekline(): 536 consumer.end_database_report() 537 return 538 539 # Database can span multiple lines. 540 read_and_call_until(uhandle, consumer.database, start=' Posted') 541 read_and_call(uhandle, consumer.posted_date, start=' Posted') 542 read_and_call(uhandle, consumer.num_letters_in_database, 543 start=' Number of letters') 544 read_and_call(uhandle, consumer.num_sequences_in_database, 545 start=' Number of sequences') 546 #There may not be a line starting with spaces... 547 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 548 549 line = safe_readline(uhandle) 550 uhandle.saveline(line) 551 if line.find('Lambda') != -1: 552 break 553 554 read_and_call(uhandle, consumer.noevent, start='Lambda') 555 read_and_call(uhandle, consumer.ka_params) 556 557 #This blank line is optional: 558 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 559 560 # not BLASTP 561 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 562 # not TBLASTX 563 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 564 read_and_call(uhandle, consumer.ka_params_gap) 565 566 # Blast 2.2.4 can sometimes skip the whole parameter section. 567 # Thus, I need to be careful not to read past the end of the 568 # file. 569 try: 570 read_and_call_while(uhandle, consumer.noevent, blank=1) 571 except ValueError, x: 572 if str(x) != "Unexpected end of stream.": 573 raise 574 consumer.end_database_report()
575
576 - def _scan_parameters(self, uhandle, consumer):
577 # Matrix: BLOSUM62 578 # Gap Penalties: Existence: 11, Extension: 1 579 # Number of Hits to DB: 50604 580 # Number of Sequences: 1323 581 # Number of extensions: 1526 582 # Number of successful extensions: 6 583 # Number of sequences better than 10.0: 5 584 # Number of HSP's better than 10.0 without gapping: 5 585 # Number of HSP's successfully gapped in prelim test: 0 586 # Number of HSP's that attempted gapping in prelim test: 1 587 # Number of HSP's gapped (non-prelim): 5 588 # length of query: 140 589 # length of database: 223,339 590 # effective HSP length: 39 591 # effective length of query: 101 592 # effective length of database: 171,742 593 # effective search space: 17345942 594 # effective search space used: 17345942 595 # T: 11 596 # A: 40 597 # X1: 16 ( 7.4 bits) 598 # X2: 38 (14.8 bits) 599 # X3: 64 (24.9 bits) 600 # S1: 41 (21.9 bits) 601 # S2: 42 (20.8 bits) 602 ########################################## 603 # Or, more recently Blast(x) 2.2.15 gives 604 ########################################## 605 # Matrix: BLOSUM62 606 # Gap Penalties: Existence: 11, Extension: 1 607 # Number of Sequences: 4535438 608 # Number of Hits to DB: 2,588,844,100 609 # Number of extensions: 60427286 610 # Number of successful extensions: 126433 611 # Number of sequences better than 2.0: 30 612 # Number of HSP's gapped: 126387 613 # Number of HSP's successfully gapped: 35 614 # Length of query: 291 615 # Length of database: 1,573,298,872 616 # Length adjustment: 130 617 # Effective length of query: 161 618 # Effective length of database: 983,691,932 619 # Effective search space: 158374401052 620 # Effective search space used: 158374401052 621 # Neighboring words threshold: 12 622 # Window for multiple hits: 40 623 # X1: 16 ( 7.3 bits) 624 # X2: 38 (14.6 bits) 625 # X3: 64 (24.7 bits) 626 # S1: 41 (21.7 bits) 627 # S2: 32 (16.9 bits) 628 629 630 # Blast 2.2.4 can sometimes skip the whole parameter section. 631 # Thus, check to make sure that the parameter section really 632 # exists. 633 if not uhandle.peekline(): 634 return 635 636 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 637 # "Number of Sequences" lines. 638 consumer.start_parameters() 639 640 # Matrix line may be missing in BLASTN 2.2.9 641 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 642 # not TBLASTX 643 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 644 645 attempt_read_and_call(uhandle, consumer.num_sequences, 646 start='Number of Sequences') 647 read_and_call(uhandle, consumer.num_hits, 648 start='Number of Hits') 649 attempt_read_and_call(uhandle, consumer.num_sequences, 650 start='Number of Sequences') 651 read_and_call(uhandle, consumer.num_extends, 652 start='Number of extensions') 653 read_and_call(uhandle, consumer.num_good_extends, 654 start='Number of successful') 655 656 read_and_call(uhandle, consumer.num_seqs_better_e, 657 start='Number of sequences') 658 659 # not BLASTN, TBLASTX 660 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 661 start="Number of HSP's better"): 662 # BLASTN 2.2.9 663 if attempt_read_and_call(uhandle, consumer.noevent, 664 start="Number of HSP's gapped:"): 665 read_and_call(uhandle, consumer.noevent, 666 start="Number of HSP's successfully") 667 #This is ommitted in 2.2.15 668 attempt_read_and_call(uhandle, consumer.noevent, 669 start="Number of extra gapped extensions") 670 else: 671 read_and_call(uhandle, consumer.hsps_prelim_gapped, 672 start="Number of HSP's successfully") 673 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 674 start="Number of HSP's that") 675 read_and_call(uhandle, consumer.hsps_gapped, 676 start="Number of HSP's gapped") 677 #e.g. BLASTX 2.2.15 where the "better" line is missing 678 elif attempt_read_and_call(uhandle, consumer.noevent, 679 start="Number of HSP's gapped"): 680 read_and_call(uhandle, consumer.noevent, 681 start="Number of HSP's successfully") 682 683 # not in blastx 2.2.1 684 attempt_read_and_call(uhandle, consumer.query_length, 685 has_re=re.compile(r"[Ll]ength of query")) 686 read_and_call(uhandle, consumer.database_length, 687 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 688 689 # BLASTN 2.2.9 690 attempt_read_and_call(uhandle, consumer.noevent, 691 start="Length adjustment") 692 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 693 start='effective HSP') 694 # Not in blastx 2.2.1 695 attempt_read_and_call( 696 uhandle, consumer.effective_query_length, 697 has_re=re.compile(r'[Ee]ffective length of query')) 698 699 # This is not in BLASTP 2.2.15 700 attempt_read_and_call( 701 uhandle, consumer.effective_database_length, 702 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 703 # Not in blastx 2.2.1, added a ':' to distinguish between 704 # this and the 'effective search space used' line 705 attempt_read_and_call( 706 uhandle, consumer.effective_search_space, 707 has_re=re.compile(r'[Ee]ffective search space:')) 708 # Does not appear in BLASTP 2.0.5 709 attempt_read_and_call( 710 uhandle, consumer.effective_search_space_used, 711 has_re=re.compile(r'[Ee]ffective search space used')) 712 713 # BLASTX, TBLASTN, TBLASTX 714 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 715 716 # not in BLASTN 2.2.9 717 attempt_read_and_call(uhandle, consumer.threshold, start='T') 718 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 719 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 720 721 # not in BLASTX 2.2.15 722 attempt_read_and_call(uhandle, consumer.window_size, start='A') 723 # get this instead: "Window for multiple hits: 40" 724 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 725 726 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 727 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 728 729 # not BLASTN, TBLASTX 730 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 731 start='X3') 732 733 read_and_call(uhandle, consumer.gap_trigger, start='S1') 734 # not in blastx 2.2.1 735 # first we make sure we have additional lines to work with, if 736 # not then the file is done and we don't have a final S2 737 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 738 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 739 740 consumer.end_parameters()
741
742 -class BlastParser(AbstractParser):
743 """Parses BLAST data into a Record.Blast object. 744 745 """
746 - def __init__(self):
747 """__init__(self)""" 748 self._scanner = _Scanner() 749 self._consumer = _BlastConsumer()
750
751 - def parse(self, handle):
752 """parse(self, handle)""" 753 self._scanner.feed(handle, self._consumer) 754 return self._consumer.data
755
756 -class PSIBlastParser(AbstractParser):
757 """Parses BLAST data into a Record.PSIBlast object. 758 759 """
760 - def __init__(self):
761 """__init__(self)""" 762 self._scanner = _Scanner() 763 self._consumer = _PSIBlastConsumer()
764
765 - def parse(self, handle):
766 """parse(self, handle)""" 767 self._scanner.feed(handle, self._consumer) 768 return self._consumer.data
769
770 -class _HeaderConsumer:
771 - def start_header(self):
772 self._header = Record.Header()
773
774 - def version(self, line):
775 c = line.split() 776 self._header.application = c[0] 777 self._header.version = c[1] 778 self._header.date = c[2][1:-1]
779
780 - def reference(self, line):
781 if line.startswith('Reference: '): 782 self._header.reference = line[11:] 783 else: 784 self._header.reference = self._header.reference + line
785
786 - def query_info(self, line):
787 if line.startswith('Query= '): 788 self._header.query = line[7:] 789 elif not line.startswith(' '): # continuation of query_info 790 self._header.query = "%s%s" % (self._header.query, line) 791 else: 792 letters, = _re_search( 793 r"([0-9,]+) letters", line, 794 "I could not find the number of letters in line\n%s" % line) 795 self._header.query_letters = _safe_int(letters)
796
797 - def database_info(self, line):
798 line = line.rstrip() 799 if line.startswith('Database: '): 800 self._header.database = line[10:] 801 elif not line.endswith('total letters'): 802 self._header.database = self._header.database + line.strip() 803 else: 804 sequences, letters =_re_search( 805 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 806 "I could not find the sequences and letters in line\n%s" %line) 807 self._header.database_sequences = _safe_int(sequences) 808 self._header.database_letters = _safe_int(letters)
809
810 - def end_header(self):
811 # Get rid of the trailing newlines 812 self._header.reference = self._header.reference.rstrip() 813 self._header.query = self._header.query.rstrip()
814
815 -class _DescriptionConsumer:
816 - def start_descriptions(self):
817 self._descriptions = [] 818 self._model_sequences = [] 819 self._nonmodel_sequences = [] 820 self._converged = 0 821 self._type = None 822 self._roundnum = None 823 824 self.__has_n = 0 # Does the description line contain an N value?
825
826 - def description_header(self, line):
827 if line.startswith('Sequences producing'): 828 cols = line.split() 829 if cols[-1] == 'N': 830 self.__has_n = 1
831
832 - def description(self, line):
833 dh = self._parse(line) 834 if self._type == 'model': 835 self._model_sequences.append(dh) 836 elif self._type == 'nonmodel': 837 self._nonmodel_sequences.append(dh) 838 else: 839 self._descriptions.append(dh)
840
841 - def model_sequences(self, line):
842 self._type = 'model'
843
844 - def nonmodel_sequences(self, line):
845 self._type = 'nonmodel'
846
847 - def converged(self, line):
848 self._converged = 1
849
850 - def no_hits(self, line):
851 pass
852
853 - def round(self, line):
854 if not line.startswith('Results from round'): 855 raise ValueError, "I didn't understand the round line\n%s" % line 856 self._roundnum = _safe_int(line[18:].strip())
857
858 - def end_descriptions(self):
859 pass
860
861 - def _parse(self, description_line):
862 line = description_line # for convenience 863 dh = Record.Description() 864 865 # I need to separate the score and p-value from the title. 866 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 867 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 868 # special cases to handle: 869 # - title must be preserved exactly (including whitespaces) 870 # - score could be equal to e-value (not likely, but what if??) 871 # - sometimes there's an "N" score of '1'. 872 cols = line.split() 873 if len(cols) < 3: 874 raise ValueError, \ 875 "Line does not appear to contain description:\n%s" % line 876 if self.__has_n: 877 i = line.rfind(cols[-1]) # find start of N 878 i = line.rfind(cols[-2], 0, i) # find start of p-value 879 i = line.rfind(cols[-3], 0, i) # find start of score 880 else: 881 i = line.rfind(cols[-1]) # find start of p-value 882 i = line.rfind(cols[-2], 0, i) # find start of score 883 if self.__has_n: 884 dh.title, dh.score, dh.e, dh.num_alignments = \ 885 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 886 else: 887 dh.title, dh.score, dh.e, dh.num_alignments = \ 888 line[:i].rstrip(), cols[-2], cols[-1], 1 889 dh.num_alignments = _safe_int(dh.num_alignments) 890 dh.score = _safe_int(dh.score) 891 dh.e = _safe_float(dh.e) 892 return dh
893
894 -class _AlignmentConsumer:
895 # This is a little bit tricky. An alignment can either be a 896 # pairwise alignment or a multiple alignment. Since it's difficult 897 # to know a-priori which one the blast record will contain, I'm going 898 # to make one class that can parse both of them.
899 - def start_alignment(self):
900 self._alignment = Record.Alignment() 901 self._multiple_alignment = Record.MultipleAlignment()
902
903 - def title(self, line):
904 self._alignment.title = "%s%s" % (self._alignment.title, 905 line.lstrip())
906
907 - def length(self, line):
908 #e.g. "Length = 81" or more recently, "Length=428" 909 parts = line.replace(" ","").split("=") 910 assert len(parts)==2, "Unrecognised format length line" 911 self._alignment.length = parts[1] 912 self._alignment.length = _safe_int(self._alignment.length)
913
914 - def multalign(self, line):
915 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 916 if line.startswith('QUERY') or line.startswith('blast_tmp'): 917 # If this is the first line of the multiple alignment, 918 # then I need to figure out how the line is formatted. 919 920 # Format of line is: 921 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 922 try: 923 name, start, seq, end = line.split() 924 except ValueError: 925 raise ValueError, "I do not understand the line\n%s" \ 926 % line 927 self._start_index = line.index(start, len(name)) 928 self._seq_index = line.index(seq, 929 self._start_index+len(start)) 930 # subtract 1 for the space 931 self._name_length = self._start_index - 1 932 self._start_length = self._seq_index - self._start_index - 1 933 self._seq_length = line.rfind(end) - self._seq_index - 1 934 935 #self._seq_index = line.index(seq) 936 ## subtract 1 for the space 937 #self._seq_length = line.rfind(end) - self._seq_index - 1 938 #self._start_index = line.index(start) 939 #self._start_length = self._seq_index - self._start_index - 1 940 #self._name_length = self._start_index 941 942 # Extract the information from the line 943 name = line[:self._name_length] 944 name = name.rstrip() 945 start = line[self._start_index:self._start_index+self._start_length] 946 start = start.rstrip() 947 if start: 948 start = _safe_int(start) 949 end = line[self._seq_index+self._seq_length:].rstrip() 950 if end: 951 end = _safe_int(end) 952 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 953 # right pad the sequence with spaces if necessary 954 if len(seq) < self._seq_length: 955 seq = seq + ' '*(self._seq_length-len(seq)) 956 957 # I need to make sure the sequence is aligned correctly with the query. 958 # First, I will find the length of the query. Then, if necessary, 959 # I will pad my current sequence with spaces so that they will line 960 # up correctly. 961 962 # Two possible things can happen: 963 # QUERY 964 # 504 965 # 966 # QUERY 967 # 403 968 # 969 # Sequence 504 will need padding at the end. Since I won't know 970 # this until the end of the alignment, this will be handled in 971 # end_alignment. 972 # Sequence 403 will need padding before being added to the alignment. 973 974 align = self._multiple_alignment.alignment # for convenience 975 align.append((name, start, seq, end))
976 977 # This is old code that tried to line up all the sequences 978 # in a multiple alignment by using the sequence title's as 979 # identifiers. The problem with this is that BLAST assigns 980 # different HSP's from the same sequence the same id. Thus, 981 # in one alignment block, there may be multiple sequences with 982 # the same id. I'm not sure how to handle this, so I'm not 983 # going to. 984 985 # # If the sequence is the query, then just add it. 986 # if name == 'QUERY': 987 # if len(align) == 0: 988 # align.append((name, start, seq)) 989 # else: 990 # aname, astart, aseq = align[0] 991 # if name != aname: 992 # raise ValueError, "Query is not the first sequence" 993 # aseq = aseq + seq 994 # align[0] = aname, astart, aseq 995 # else: 996 # if len(align) == 0: 997 # raise ValueError, "I could not find the query sequence" 998 # qname, qstart, qseq = align[0] 999 # 1000 # # Now find my sequence in the multiple alignment. 1001 # for i in range(1, len(align)): 1002 # aname, astart, aseq = align[i] 1003 # if name == aname: 1004 # index = i 1005 # break 1006 # else: 1007 # # If I couldn't find it, then add a new one. 1008 # align.append((None, None, None)) 1009 # index = len(align)-1 1010 # # Make sure to left-pad it. 1011 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1012 # 1013 # if len(qseq) != len(aseq) + len(seq): 1014 # # If my sequences are shorter than the query sequence, 1015 # # then I will need to pad some spaces to make them line up. 1016 # # Since I've already right padded seq, that means aseq 1017 # # must be too short. 1018 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1019 # aseq = aseq + seq 1020 # if astart is None: 1021 # astart = start 1022 # align[index] = aname, astart, aseq 1023
1024 - def end_alignment(self):
1025 # Remove trailing newlines 1026 if self._alignment: 1027 self._alignment.title = self._alignment.title.rstrip() 1028 1029 # This code is also obsolete. See note above. 1030 # If there's a multiple alignment, I will need to make sure 1031 # all the sequences are aligned. That is, I may need to 1032 # right-pad the sequences. 1033 # if self._multiple_alignment is not None: 1034 # align = self._multiple_alignment.alignment 1035 # seqlen = None 1036 # for i in range(len(align)): 1037 # name, start, seq = align[i] 1038 # if seqlen is None: 1039 # seqlen = len(seq) 1040 # else: 1041 # if len(seq) < seqlen: 1042 # seq = seq + ' '*(seqlen - len(seq)) 1043 # align[i] = name, start, seq 1044 # elif len(seq) > seqlen: 1045 # raise ValueError, \ 1046 # "Sequence %s is longer than the query" % name 1047 1048 # Clean up some variables, if they exist. 1049 try: 1050 del self._seq_index 1051 del self._seq_length 1052 del self._start_index 1053 del self._start_length 1054 del self._name_length 1055 except AttributeError: 1056 pass
1057
1058 -class _HSPConsumer:
1059 - def start_hsp(self):
1060 self._hsp = Record.HSP()
1061
1062 - def score(self, line):
1063 self._hsp.bits, self._hsp.score = _re_search( 1064 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1065 "I could not find the score in line\n%s" % line) 1066 self._hsp.score = _safe_float(self._hsp.score) 1067 self._hsp.bits = _safe_float(self._hsp.bits) 1068 1069 x, y = _re_search( 1070 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1071 "I could not find the expect in line\n%s" % line) 1072 if x: 1073 self._hsp.num_alignments = _safe_int(x) 1074 else: 1075 self._hsp.num_alignments = 1 1076 self._hsp.expect = _safe_float(y)
1077
1078 - def identities(self, line):
1079 x, y = _re_search( 1080 r"Identities = (\d+)\/(\d+)", line, 1081 "I could not find the identities in line\n%s" % line) 1082 self._hsp.identities = _safe_int(x), _safe_int(y) 1083 1084 if line.find('Positives') != -1: 1085 x, y = _re_search( 1086 r"Positives = (\d+)\/(\d+)", line, 1087 "I could not find the positives in line\n%s" % line) 1088 self._hsp.positives = _safe_int(x), _safe_int(y) 1089 1090 if line.find('Gaps') != -1: 1091 x, y = _re_search( 1092 r"Gaps = (\d+)\/(\d+)", line, 1093 "I could not find the gaps in line\n%s" % line) 1094 self._hsp.gaps = _safe_int(x), _safe_int(y)
1095 1096
1097 - def strand(self, line):
1098 self._hsp.strand = _re_search( 1099 r"Strand = (\w+) / (\w+)", line, 1100 "I could not find the strand in line\n%s" % line)
1101
1102 - def frame(self, line):
1103 # Frame can be in formats: 1104 # Frame = +1 1105 # Frame = +2 / +2 1106 if line.find('/') != -1: 1107 self._hsp.frame = _re_search( 1108 r"Frame = ([-+][123]) / ([-+][123])", line, 1109 "I could not find the frame in line\n%s" % line) 1110 else: 1111 self._hsp.frame = _re_search( 1112 r"Frame = ([-+][123])", line, 1113 "I could not find the frame in line\n%s" % line)
1114 1115 # Match a space, if one is available. Masahir Ishikawa found a 1116 # case where there's no space between the start and the sequence: 1117 # Query: 100tt 101 1118 # line below modified by Yair Benita, Sep 2004 1119 # Note that the colon is not always present. 2006 1120 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1121 - def query(self, line):
1122 m = self._query_re.search(line) 1123 if m is None: 1124 raise ValueError, "I could not find the query in line\n%s" % line 1125 1126 # line below modified by Yair Benita, Sep 2004. 1127 # added the end attribute for the query 1128 colon, start, seq, end = m.groups() 1129 self._hsp.query = self._hsp.query + seq 1130 if self._hsp.query_start is None: 1131 self._hsp.query_start = _safe_int(start) 1132 1133 # line below added by Yair Benita, Sep 2004. 1134 # added the end attribute for the query 1135 self._hsp.query_end = _safe_int(end) 1136 1137 #Get index for sequence start (regular expression element 3) 1138 self._query_start_index = m.start(3) 1139 self._query_len = len(seq)
1140
1141 - def align(self, line):
1142 seq = line[self._query_start_index:].rstrip() 1143 if len(seq) < self._query_len: 1144 # Make sure the alignment is the same length as the query 1145 seq = seq + ' ' * (self._query_len-len(seq)) 1146 elif len(seq) < self._query_len: 1147 raise ValueError, "Match is longer than the query in line\n%s" % \ 1148 line 1149 self._hsp.match = self._hsp.match + seq
1150 1151 # To match how we do the query, cache the regular expression. 1152 # Note that the colon is not always present. 1153 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1154 - def sbjct(self, line):
1155 m = self._sbjct_re.search(line) 1156 if m is None: 1157 raise ValueError, "I could not find the sbjct in line\n%s" % line 1158 colon, start, seq, end = m.groups() 1159 #mikep 26/9/00 1160 #On occasion, there is a blast hit with no subject match 1161 #so far, it only occurs with 1-line short "matches" 1162 #I have decided to let these pass as they appear 1163 if not seq.strip(): 1164 seq = ' ' * self._query_len 1165 self._hsp.sbjct = self._hsp.sbjct + seq 1166 if self._hsp.sbjct_start is None: 1167 self._hsp.sbjct_start = _safe_int(start) 1168 1169 self._hsp.sbjct_end = _safe_int(end) 1170 if len(seq) != self._query_len: 1171 raise ValueError, \ 1172 "QUERY and SBJCT sequence lengths don't match in line\n%s" \ 1173 % line 1174 1175 del self._query_start_index # clean up unused variables 1176 del self._query_len
1177
1178 - def end_hsp(self):
1179 pass
1180
1181 -class _DatabaseReportConsumer:
1182
1183 - def start_database_report(self):
1184 self._dr = Record.DatabaseReport()
1185
1186 - def database(self, line):
1187 m = re.search(r"Database: (.+)$", line) 1188 if m: 1189 self._dr.database_name.append(m.group(1)) 1190 elif self._dr.database_name: 1191 # This must be a continuation of the previous name. 1192 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1193 line.strip())
1194
1195 - def posted_date(self, line):
1196 self._dr.posted_date.append(_re_search( 1197 r"Posted date:\s*(.+)$", line, 1198 "I could not find the posted date in line\n%s" % line))
1199
1200 - def num_letters_in_database(self, line):
1201 letters, = _get_cols( 1202 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1203 self._dr.num_letters_in_database.append(_safe_int(letters))
1204
1205 - def num_sequences_in_database(self, line):
1206 sequences, = _get_cols( 1207 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1208 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1209
1210 - def ka_params(self, line):
1211 x = line.split() 1212 self._dr.ka_params = map(_safe_float, x)
1213
1214 - def gapped(self, line):
1215 self._dr.gapped = 1
1216
1217 - def ka_params_gap(self, line):
1218 x = line.split() 1219 self._dr.ka_params_gap = map(_safe_float, x)
1220
1221 - def end_database_report(self):
1222 pass
1223
1224 -class _ParametersConsumer:
1225 - def start_parameters(self):
1226 self._params = Record.Parameters()
1227
1228 - def matrix(self, line):
1229 self._params.matrix = line[8:].rstrip()
1230
1231 - def gap_penalties(self, line):
1232 x = _get_cols( 1233 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"}) 1234 self._params.gap_penalties = map(_safe_float, x)
1235
1236 - def num_hits(self, line):
1237 if line.find('1st pass') != -1: 1238 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1239 self._params.num_hits = _safe_int(x) 1240 else: 1241 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1242 self._params.num_hits = _safe_int(x)
1243
1244 - def num_sequences(self, line):
1245 if line.find('1st pass') != -1: 1246 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1247 self._params.num_sequences = _safe_int(x) 1248 else: 1249 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1250 self._params.num_sequences = _safe_int(x)
1251
1252 - def num_extends(self, line):
1253 if line.find('1st pass') != -1: 1254 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1255 self._params.num_extends = _safe_int(x) 1256 else: 1257 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1258 self._params.num_extends = _safe_int(x)
1259
1260 - def num_good_extends(self, line):
1261 if line.find('1st pass') != -1: 1262 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1263 self._params.num_good_extends = _safe_int(x) 1264 else: 1265 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1266 self._params.num_good_extends = _safe_int(x)
1267
1268 - def num_seqs_better_e(self, line):
1269 self._params.num_seqs_better_e, = _get_cols( 1270 line, (-1,), ncols=7, expected={2:"sequences"}) 1271 self._params.num_seqs_better_e = _safe_int( 1272 self._params.num_seqs_better_e)
1273
1274 - def hsps_no_gap(self, line):
1275 self._params.hsps_no_gap, = _get_cols( 1276 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1277 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1278
1279 - def hsps_prelim_gapped(self, line):
1280 self._params.hsps_prelim_gapped, = _get_cols( 1281 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1282 self._params.hsps_prelim_gapped = _safe_int( 1283 self._params.hsps_prelim_gapped)
1284
1285 - def hsps_prelim_gapped_attempted(self, line):
1286 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1287 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1288 self._params.hsps_prelim_gapped_attempted = _safe_int( 1289 self._params.hsps_prelim_gapped_attempted)
1290
1291 - def hsps_gapped(self, line):
1292 self._params.hsps_gapped, = _get_cols( 1293 line, (-1,), ncols=6, expected={3:"gapped"}) 1294 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1295
1296 - def query_length(self, line):
1297 self._params.query_length, = _get_cols( 1298 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1299 self._params.query_length = _safe_int(self._params.query_length)
1300
1301 - def database_length(self, line):
1302 self._params.database_length, = _get_cols( 1303 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1304 self._params.database_length = _safe_int(self._params.database_length)
1305
1306 - def effective_hsp_length(self, line):
1307 self._params.effective_hsp_length, = _get_cols( 1308 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1309 self._params.effective_hsp_length = _safe_int( 1310 self._params.effective_hsp_length)
1311
1312 - def effective_query_length(self, line):
1313 self._params.effective_query_length, = _get_cols( 1314 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1315 self._params.effective_query_length = _safe_int( 1316 self._params.effective_query_length)
1317
1318 - def effective_database_length(self, line):
1319 self._params.effective_database_length, = _get_cols( 1320 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1321 self._params.effective_database_length = _safe_int( 1322 self._params.effective_database_length)
1323
1324 - def effective_search_space(self, line):
1325 self._params.effective_search_space, = _get_cols( 1326 line, (-1,), ncols=4, expected={1:"search"}) 1327 self._params.effective_search_space = _safe_int( 1328 self._params.effective_search_space)
1329
1330 - def effective_search_space_used(self, line):
1331 self._params.effective_search_space_used, = _get_cols( 1332 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1333 self._params.effective_search_space_used = _safe_int( 1334 self._params.effective_search_space_used)
1335
1336 - def frameshift(self, line):
1337 self._params.frameshift = _get_cols( 1338 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1339
1340 - def threshold(self, line):
1341 if line[:2] == "T:" : 1342 #Assume its an old stlye line like "T: 123" 1343 self._params.threshold, = _get_cols( 1344 line, (1,), ncols=2, expected={0:"T:"}) 1345 elif line[:28] == "Neighboring words threshold:" : 1346 self._params.threshold, = _get_cols( 1347 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1348 else : 1349 raise ValueError("Unrecognised threshold line:\n%s" % line) 1350 self._params.threshold = _safe_int(self._params.threshold)
1351
1352 - def window_size(self, line):
1353 if line[:2] == "A:" : 1354 self._params.window_size, = _get_cols( 1355 line, (1,), ncols=2, expected={0:"A:"}) 1356 elif line[:25] == "Window for multiple hits:" : 1357 self._params.window_size, = _get_cols( 1358 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1359 else : 1360 raise ValueError("Unrecognised window size line:\n%s" % line) 1361 self._params.window_size = _safe_int(self._params.window_size)
1362
1363 - def dropoff_1st_pass(self, line):
1364 score, bits = _re_search( 1365 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1366 "I could not find the dropoff in line\n%s" % line) 1367 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1368
1369 - def gap_x_dropoff(self, line):
1370 score, bits = _re_search( 1371 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1372 "I could not find the gap dropoff in line\n%s" % line) 1373 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1374
1375 - def gap_x_dropoff_final(self, line):
1376 score, bits = _re_search( 1377 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1378 "I could not find the gap dropoff final in line\n%s" % line) 1379 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1380
1381 - def gap_trigger(self, line):
1382 score, bits = _re_search( 1383 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1384 "I could not find the gap trigger in line\n%s" % line) 1385 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1386
1387 - def blast_cutoff(self, line):
1388 score, bits = _re_search( 1389 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1390 "I could not find the blast cutoff in line\n%s" % line) 1391 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1392
1393 - def end_parameters(self):
1394 pass
1395 1396
1397 -class _BlastConsumer(AbstractConsumer, 1398 _HeaderConsumer, 1399 _DescriptionConsumer, 1400 _AlignmentConsumer, 1401 _HSPConsumer, 1402 _DatabaseReportConsumer, 1403 _ParametersConsumer 1404 ):
1405 # This Consumer is inherits from many other consumer classes that handle 1406 # the actual dirty work. An alternate way to do it is to create objects 1407 # of those classes and then delegate the parsing tasks to them in a 1408 # decorator-type pattern. The disadvantage of that is that the method 1409 # names will need to be resolved in this classes. However, using 1410 # a decorator will retain more control in this class (which may or 1411 # may not be a bad thing). In addition, having each sub-consumer as 1412 # its own object prevents this object's dictionary from being cluttered 1413 # with members and reduces the chance of member collisions.
1414 - def __init__(self):
1415 self.data = None
1416
1417 - def round(self, line):
1418 # Make sure nobody's trying to pass me PSI-BLAST data! 1419 raise ValueError, \ 1420 "This consumer doesn't handle PSI-BLAST data"
1421
1422 - def start_header(self):
1423 self.data = Record.Blast() 1424 _HeaderConsumer.start_header(self)
1425
1426 - def end_header(self):
1427 _HeaderConsumer.end_header(self) 1428 self.data.__dict__.update(self._header.__dict__)
1429
1430 - def end_descriptions(self):
1431 self.data.descriptions = self._descriptions
1432
1433 - def end_alignment(self):
1434 _AlignmentConsumer.end_alignment(self) 1435 if self._alignment.hsps: 1436 self.data.alignments.append(self._alignment) 1437 if self._multiple_alignment.alignment: 1438 self.data.multiple_alignment = self._multiple_alignment
1439
1440 - def end_hsp(self):
1441 _HSPConsumer.end_hsp(self) 1442 try: 1443 self._alignment.hsps.append(self._hsp) 1444 except AttributeError: 1445 raise ValueError, "Found an HSP before an alignment"
1446
1447 - def end_database_report(self):
1448 _DatabaseReportConsumer.end_database_report(self) 1449 self.data.__dict__.update(self._dr.__dict__)
1450
1451 - def end_parameters(self):
1452 _ParametersConsumer.end_parameters(self) 1453 self.data.__dict__.update(self._params.__dict__)
1454
1455 -class _PSIBlastConsumer(AbstractConsumer, 1456 _HeaderConsumer, 1457 _DescriptionConsumer, 1458 _AlignmentConsumer, 1459 _HSPConsumer, 1460 _DatabaseReportConsumer, 1461 _ParametersConsumer 1462 ):
1463 - def __init__(self):
1464 self.data = None
1465
1466 - def start_header(self):
1467 self.data = Record.PSIBlast() 1468 _HeaderConsumer.start_header(self)
1469
1470 - def end_header(self):
1471 _HeaderConsumer.end_header(self) 1472 self.data.__dict__.update(self._header.__dict__)
1473
1474 - def start_descriptions(self):
1475 self._round = Record.Round() 1476 self.data.rounds.append(self._round) 1477 _DescriptionConsumer.start_descriptions(self)
1478
1479 - def end_descriptions(self):
1480 _DescriptionConsumer.end_descriptions(self) 1481 self._round.number = self._roundnum 1482 if self._descriptions: 1483 self._round.new_seqs.extend(self._descriptions) 1484 self._round.reused_seqs.extend(self._model_sequences) 1485 self._round.new_seqs.extend(self._nonmodel_sequences) 1486 if self._converged: 1487 self.data.converged = 1
1488
1489 - def end_alignment(self):
1490 _AlignmentConsumer.end_alignment(self) 1491 if self._alignment.hsps: 1492 self._round.alignments.append(self._alignment) 1493 if self._multiple_alignment: 1494 self._round.multiple_alignment = self._multiple_alignment
1495
1496 - def end_hsp(self):
1497 _HSPConsumer.end_hsp(self) 1498 try: 1499 self._alignment.hsps.append(self._hsp) 1500 except AttributeError: 1501 raise ValueError, "Found an HSP before an alignment"
1502
1503 - def end_database_report(self):
1504 _DatabaseReportConsumer.end_database_report(self) 1505 self.data.__dict__.update(self._dr.__dict__)
1506
1507 - def end_parameters(self):
1508 _ParametersConsumer.end_parameters(self) 1509 self.data.__dict__.update(self._params.__dict__)
1510
1511 -class Iterator:
1512 """Iterates over a file of multiple BLAST results. 1513 1514 Methods: 1515 next Return the next record from the stream, or None. 1516 1517 """
1518 - def __init__(self, handle, parser=None):
1519 """__init__(self, handle, parser=None) 1520 1521 Create a new iterator. handle is a file-like object. parser 1522 is an optional Parser object to change the results into another form. 1523 If set to None, then the raw contents of the file will be returned. 1524 1525 """ 1526 try: 1527 handle.readline 1528 except AttributeError: 1529 raise ValueError( 1530 "I expected a file handle or file-like object, got %s" 1531 % type(handle)) 1532 self._uhandle = File.UndoHandle(handle) 1533 self._parser = parser
1534
1535 - def next(self):
1536 """next(self) -> object 1537 1538 Return the next Blast record from the file. If no more records, 1539 return None. 1540 1541 """ 1542 lines = [] 1543 while 1: 1544 line = self._uhandle.readline() 1545 if not line: 1546 break 1547 # If I've reached the next one, then put the line back and stop. 1548 if lines and (line.startswith('BLAST') 1549 or line.startswith('BLAST', 1) 1550 or line.startswith('<?xml ')): 1551 self._uhandle.saveline(line) 1552 break 1553 lines.append(line) 1554 1555 if not lines: 1556 return None 1557 1558 data = ''.join(lines) 1559 if self._parser is not None: 1560 return self._parser.parse(File.StringHandle(data)) 1561 return data
1562
1563 - def __iter__(self):
1564 return iter(self.next, None)
1565
1566 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1567 """blastall(blastcmd, program, database, infile, align_view='7', **keywds) 1568 -> read, error Undohandles 1569 1570 Execute and retrieve data from blastall. blastcmd is the command 1571 used to launch the 'blastall' executable. program is the blast program 1572 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1573 to search against. infile is the path to the file containing 1574 the sequence to search with. 1575 1576 You may pass more parameters to **keywds to change the behavior of 1577 the search. Otherwise, optional values will be chosen by blastall. 1578 The Blast output is by default in XML format. Use the align_view keyword 1579 for output in a different format. 1580 1581 Scoring 1582 matrix Matrix to use. 1583 gap_open Gap open penalty. 1584 gap_extend Gap extension penalty. 1585 nuc_match Nucleotide match reward. (BLASTN) 1586 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1587 query_genetic_code Genetic code for Query. 1588 db_genetic_code Genetic code for database. (TBLAST[NX]) 1589 1590 Algorithm 1591 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1592 expectation Expectation value cutoff. 1593 wordsize Word size. 1594 strands Query strands to search against database.([T]BLAST[NX]) 1595 keep_hits Number of best hits from a region to keep. 1596 xdrop Dropoff value (bits) for gapped alignments. 1597 hit_extend Threshold for extending hits. 1598 region_length Length of region used to judge hits. 1599 db_length Effective database length. 1600 search_length Effective length of search space. 1601 1602 Processing 1603 filter Filter query sequence for low complexity (with SEG)? T/F 1604 believe_query Believe the query defline. T/F 1605 restrict_gi Restrict search to these GI's. 1606 nprocessors Number of processors to use. 1607 oldengine Force use of old engine T/F 1608 1609 Formatting 1610 html Produce HTML output? T/F 1611 descriptions Number of one-line descriptions. 1612 alignments Number of alignments. 1613 align_view Alignment view. Integer 0-11, 1614 passed as a string or integer. 1615 show_gi Show GI's in deflines? T/F 1616 seqalign_file seqalign file to output. 1617 1618 """ 1619 1620 _security_check_parameters(keywds) 1621 1622 1623 _security_check_parameters(keywds) 1624 1625 att2param = { 1626 'matrix' : '-M', 1627 'gap_open' : '-G', 1628 'gap_extend' : '-E', 1629 'nuc_match' : '-r', 1630 'nuc_mismatch' : '-q', 1631 'query_genetic_code' : '-Q', 1632 'db_genetic_code' : '-D', 1633 1634 'gapped' : '-g', 1635 'expectation' : '-e', 1636 'wordsize' : '-W', 1637 'strands' : '-S', 1638 'keep_hits' : '-K', 1639 'xdrop' : '-X', 1640 'hit_extend' : '-f', 1641 'region_length' : '-L', 1642 'db_length' : '-z', 1643 'search_length' : '-Y', 1644 1645 'program' : '-p', 1646 'database' : '-d', 1647 'infile' : '-i', 1648 'filter' : '-F', 1649 'believe_query' : '-J', 1650 'restrict_gi' : '-l', 1651 'nprocessors' : '-a', 1652 'oldengine' : '-V', 1653 1654 'html' : '-T', 1655 'descriptions' : '-v', 1656 'alignments' : '-b', 1657 'align_view' : '-m', 1658 'show_gi' : '-I', 1659 'seqalign_file' : '-O' 1660 } 1661 1662 if not os.path.exists(blastcmd): 1663 raise ValueError, "blastall does not exist at %s" % blastcmd 1664 1665 params = [] 1666 1667 params.extend([att2param['program'], program]) 1668 params.extend([att2param['database'], database]) 1669 params.extend([att2param['infile'], infile]) 1670 params.extend([att2param['align_view'], str(align_view)]) 1671 1672 for attr in keywds.keys(): 1673 params.extend([att2param[attr], str(keywds[attr])]) 1674 1675 w, r, e = os.popen3(" ".join([blastcmd] + params)) 1676 w.close() 1677 return File.UndoHandle(r), File.UndoHandle(e)
1678 1679
1680 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1681 """blastpgp(blastcmd, database, infile, align_view='7', **keywds) -> 1682 read, error Undohandles 1683 1684 Execute and retrieve data from blastpgp. blastcmd is the command 1685 used to launch the 'blastpgp' executable. database is the path to the 1686 database to search against. infile is the path to the file containing 1687 the sequence to search with. 1688 1689 You may pass more parameters to **keywds to change the behavior of 1690 the search. Otherwise, optional values will be chosen by blastpgp. 1691 The Blast output is by default in XML format. Use the align_view keyword 1692 for output in a different format. 1693 1694 Scoring 1695 matrix Matrix to use. 1696 gap_open Gap open penalty. 1697 gap_extend Gap extension penalty. 1698 window_size Multiple hits window size. 1699 npasses Number of passes. 1700 passes Hits/passes. Integer 0-2. 1701 1702 Algorithm 1703 gapped Whether to do a gapped alignment. T/F 1704 expectation Expectation value cutoff. 1705 wordsize Word size. 1706 keep_hits Number of beset hits from a region to keep. 1707 xdrop Dropoff value (bits) for gapped alignments. 1708 hit_extend Threshold for extending hits. 1709 region_length Length of region used to judge hits. 1710 db_length Effective database length. 1711 search_length Effective length of search space. 1712 nbits_gapping Number of bits to trigger gapping. 1713 pseudocounts Pseudocounts constants for multiple passes. 1714 xdrop_final X dropoff for final gapped alignment. 1715 xdrop_extension Dropoff for blast extensions. 1716 model_threshold E-value threshold to include in multipass model. 1717 required_start Start of required region in query. 1718 required_end End of required region in query. 1719 1720 Processing 1721 XXX should document default values 1722 program The blast program to use. (PHI-BLAST) 1723 filter Filter query sequence for low complexity (with SEG)? T/F 1724 believe_query Believe the query defline? T/F 1725 nprocessors Number of processors to use. 1726 1727 Formatting 1728 html Produce HTML output? T/F 1729 descriptions Number of one-line descriptions. 1730 alignments Number of alignments. 1731 align_view Alignment view. Integer 0-11, 1732 passed as a string or integer. 1733 show_gi Show GI's in deflines? T/F 1734 seqalign_file seqalign file to output. 1735 align_outfile Output file for alignment. 1736 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1737 restart_infile Input file for PSI-BLAST restart. 1738 hit_infile Hit file for PHI-BLAST. 1739 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1740 1741 _security_check_parameters(keywds) 1742 1743 align_infile Input alignment file for PSI-BLAST restart. 1744 1745 """ 1746 1747 _security_check_parameters(keywds) 1748 1749 att2param = { 1750 'matrix' : '-M', 1751 'gap_open' : '-G', 1752 'gap_extend' : '-E', 1753 'window_size' : '-A', 1754 'npasses' : '-j', 1755 'passes' : '-P', 1756 1757 'gapped' : '-g', 1758 'expectation' : '-e', 1759 'wordsize' : '-W', 1760 'keep_hits' : '-K', 1761 'xdrop' : '-X', 1762 'hit_extend' : '-f', 1763 'region_length' : '-L', 1764 'db_length' : '-Z', 1765 'search_length' : '-Y', 1766 'nbits_gapping' : '-N', 1767 'pseudocounts' : '-c', 1768 'xdrop_final' : '-Z', 1769 'xdrop_extension' : '-y', 1770 'model_threshold' : '-h', 1771 'required_start' : '-S', 1772 'required_end' : '-H', 1773 1774 'program' : '-p', 1775 'database' : '-d', 1776 'infile' : '-i', 1777 'filter' : '-F', 1778 'believe_query' : '-J', 1779 'nprocessors' : '-a', 1780 1781 'html' : '-T', 1782 'descriptions' : '-v', 1783 'alignments' : '-b', 1784 'align_view' : '-m', 1785 'show_gi' : '-I', 1786 'seqalign_file' : '-O', 1787 'align_outfile' : '-o', 1788 'checkpoint_outfile' : '-C', 1789 'restart_infile' : '-R', 1790 'hit_infile' : '-k', 1791 'matrix_outfile' : '-Q', 1792 'align_infile' : '-B' 1793 } 1794 1795 if not os.path.exists(blastcmd): 1796 raise ValueError, "blastpgp does not exist at %s" % blastcmd 1797 1798 params = [] 1799 1800 params.extend([att2param['database'], database]) 1801 params.extend([att2param['infile'], infile]) 1802 params.extend([att2param['align_view'], str(align_view)]) 1803 1804 for attr in keywds.keys(): 1805 params.extend([att2param[attr], str(keywds[attr])]) 1806 1807 w, r, e = os.popen3(" ".join([blastcmd] + params)) 1808 w.close() 1809 return File.UndoHandle(r), File.UndoHandle(e)
1810 1811
1812 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1813 """rpsblast(blastcmd, database, infile, **keywds) -> 1814 read, error Undohandles 1815 1816 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1817 command used to launch the 'rpsblast' executable. database is the path 1818 to the database to search against. infile is the path to the file 1819 containing the sequence to search with. 1820 1821 You may pass more parameters to **keywds to change the behavior of 1822 the search. Otherwise, optional values will be chosen by rpsblast. 1823 1824 Please note that this function will give XML output by default, by 1825 setting align_view to seven (i.e. command line option -m 7). 1826 You should use the NCBIXML.parse() function to read the resulting output. 1827 This is because NCBIStandalone.BlastParser() does not understand the 1828 plain text output format from rpsblast. 1829 1830 WARNING - The following text and associated parameter handling has not 1831 received extensive testing. Please report any errors we might have made... 1832 1833 Algorithm/Scoring 1834 gapped Whether to do a gapped alignment. T/F 1835 multihit 0 for multiple hit (default), 1 for single hit 1836 expectation Expectation value cutoff. 1837 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1838 0 in 'start' refers to the beginning of the sequence 1839 0 in 'stop' refers to the end of the sequence 1840 Default = 0,0 1841 xdrop Dropoff value (bits) for gapped alignments. 1842 xdrop_final X dropoff for final gapped alignment (in bits). 1843 xdrop_extension Dropoff for blast extensions (in bits). 1844 search_length Effective length of search space. 1845 nbits_gapping Number of bits to trigger gapping. 1846 protein Query sequence is protein. T/F 1847 db_length Effective database length. 1848 1849 Processing 1850 filter Filter query sequence for low complexity? T/F 1851 case_filter Use lower case filtering of FASTA sequence T/F, default F 1852 believe_query Believe the query defline. T/F 1853 nprocessors Number of processors to use. 1854 logfile Name of log file to use, default rpsblast.log 1855 1856 Formatting 1857 html Produce HTML output? T/F 1858 descriptions Number of one-line descriptions. 1859 alignments Number of alignments. 1860 align_view Alignment view. Integer 0-11, 1861 1862 _security_check_parameters(keywds) 1863 1864 passed as a string or integer. 1865 show_gi Show GI's in deflines? T/F 1866 seqalign_file seqalign file to output. 1867 align_outfile Output file for alignment. 1868 1869 """ 1870 1871 _security_check_parameters(keywds) 1872 1873 att2param = { 1874 'multihit' : '-P', 1875 'gapped' : '-g', 1876 'expectation' : '-e', 1877 'range_restriction' : '-L', 1878 'xdrop' : '-X', 1879 'xdrop_final' : '-Z', 1880 'xdrop_extension' : '-y', 1881 'search_length' : '-Y', 1882 'nbits_gapping' : '-N', 1883 'protein' : '-p', 1884 'db_length' : '-z', 1885 1886 'database' : '-d', 1887 'infile' : '-i', 1888 'filter' : '-F', 1889 'case_filter' : '-U', 1890 'believe_query' : '-J', 1891 'nprocessors' : '-a', 1892 'logfile' : '-l', 1893 1894 'html' : '-T', 1895 'descriptions' : '-v', 1896 'alignments' : '-b', 1897 'align_view' : '-m', 1898 'show_gi' : '-I', 1899 'seqalign_file' : '-O', 1900 'align_outfile' : '-o' 1901 } 1902 1903 if not os.path.exists(blastcmd): 1904 raise ValueError, "rpsblast does not exist at %s" % blastcmd 1905 1906 params = [] 1907 1908 params.extend([att2param['database'], database]) 1909 params.extend([att2param['infile'], infile]) 1910 params.extend([att2param['align_view'], str(align_view)]) 1911 1912 for attr in keywds.keys(): 1913 params.extend([att2param[attr], str(keywds[attr])]) 1914 1915 w, r, e = os.popen3(" ".join([blastcmd] + params)) 1916 w.close() 1917 return File.UndoHandle(r), File.UndoHandle(e)
1918
1919 -def _re_search(regex, line, error_msg):
1920 m = re.search(regex, line) 1921 if not m: 1922 raise ValueError, error_msg 1923 return m.groups()
1924
1925 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1926 cols = line.split() 1927 1928 # Check to make sure number of columns is correct 1929 if ncols is not None and len(cols) != ncols: 1930 raise ValueError, "I expected %d columns (got %d) in line\n%s" % \ 1931 (ncols, len(cols), line) 1932 1933 # Check to make sure columns contain the correct data 1934 for k in expected.keys(): 1935 if cols[k] != expected[k]: 1936 raise ValueError, "I expected '%s' in column %d in line\n%s" % ( 1937 expected[k], k, line) 1938 1939 # Construct the answer tuple 1940 results = [] 1941 for c in cols_to_get: 1942 results.append(cols[c]) 1943 return tuple(results)
1944
1945 -def _safe_int(str):
1946 try: 1947 return int(str) 1948 except ValueError: 1949 # Something went wrong. Try to clean up the string. 1950 # Remove all commas from the string 1951 str = str.replace(',', '') 1952 try: 1953 # try again. 1954 return int(str) 1955 except ValueError: 1956 pass 1957 # If it fails again, maybe it's too long? 1958 # XXX why converting to float? 1959 return long(float(str))
1960
1961 -def _safe_float(str):
1962 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1963 # float('e-172') does not produce an error on his platform. Thus, 1964 # we need to check the string for this condition. 1965 1966 # Sometimes BLAST leaves of the '1' in front of an exponent. 1967 if str and str[0] in ['E', 'e']: 1968 str = '1' + str 1969 try: 1970 return float(str) 1971 except ValueError: 1972 # Remove all commas from the string 1973 str = str.replace(',', '') 1974 # try again. 1975 return float(str)
1976
1977 -def _security_check_parameters(param_dict) :
1978 """Look for any attempt to insert a command into a parameter. 1979 1980 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd') 1981 1982 Looks for ";" or "&&" in the strings (Unix and Windows syntax 1983 for appending a command line), and if found raises an exception. 1984 """ 1985 for key, value in param_dict.iteritems() : 1986 if ";" in value or "&&" in value : 1987 raise ValueError("Rejecting suspicious argument for %s" % key)
1988 1989
1990 -class _BlastErrorConsumer(_BlastConsumer):
1991 - def __init__(self):
1993 - def noevent(self, line):
1994 if line.find("Query must be at least wordsize") != -1: 1995 raise ShortQueryBlastError, "Query must be at least wordsize" 1996 # Now pass the line back up to the superclass. 1997 method = getattr(_BlastConsumer, 'noevent', 1998 _BlastConsumer.__getattr__(self, 'noevent')) 1999 method(line)
2000
2001 -class BlastErrorParser(AbstractParser):
2002 """Attempt to catch and diagnose BLAST errors while parsing. 2003 2004 This utilizes the BlastParser module but adds an additional layer 2005 of complexity on top of it by attempting to diagnose ValueErrors 2006 that may actually indicate problems during BLAST parsing. 2007 2008 Current BLAST problems this detects are: 2009 o LowQualityBlastError - When BLASTing really low quality sequences 2010 (ie. some GenBank entries which are just short streches of a single 2011 nucleotide), BLAST will report an error with the sequence and be 2012 unable to search with this. This will lead to a badly formatted 2013 BLAST report that the parsers choke on. The parser will convert the 2014 ValueError to a LowQualityBlastError and attempt to provide useful 2015 information. 2016 2017 """
2018 - def __init__(self, bad_report_handle = None):
2019 """Initialize a parser that tries to catch BlastErrors. 2020 2021 Arguments: 2022 o bad_report_handle - An optional argument specifying a handle 2023 where bad reports should be sent. This would allow you to save 2024 all of the bad reports to a file, for instance. If no handle 2025 is specified, the bad reports will not be saved. 2026 """ 2027 self._bad_report_handle = bad_report_handle 2028 2029 #self._b_parser = BlastParser() 2030 self._scanner = _Scanner() 2031 self._consumer = _BlastErrorConsumer()
2032
2033 - def parse(self, handle):
2034 """Parse a handle, attempting to diagnose errors. 2035 """ 2036 results = handle.read() 2037 2038 try: 2039 self._scanner.feed(File.StringHandle(results), self._consumer) 2040 except ValueError, msg: 2041 # if we have a bad_report_file, save the info to it first 2042 if self._bad_report_handle: 2043 # send the info to the error handle 2044 self._bad_report_handle.write(results) 2045 2046 # now we want to try and diagnose the error 2047 self._diagnose_error( 2048 File.StringHandle(results), self._consumer.data) 2049 2050 # if we got here we can't figure out the problem 2051 # so we should pass along the syntax error we got 2052 raise 2053 return self._consumer.data
2054
2055 - def _diagnose_error(self, handle, data_record):
2056 """Attempt to diagnose an error in the passed handle. 2057 2058 Arguments: 2059 o handle - The handle potentially containing the error 2060 o data_record - The data record partially created by the consumer. 2061 """ 2062 line = handle.readline() 2063 2064 while line: 2065 # 'Searchingdone' instead of 'Searching......done' seems 2066 # to indicate a failure to perform the BLAST due to 2067 # low quality sequence 2068 if line.startswith('Searchingdone'): 2069 raise LowQualityBlastError("Blast failure occured on query: ", 2070 data_record.query) 2071 line = handle.readline()
2072