Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006-2009 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """This module provides code to work with the BLAST XML output 
  8  following the DTD available on the NCBI FTP 
  9  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 10   
 11  Classes: 
 12  BlastParser         Parses XML output from BLAST (direct use discouraged). 
 13                      This (now) returns a list of Blast records. 
 14                      Historically it returned a single Blast record. 
 15                      You are expected to use this via the parse function. 
 16   
 17  _XMLParser          Generic SAX parser (private). 
 18   
 19  Functions: 
 20  parse               Incremental parser, this is an iterator that returns 
 21                      Blast records.  It uses the BlastParser internally. 
 22  """ 
 23  from Bio.Blast import Record 
 24  import xml.sax 
 25  from xml.sax.handler import ContentHandler 
 26   
27 -class _XMLparser(ContentHandler):
28 """Generic SAX Parser 29 30 Just a very basic SAX parser. 31 32 Redefine the methods startElement, characters and endElement. 33 """
34 - def __init__(self, debug=0):
35 """Constructor 36 37 debug - integer, amount of debug information to print 38 """ 39 self._tag = [] 40 self._value = '' 41 self._debug = debug 42 self._debug_ignore_list = []
43
44 - def _secure_name(self, name):
45 """Removes 'dangerous' from tag names 46 47 name -- name to be 'secured' 48 """ 49 # Replace '-' with '_' in XML tag names 50 return name.replace('-', '_')
51
52 - def startElement(self, name, attr):
53 """Found XML start tag 54 55 No real need of attr, BLAST DTD doesn't use them 56 57 name -- name of the tag 58 59 attr -- tag attributes 60 """ 61 self._tag.append(name) 62 63 # Try to call a method (defined in subclasses) 64 method = self._secure_name('_start_' + name) 65 66 #Note could use try / except AttributeError 67 #BUT I found often triggered by nested errors... 68 if hasattr(self, method) : 69 eval("self.%s()" % method) 70 if self._debug > 4 : 71 print "NCBIXML: Parsed: " + method 72 else : 73 # Doesn't exist (yet) 74 if method not in self._debug_ignore_list : 75 if self._debug > 3 : 76 print "NCBIXML: Ignored: " + method 77 self._debug_ignore_list.append(method)
78
79 - def characters(self, ch):
80 """Found some text 81 82 ch -- characters read 83 """ 84 self._value += ch # You don't ever get the whole string
85
86 - def endElement(self, name):
87 """Found XML end tag 88 89 name -- tag name 90 """ 91 # Strip character buffer 92 self._value = self._value.strip() 93 94 # Try to call a method (defined in subclasses) 95 method = self._secure_name('_end_' + name) 96 #Note could use try / except AttributeError 97 #BUT I found often triggered by nested errors... 98 if hasattr(self, method) : 99 eval("self.%s()" % method) 100 if self._debug > 2 : 101 print "NCBIXML: Parsed: " + method, self._value 102 else : 103 # Doesn't exist (yet) 104 if method not in self._debug_ignore_list : 105 if self._debug > 1 : 106 print "NCBIXML: Ignored: " + method, self._value 107 self._debug_ignore_list.append(method) 108 109 # Reset character buffer 110 self._value = ''
111
112 -class BlastParser(_XMLparser):
113 """Parse XML BLAST data into a Record.Blast object 114 115 All XML 'action' methods are private methods and may be: 116 _start_TAG called when the start tag is found 117 _end_TAG called when the end tag is found 118 """ 119
120 - def __init__(self, debug=0):
121 """Constructor 122 123 debug - integer, amount of debug information to print 124 """ 125 # Calling superclass method 126 _XMLparser.__init__(self, debug) 127 128 self._parser = xml.sax.make_parser() 129 self._parser.setContentHandler(self) 130 131 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 132 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 133 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 134 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 135 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 136 137 self.reset()
138
139 - def reset(self) :
140 """Reset all the data allowing reuse of the BlastParser() object""" 141 self._records = [] 142 self._header = Record.Header() 143 self._parameters = Record.Parameters() 144 self._parameters.filter = None #Maybe I should update the class?
145
146 - def _start_Iteration(self):
147 self._blast = Record.Blast() 148 pass
149
150 - def _end_Iteration(self):
151 # We stored a lot of generic "top level" information 152 # in self._header (an object of type Record.Header) 153 self._blast.reference = self._header.reference 154 self._blast.date = self._header.date 155 self._blast.version = self._header.version 156 self._blast.database = self._header.database 157 self._blast.application = self._header.application 158 159 # These are required for "old" pre 2.2.14 files 160 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 161 # and <BlastOutput_query-len> were used. Now they 162 # are suplemented/replaced by <Iteration_query-ID>, 163 # <Iteration_query-def> and <Iteration_query-len> 164 if not hasattr(self._blast, "query") \ 165 or not self._blast.query : 166 self._blast.query = self._header.query 167 if not hasattr(self._blast, "query_id") \ 168 or not self._blast.query_id : 169 self._blast.query_id = self._header.query_id 170 if not hasattr(self._blast, "query_letters") \ 171 or not self._blast.query_letters : 172 self._blast.query_letters = self._header.query_letters 173 174 # Hack to record the query length as both the query_letters and 175 # query_length properties (as in the plain text parser, see 176 # Bug 2176 comment 12): 177 self._blast.query_length = self._blast.query_letters 178 # Perhaps in the long term we should deprecate one, but I would 179 # prefer to drop query_letters - so we need a transition period 180 # with both. 181 182 # Hack to record the claimed database size as database_length 183 # (as well as in num_letters_in_database, see Bug 2176 comment 13): 184 self._blast.database_length = self._blast.num_letters_in_database 185 # TODO? Deprecate database_letters next? 186 187 # Apply the "top level" parameter information 188 self._blast.matrix = self._parameters.matrix 189 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 190 self._blast.gap_penalties = self._parameters.gap_penalties 191 self._blast.filter = self._parameters.filter 192 self._blast.expect = self._parameters.expect 193 self._blast.sc_match = self._parameters.sc_match 194 self._blast.sc_mismatch = self._parameters.sc_mismatch 195 196 #Add to the list 197 self._records.append(self._blast) 198 #Clear the object (a new empty one is create in _start_Iteration) 199 self._blast = None 200 201 if self._debug : "NCBIXML: Added Blast record to results"
202 203 # Header
204 - def _end_BlastOutput_program(self):
205 """BLAST program, e.g., blastp, blastn, etc. 206 207 Save this to put on each blast record object 208 """ 209 self._header.application = self._value.upper()
210
211 - def _end_BlastOutput_version(self):
212 """version number and date of the BLAST engine. 213 214 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be 215 variants like "BLASTP 2.2.18+" without the date. 216 217 Save this to put on each blast record object 218 """ 219 parts = self._value.split() 220 #TODO - Check the first word starts with BLAST? 221 222 #The version is the second word (field one) 223 self._header.version = parts[1] 224 225 #Check there is a third word (the date) 226 if len(parts) >= 3 : 227 if parts[2][0] == "[" and parts[2][-1] == "]" : 228 self._header.date = parts[2][1:-1] 229 else : 230 #Assume this is still a date, but without the 231 #square brackets 232 self._header.date = parts[2]
233
235 """a reference to the article describing the algorithm 236 237 Save this to put on each blast record object 238 """ 239 self._header.reference = self._value
240
241 - def _end_BlastOutput_db(self):
242 """the database(s) searched 243 244 Save this to put on each blast record object 245 """ 246 self._header.database = self._value
247
249 """the identifier of the query 250 251 Important in old pre 2.2.14 BLAST, for recent versions 252 <Iteration_query-ID> is enough 253 """ 254 self._header.query_id = self._value
255
257 """the definition line of the query 258 259 Important in old pre 2.2.14 BLAST, for recent versions 260 <Iteration_query-def> is enough 261 """ 262 self._header.query = self._value
263
265 """the length of the query 266 267 Important in old pre 2.2.14 BLAST, for recent versions 268 <Iteration_query-len> is enough 269 """ 270 self._header.query_letters = int(self._value)
271
272 - def _end_Iteration_query_ID(self):
273 """the identifier of the query 274 """ 275 self._blast.query_id = self._value
276
277 - def _end_Iteration_query_def(self):
278 """the definition line of the query 279 """ 280 self._blast.query = self._value
281
282 - def _end_Iteration_query_len(self):
283 """the length of the query 284 """ 285 self._blast.query_letters = int(self._value)
286 287 ## def _end_BlastOutput_query_seq(self): 288 ## """the query sequence 289 ## """ 290 ## pass # XXX Missing in Record.Blast ? 291 292 ## def _end_BlastOutput_iter_num(self): 293 ## """the psi-blast iteration number 294 ## """ 295 ## pass # XXX TODO PSI 296
297 - def _end_BlastOutput_hits(self):
298 """hits to the database sequences, one for every sequence 299 """ 300 self._blast.num_hits = int(self._value)
301 302 ## def _end_BlastOutput_message(self): 303 ## """error messages 304 ## """ 305 ## pass # XXX What to do ? 306 307 # Parameters
308 - def _end_Parameters_matrix(self):
309 """matrix used (-M) 310 """ 311 self._parameters.matrix = self._value
312
313 - def _end_Parameters_expect(self):
314 """expect values cutoff (-e) 315 """ 316 # NOTE: In old text output there was a line: 317 # Number of sequences better than 1.0e-004: 1 318 # As far as I can see, parameters.num_seqs_better_e 319 # would take the value of 1, and the expectation 320 # value was not recorded. 321 # 322 # Anyway we should NOT record this against num_seqs_better_e 323 self._parameters.expect = self._value
324 325 ## def _end_Parameters_include(self): 326 ## """inclusion threshold for a psi-blast iteration (-h) 327 ## """ 328 ## pass # XXX TODO PSI 329
330 - def _end_Parameters_sc_match(self):
331 """match score for nucleotide-nucleotide comparaison (-r) 332 """ 333 self._parameters.sc_match = int(self._value)
334
336 """mismatch penalty for nucleotide-nucleotide comparaison (-r) 337 """ 338 self._parameters.sc_mismatch = int(self._value)
339
340 - def _end_Parameters_gap_open(self):
341 """gap existence cost (-G) 342 """ 343 self._parameters.gap_penalties = int(self._value)
344
346 """gap extension cose (-E) 347 """ 348 self._parameters.gap_penalties = (self._parameters.gap_penalties, 349 int(self._value))
350
351 - def _end_Parameters_filter(self):
352 """filtering options (-F) 353 """ 354 self._parameters.filter = self._value
355 356 ## def _end_Parameters_pattern(self): 357 ## """pattern used for phi-blast search 358 ## """ 359 ## pass # XXX TODO PSI 360 361 ## def _end_Parameters_entrez_query(self): 362 ## """entrez query used to limit search 363 ## """ 364 ## pass # XXX TODO PSI 365 366 # Hits
367 - def _start_Hit(self):
368 self._blast.alignments.append(Record.Alignment()) 369 self._blast.descriptions.append(Record.Description()) 370 self._blast.multiple_alignment = [] 371 self._hit = self._blast.alignments[-1] 372 self._descr = self._blast.descriptions[-1] 373 self._descr.num_alignments = 0
374
375 - def _end_Hit(self):
376 #Cleanup 377 self._blast.multiple_alignment = None 378 self._hit = None 379 self._descr = None
380
381 - def _end_Hit_id(self):
382 """identifier of the database sequence 383 """ 384 self._hit.hit_id = self._value 385 self._hit.title = self._value + ' '
386
387 - def _end_Hit_def(self):
388 """definition line of the database sequence 389 """ 390 self._hit.hit_def = self._value 391 self._hit.title += self._value 392 self._descr.title = self._hit.title
393
394 - def _end_Hit_accession(self):
395 """accession of the database sequence 396 """ 397 self._hit.accession = self._value 398 self._descr.accession = self._value
399
400 - def _end_Hit_len(self):
401 self._hit.length = int(self._value)
402 403 # HSPs
404 - def _start_Hsp(self):
405 #Note that self._start_Hit() should have been called 406 #to setup things like self._blast.multiple_alignment 407 self._hit.hsps.append(Record.HSP()) 408 self._hsp = self._hit.hsps[-1] 409 self._descr.num_alignments += 1 410 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 411 self._mult_al = self._blast.multiple_alignment[-1]
412 413 # Hsp_num is useless
414 - def _end_Hsp_score(self):
415 """raw score of HSP 416 """ 417 self._hsp.score = float(self._value) 418 if self._descr.score == None: 419 self._descr.score = float(self._value)
420
421 - def _end_Hsp_bit_score(self):
422 """bit score of HSP 423 """ 424 self._hsp.bits = float(self._value) 425 if self._descr.bits == None: 426 self._descr.bits = float(self._value)
427
428 - def _end_Hsp_evalue(self):
429 """expect value value of the HSP 430 """ 431 self._hsp.expect = float(self._value) 432 if self._descr.e == None: 433 self._descr.e = float(self._value)
434
435 - def _end_Hsp_query_from(self):
436 """offset of query at the start of the alignment (one-offset) 437 """ 438 self._hsp.query_start = int(self._value)
439
440 - def _end_Hsp_query_to(self):
441 """offset of query at the end of the alignment (one-offset) 442 """ 443 self._hsp.query_end = int(self._value)
444
445 - def _end_Hsp_hit_from(self):
446 """offset of the database at the start of the alignment (one-offset) 447 """ 448 self._hsp.sbjct_start = int(self._value)
449
450 - def _end_Hsp_hit_to(self):
451 """offset of the database at the end of the alignment (one-offset) 452 """ 453 self._hsp.sbjct_end = int(self._value)
454 455 ## def _end_Hsp_pattern_from(self): 456 ## """start of phi-blast pattern on the query (one-offset) 457 ## """ 458 ## pass # XXX TODO PSI 459 460 ## def _end_Hsp_pattern_to(self): 461 ## """end of phi-blast pattern on the query (one-offset) 462 ## """ 463 ## pass # XXX TODO PSI 464
465 - def _end_Hsp_query_frame(self):
466 """frame of the query if applicable 467 """ 468 self._hsp.frame = (int(self._value),)
469
470 - def _end_Hsp_hit_frame(self):
471 """frame of the database sequence if applicable 472 """ 473 self._hsp.frame += (int(self._value),)
474
475 - def _end_Hsp_identity(self):
476 """number of identities in the alignment 477 """ 478 self._hsp.identities = int(self._value)
479
480 - def _end_Hsp_positive(self):
481 """number of positive (conservative) substitutions in the alignment 482 """ 483 self._hsp.positives = int(self._value)
484
485 - def _end_Hsp_gaps(self):
486 """number of gaps in the alignment 487 """ 488 self._hsp.gaps = int(self._value)
489
490 - def _end_Hsp_align_len(self):
491 """length of the alignment 492 """ 493 self._hsp.align_length = int(self._value)
494 495 ## def _en_Hsp_density(self): 496 ## """score density 497 ## """ 498 ## pass # XXX ??? 499
500 - def _end_Hsp_qseq(self):
501 """alignment string for the query 502 """ 503 self._hsp.query = self._value
504
505 - def _end_Hsp_hseq(self):
506 """alignment string for the database 507 """ 508 self._hsp.sbjct = self._value
509
510 - def _end_Hsp_midline(self):
511 """Formatting middle line as normally seen in BLAST report 512 """ 513 self._hsp.match = self._value
514 515 # Statistics
516 - def _end_Statistics_db_num(self):
517 """number of sequences in the database 518 """ 519 self._blast.num_sequences_in_database = int(self._value)
520
521 - def _end_Statistics_db_len(self):
522 """number of letters in the database 523 """ 524 self._blast.num_letters_in_database = int(self._value)
525
526 - def _end_Statistics_hsp_len(self):
527 """the effective HSP length 528 """ 529 self._blast.effective_hsp_length = int(self._value)
530
532 """the effective search space 533 """ 534 self._blast.effective_search_space = float(self._value)
535
536 - def _end_Statistics_kappa(self):
537 """Karlin-Altschul parameter K 538 """ 539 self._blast.ka_params = float(self._value)
540
541 - def _end_Statistics_lambda(self):
542 """Karlin-Altschul parameter Lambda 543 """ 544 self._blast.ka_params = (float(self._value), 545 self._blast.ka_params)
546
547 - def _end_Statistics_entropy(self):
548 """Karlin-Altschul parameter H 549 """ 550 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
551
552 -def read(handle, debug=0):
553 """Returns a single Blast record (assumes just one query). 554 555 This function is for use when there is one and only one BLAST 556 result in your XML file. 557 558 Use the Bio.Blast.NCBIXML.parse() function if you expect more than 559 one BLAST record (i.e. if you have more than one query sequence). 560 561 """ 562 iterator = parse(handle, debug) 563 try : 564 first = iterator.next() 565 except StopIteration : 566 first = None 567 if first is None : 568 raise ValueError("No records found in handle") 569 try : 570 second = iterator.next() 571 except StopIteration : 572 second = None 573 if second is not None : 574 raise ValueError("More than one record found in handle") 575 return first
576 577
578 -def parse(handle, debug=0):
579 """Returns an iterator a Blast record for each query. 580 581 handle - file handle to and XML file to parse 582 debug - integer, amount of debug information to print 583 584 This is a generator function that returns multiple Blast records 585 objects - one for each query sequence given to blast. The file 586 is read incrementally, returning complete records as they are read 587 in. 588 589 Should cope with new BLAST 2.2.14+ which gives a single XML file 590 for mutliple query records. 591 592 Should also cope with XML output from older versions BLAST which 593 gave multiple XML files concatenated together (giving a single file 594 which strictly speaking wasn't valid XML).""" 595 from xml.parsers import expat 596 BLOCK = 1024 597 MARGIN = 10 # must be at least length of newline + XML start 598 XML_START = "<?xml" 599 600 text = handle.read(BLOCK) 601 pending = "" 602 603 if not text : 604 #NO DATA FOUND! 605 raise ValueError("Your XML file was empty") 606 607 while text : 608 #We are now starting a new XML file 609 if not text.startswith(XML_START) : 610 raise ValueError("Your XML file did not start with %s..." \ 611 % XML_START) 612 613 expat_parser = expat.ParserCreate() 614 blast_parser = BlastParser(debug) 615 expat_parser.StartElementHandler = blast_parser.startElement 616 expat_parser.EndElementHandler = blast_parser.endElement 617 expat_parser.CharacterDataHandler = blast_parser.characters 618 619 expat_parser.Parse(text, False) 620 while blast_parser._records: 621 record = blast_parser._records[0] 622 blast_parser._records = blast_parser._records[1:] 623 yield record 624 625 while True : 626 #Read in another block of the file... 627 text, pending = pending + handle.read(BLOCK), "" 628 if not text: 629 #End of the file! 630 expat_parser.Parse("", True) # End of XML record 631 break 632 633 #Now read a little bit more so we can check for the 634 #start of another XML file... 635 pending = handle.read(MARGIN) 636 637 if (text+pending).find("\n" + XML_START) == -1 : 638 # Good - still dealing with the same XML file 639 expat_parser.Parse(text, False) 640 while blast_parser._records: 641 record = blast_parser._records[0] 642 blast_parser._records = blast_parser._records[1:] 643 yield record 644 else : 645 # This is output from pre 2.2.14 BLAST, 646 # one XML file for each query! 647 648 # Finish the old file: 649 text, pending = (text+pending).split("\n" + XML_START,1) 650 pending = XML_START + pending 651 652 expat_parser.Parse(text, True) # End of XML record 653 while blast_parser._records: 654 record = blast_parser._records[0] 655 blast_parser._records = blast_parser._records[1:] 656 yield record 657 658 #Now we are going to re-loop, reset the 659 #parsers and start reading the next XML file 660 text, pending = pending, "" 661 break 662 663 #At this point we have finished the first XML record. 664 #If the file is from an old version of blast, it may 665 #contain more XML records (check if text==""). 666 assert pending=="" 667 assert len(blast_parser._records) == 0 668 669 #We should have finished the file! 670 assert text=="" 671 assert pending=="" 672 assert len(blast_parser._records) == 0
673 674 if __name__ == '__main__': 675 import sys 676 import os 677 handle = open(sys.argv[1]) 678 r_list = parse(handle) 679 680 for r in r_list : 681 # Small test 682 print 'Blast of', r.query 683 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments), 684 reduce(lambda a,b: a+b, 685 [len(a.hsps) for a in r.alignments])) 686 687 for al in r.alignments: 688 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs' 689 690 # Cookbook example 691 E_VALUE_THRESH = 0.04 692 for alignment in r.alignments: 693 for hsp in alignment.hsps: 694 if hsp.expect < E_VALUE_THRESH: 695 print '*****' 696 print 'sequence', alignment.title 697 print 'length', alignment.length 698 print 'e value', hsp.expect 699 print hsp.query[:75] + '...' 700 print hsp.match[:75] + '...' 701 print hsp.sbjct[:75] + '...' 702