1
2
3
4
5
6
7 """This module provides code to work with the BLAST XML output
8 following the DTD available on the NCBI FTP
9 ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd
10
11 Classes:
12 BlastParser Parses XML output from BLAST (direct use discouraged).
13 This (now) returns a list of Blast records.
14 Historically it returned a single Blast record.
15 You are expected to use this via the parse function.
16
17 _XMLParser Generic SAX parser (private).
18
19 Functions:
20 parse Incremental parser, this is an iterator that returns
21 Blast records. It uses the BlastParser internally.
22 """
23 from Bio.Blast import Record
24 import xml.sax
25 from xml.sax.handler import ContentHandler
26
28 """Generic SAX Parser
29
30 Just a very basic SAX parser.
31
32 Redefine the methods startElement, characters and endElement.
33 """
35 """Constructor
36
37 debug - integer, amount of debug information to print
38 """
39 self._tag = []
40 self._value = ''
41 self._debug = debug
42 self._debug_ignore_list = []
43
45 """Removes 'dangerous' from tag names
46
47 name -- name to be 'secured'
48 """
49
50 return name.replace('-', '_')
51
53 """Found XML start tag
54
55 No real need of attr, BLAST DTD doesn't use them
56
57 name -- name of the tag
58
59 attr -- tag attributes
60 """
61 self._tag.append(name)
62
63
64 method = self._secure_name('_start_' + name)
65
66
67
68 if hasattr(self, method) :
69 eval("self.%s()" % method)
70 if self._debug > 4 :
71 print "NCBIXML: Parsed: " + method
72 else :
73
74 if method not in self._debug_ignore_list :
75 if self._debug > 3 :
76 print "NCBIXML: Ignored: " + method
77 self._debug_ignore_list.append(method)
78
80 """Found some text
81
82 ch -- characters read
83 """
84 self._value += ch
85
87 """Found XML end tag
88
89 name -- tag name
90 """
91
92 self._value = self._value.strip()
93
94
95 method = self._secure_name('_end_' + name)
96
97
98 if hasattr(self, method) :
99 eval("self.%s()" % method)
100 if self._debug > 2 :
101 print "NCBIXML: Parsed: " + method, self._value
102 else :
103
104 if method not in self._debug_ignore_list :
105 if self._debug > 1 :
106 print "NCBIXML: Ignored: " + method, self._value
107 self._debug_ignore_list.append(method)
108
109
110 self._value = ''
111
113 """Parse XML BLAST data into a Record.Blast object
114
115 All XML 'action' methods are private methods and may be:
116 _start_TAG called when the start tag is found
117 _end_TAG called when the end tag is found
118 """
119
121 """Constructor
122
123 debug - integer, amount of debug information to print
124 """
125
126 _XMLparser.__init__(self, debug)
127
128 self._parser = xml.sax.make_parser()
129 self._parser.setContentHandler(self)
130
131
132 self._parser.setFeature(xml.sax.handler.feature_validation, 0)
133 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0)
134 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0)
135 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0)
136
137 self.reset()
138
140 """Reset all the data allowing reuse of the BlastParser() object"""
141 self._records = []
142 self._header = Record.Header()
143 self._parameters = Record.Parameters()
144 self._parameters.filter = None
145
149
151
152
153 self._blast.reference = self._header.reference
154 self._blast.date = self._header.date
155 self._blast.version = self._header.version
156 self._blast.database = self._header.database
157 self._blast.application = self._header.application
158
159
160
161
162
163
164 if not hasattr(self._blast, "query") \
165 or not self._blast.query :
166 self._blast.query = self._header.query
167 if not hasattr(self._blast, "query_id") \
168 or not self._blast.query_id :
169 self._blast.query_id = self._header.query_id
170 if not hasattr(self._blast, "query_letters") \
171 or not self._blast.query_letters :
172 self._blast.query_letters = self._header.query_letters
173
174
175
176
177 self._blast.query_length = self._blast.query_letters
178
179
180
181
182
183
184 self._blast.database_length = self._blast.num_letters_in_database
185
186
187
188 self._blast.matrix = self._parameters.matrix
189 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e
190 self._blast.gap_penalties = self._parameters.gap_penalties
191 self._blast.filter = self._parameters.filter
192 self._blast.expect = self._parameters.expect
193 self._blast.sc_match = self._parameters.sc_match
194 self._blast.sc_mismatch = self._parameters.sc_mismatch
195
196
197 self._records.append(self._blast)
198
199 self._blast = None
200
201 if self._debug : "NCBIXML: Added Blast record to results"
202
203
205 """BLAST program, e.g., blastp, blastn, etc.
206
207 Save this to put on each blast record object
208 """
209 self._header.application = self._value.upper()
210
212 """version number and date of the BLAST engine.
213
214 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be
215 variants like "BLASTP 2.2.18+" without the date.
216
217 Save this to put on each blast record object
218 """
219 parts = self._value.split()
220
221
222
223 self._header.version = parts[1]
224
225
226 if len(parts) >= 3 :
227 if parts[2][0] == "[" and parts[2][-1] == "]" :
228 self._header.date = parts[2][1:-1]
229 else :
230
231
232 self._header.date = parts[2]
233
235 """a reference to the article describing the algorithm
236
237 Save this to put on each blast record object
238 """
239 self._header.reference = self._value
240
242 """the database(s) searched
243
244 Save this to put on each blast record object
245 """
246 self._header.database = self._value
247
249 """the identifier of the query
250
251 Important in old pre 2.2.14 BLAST, for recent versions
252 <Iteration_query-ID> is enough
253 """
254 self._header.query_id = self._value
255
257 """the definition line of the query
258
259 Important in old pre 2.2.14 BLAST, for recent versions
260 <Iteration_query-def> is enough
261 """
262 self._header.query = self._value
263
265 """the length of the query
266
267 Important in old pre 2.2.14 BLAST, for recent versions
268 <Iteration_query-len> is enough
269 """
270 self._header.query_letters = int(self._value)
271
273 """the identifier of the query
274 """
275 self._blast.query_id = self._value
276
278 """the definition line of the query
279 """
280 self._blast.query = self._value
281
283 """the length of the query
284 """
285 self._blast.query_letters = int(self._value)
286
287
288
289
290
291
292
293
294
295
296
298 """hits to the database sequences, one for every sequence
299 """
300 self._blast.num_hits = int(self._value)
301
302
303
304
305
306
307
309 """matrix used (-M)
310 """
311 self._parameters.matrix = self._value
312
314 """expect values cutoff (-e)
315 """
316
317
318
319
320
321
322
323 self._parameters.expect = self._value
324
325
326
327
328
329
331 """match score for nucleotide-nucleotide comparaison (-r)
332 """
333 self._parameters.sc_match = int(self._value)
334
336 """mismatch penalty for nucleotide-nucleotide comparaison (-r)
337 """
338 self._parameters.sc_mismatch = int(self._value)
339
341 """gap existence cost (-G)
342 """
343 self._parameters.gap_penalties = int(self._value)
344
346 """gap extension cose (-E)
347 """
348 self._parameters.gap_penalties = (self._parameters.gap_penalties,
349 int(self._value))
350
352 """filtering options (-F)
353 """
354 self._parameters.filter = self._value
355
356
357
358
359
360
361
362
363
364
365
366
368 self._blast.alignments.append(Record.Alignment())
369 self._blast.descriptions.append(Record.Description())
370 self._blast.multiple_alignment = []
371 self._hit = self._blast.alignments[-1]
372 self._descr = self._blast.descriptions[-1]
373 self._descr.num_alignments = 0
374
376
377 self._blast.multiple_alignment = None
378 self._hit = None
379 self._descr = None
380
382 """identifier of the database sequence
383 """
384 self._hit.hit_id = self._value
385 self._hit.title = self._value + ' '
386
388 """definition line of the database sequence
389 """
390 self._hit.hit_def = self._value
391 self._hit.title += self._value
392 self._descr.title = self._hit.title
393
395 """accession of the database sequence
396 """
397 self._hit.accession = self._value
398 self._descr.accession = self._value
399
401 self._hit.length = int(self._value)
402
403
405
406
407 self._hit.hsps.append(Record.HSP())
408 self._hsp = self._hit.hsps[-1]
409 self._descr.num_alignments += 1
410 self._blast.multiple_alignment.append(Record.MultipleAlignment())
411 self._mult_al = self._blast.multiple_alignment[-1]
412
413
415 """raw score of HSP
416 """
417 self._hsp.score = float(self._value)
418 if self._descr.score == None:
419 self._descr.score = float(self._value)
420
422 """bit score of HSP
423 """
424 self._hsp.bits = float(self._value)
425 if self._descr.bits == None:
426 self._descr.bits = float(self._value)
427
429 """expect value value of the HSP
430 """
431 self._hsp.expect = float(self._value)
432 if self._descr.e == None:
433 self._descr.e = float(self._value)
434
436 """offset of query at the start of the alignment (one-offset)
437 """
438 self._hsp.query_start = int(self._value)
439
441 """offset of query at the end of the alignment (one-offset)
442 """
443 self._hsp.query_end = int(self._value)
444
446 """offset of the database at the start of the alignment (one-offset)
447 """
448 self._hsp.sbjct_start = int(self._value)
449
451 """offset of the database at the end of the alignment (one-offset)
452 """
453 self._hsp.sbjct_end = int(self._value)
454
455
456
457
458
459
460
461
462
463
464
466 """frame of the query if applicable
467 """
468 self._hsp.frame = (int(self._value),)
469
471 """frame of the database sequence if applicable
472 """
473 self._hsp.frame += (int(self._value),)
474
476 """number of identities in the alignment
477 """
478 self._hsp.identities = int(self._value)
479
481 """number of positive (conservative) substitutions in the alignment
482 """
483 self._hsp.positives = int(self._value)
484
486 """number of gaps in the alignment
487 """
488 self._hsp.gaps = int(self._value)
489
491 """length of the alignment
492 """
493 self._hsp.align_length = int(self._value)
494
495
496
497
498
499
501 """alignment string for the query
502 """
503 self._hsp.query = self._value
504
506 """alignment string for the database
507 """
508 self._hsp.sbjct = self._value
509
511 """Formatting middle line as normally seen in BLAST report
512 """
513 self._hsp.match = self._value
514
515
520
525
530
535
537 """Karlin-Altschul parameter K
538 """
539 self._blast.ka_params = float(self._value)
540
542 """Karlin-Altschul parameter Lambda
543 """
544 self._blast.ka_params = (float(self._value),
545 self._blast.ka_params)
546
548 """Karlin-Altschul parameter H
549 """
550 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
551
552 -def read(handle, debug=0):
553 """Returns a single Blast record (assumes just one query).
554
555 This function is for use when there is one and only one BLAST
556 result in your XML file.
557
558 Use the Bio.Blast.NCBIXML.parse() function if you expect more than
559 one BLAST record (i.e. if you have more than one query sequence).
560
561 """
562 iterator = parse(handle, debug)
563 try :
564 first = iterator.next()
565 except StopIteration :
566 first = None
567 if first is None :
568 raise ValueError("No records found in handle")
569 try :
570 second = iterator.next()
571 except StopIteration :
572 second = None
573 if second is not None :
574 raise ValueError("More than one record found in handle")
575 return first
576
577
578 -def parse(handle, debug=0):
579 """Returns an iterator a Blast record for each query.
580
581 handle - file handle to and XML file to parse
582 debug - integer, amount of debug information to print
583
584 This is a generator function that returns multiple Blast records
585 objects - one for each query sequence given to blast. The file
586 is read incrementally, returning complete records as they are read
587 in.
588
589 Should cope with new BLAST 2.2.14+ which gives a single XML file
590 for mutliple query records.
591
592 Should also cope with XML output from older versions BLAST which
593 gave multiple XML files concatenated together (giving a single file
594 which strictly speaking wasn't valid XML)."""
595 from xml.parsers import expat
596 BLOCK = 1024
597 MARGIN = 10
598 XML_START = "<?xml"
599
600 text = handle.read(BLOCK)
601 pending = ""
602
603 if not text :
604
605 raise ValueError("Your XML file was empty")
606
607 while text :
608
609 if not text.startswith(XML_START) :
610 raise ValueError("Your XML file did not start with %s..." \
611 % XML_START)
612
613 expat_parser = expat.ParserCreate()
614 blast_parser = BlastParser(debug)
615 expat_parser.StartElementHandler = blast_parser.startElement
616 expat_parser.EndElementHandler = blast_parser.endElement
617 expat_parser.CharacterDataHandler = blast_parser.characters
618
619 expat_parser.Parse(text, False)
620 while blast_parser._records:
621 record = blast_parser._records[0]
622 blast_parser._records = blast_parser._records[1:]
623 yield record
624
625 while True :
626
627 text, pending = pending + handle.read(BLOCK), ""
628 if not text:
629
630 expat_parser.Parse("", True)
631 break
632
633
634
635 pending = handle.read(MARGIN)
636
637 if (text+pending).find("\n" + XML_START) == -1 :
638
639 expat_parser.Parse(text, False)
640 while blast_parser._records:
641 record = blast_parser._records[0]
642 blast_parser._records = blast_parser._records[1:]
643 yield record
644 else :
645
646
647
648
649 text, pending = (text+pending).split("\n" + XML_START,1)
650 pending = XML_START + pending
651
652 expat_parser.Parse(text, True)
653 while blast_parser._records:
654 record = blast_parser._records[0]
655 blast_parser._records = blast_parser._records[1:]
656 yield record
657
658
659
660 text, pending = pending, ""
661 break
662
663
664
665
666 assert pending==""
667 assert len(blast_parser._records) == 0
668
669
670 assert text==""
671 assert pending==""
672 assert len(blast_parser._records) == 0
673
674 if __name__ == '__main__':
675 import sys
676 import os
677 handle = open(sys.argv[1])
678 r_list = parse(handle)
679
680 for r in r_list :
681
682 print 'Blast of', r.query
683 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments),
684 reduce(lambda a,b: a+b,
685 [len(a.hsps) for a in r.alignments]))
686
687 for al in r.alignments:
688 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs'
689
690
691 E_VALUE_THRESH = 0.04
692 for alignment in r.alignments:
693 for hsp in alignment.hsps:
694 if hsp.expect < E_VALUE_THRESH:
695 print '*****'
696 print 'sequence', alignment.title
697 print 'length', alignment.length
698 print 'e value', hsp.expect
699 print hsp.query[:75] + '...'
700 print hsp.match[:75] + '...'
701 print hsp.sbjct[:75] + '...'
702