1
2
3
4
5
6
7 """This module provides code to work with the BLAST XML output
8 following the DTD available on the NCBI FTP
9 ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd
10
11 Classes:
12 BlastParser Parses XML output from BLAST (direct use discouraged).
13 This (now) returns a list of Blast records.
14 Historically it returned a single Blast record.
15 You are expected to use this via the parse function.
16
17 _XMLParser Generic SAX parser (private).
18
19 Functions:
20 parse Incremental parser, this is an iterator that returns
21 Blast records. It uses the BlastParser internally.
22 """
23 from Bio.Blast import Record
24 import xml.sax
25 from xml.sax.handler import ContentHandler
26
28 """Generic SAX Parser
29
30 Just a very basic SAX parser.
31
32 Redefine the methods startElement, characters and endElement.
33 """
35 """Constructor
36
37 debug - integer, amount of debug information to print
38 """
39 self._tag = []
40 self._value = ''
41 self._debug = debug
42 self._debug_ignore_list = []
43
45 """Removes 'dangerous' from tag names
46
47 name -- name to be 'secured'
48 """
49
50 return name.replace('-', '_')
51
53 """Found XML start tag
54
55 No real need of attr, BLAST DTD doesn't use them
56
57 name -- name of the tag
58
59 attr -- tag attributes
60 """
61 self._tag.append(name)
62
63
64 method = self._secure_name('_start_' + name)
65
66
67
68 if hasattr(self, method) :
69 eval("self.%s()" % method)
70 if self._debug > 4 :
71 print "NCBIXML: Parsed: " + method
72 else :
73
74 if method not in self._debug_ignore_list :
75 if self._debug > 3 :
76 print "NCBIXML: Ignored: " + method
77 self._debug_ignore_list.append(method)
78
79
80
81 if self._value.strip() :
82 raise ValueError("What should we do with %s before the %s tag?" \
83 % (repr(self._value), name))
84 self._value = ""
85
87 """Found some text
88
89 ch -- characters read
90 """
91 self._value += ch
92
94 """Found XML end tag
95
96 name -- tag name
97 """
98
99
100
101 method = self._secure_name('_end_' + name)
102
103
104 if hasattr(self, method) :
105 eval("self.%s()" % method)
106 if self._debug > 2 :
107 print "NCBIXML: Parsed: " + method, self._value
108 else :
109
110 if method not in self._debug_ignore_list :
111 if self._debug > 1 :
112 print "NCBIXML: Ignored: " + method, self._value
113 self._debug_ignore_list.append(method)
114
115
116 self._value = ''
117
119 """Parse XML BLAST data into a Record.Blast object
120
121 All XML 'action' methods are private methods and may be:
122 _start_TAG called when the start tag is found
123 _end_TAG called when the end tag is found
124 """
125
127 """Constructor
128
129 debug - integer, amount of debug information to print
130 """
131
132 _XMLparser.__init__(self, debug)
133
134 self._parser = xml.sax.make_parser()
135 self._parser.setContentHandler(self)
136
137
138 self._parser.setFeature(xml.sax.handler.feature_validation, 0)
139 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0)
140 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0)
141 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0)
142
143 self.reset()
144
146 """Reset all the data allowing reuse of the BlastParser() object"""
147 self._records = []
148 self._header = Record.Header()
149 self._parameters = Record.Parameters()
150 self._parameters.filter = None
151
155
157
158
159 self._blast.reference = self._header.reference
160 self._blast.date = self._header.date
161 self._blast.version = self._header.version
162 self._blast.database = self._header.database
163 self._blast.application = self._header.application
164
165
166
167
168
169
170 if not hasattr(self._blast, "query") \
171 or not self._blast.query :
172 self._blast.query = self._header.query
173 if not hasattr(self._blast, "query_id") \
174 or not self._blast.query_id :
175 self._blast.query_id = self._header.query_id
176 if not hasattr(self._blast, "query_letters") \
177 or not self._blast.query_letters :
178 self._blast.query_letters = self._header.query_letters
179
180
181
182
183 self._blast.query_length = self._blast.query_letters
184
185
186
187
188
189
190 self._blast.database_length = self._blast.num_letters_in_database
191
192
193
194 self._blast.matrix = self._parameters.matrix
195 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e
196 self._blast.gap_penalties = self._parameters.gap_penalties
197 self._blast.filter = self._parameters.filter
198 self._blast.expect = self._parameters.expect
199 self._blast.sc_match = self._parameters.sc_match
200 self._blast.sc_mismatch = self._parameters.sc_mismatch
201
202
203 self._records.append(self._blast)
204
205 self._blast = None
206
207 if self._debug : "NCBIXML: Added Blast record to results"
208
209
211 """BLAST program, e.g., blastp, blastn, etc.
212
213 Save this to put on each blast record object
214 """
215 self._header.application = self._value.upper()
216
218 """version number and date of the BLAST engine.
219
220 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be
221 variants like "BLASTP 2.2.18+" without the date.
222
223 Save this to put on each blast record object
224 """
225 parts = self._value.split()
226
227
228
229 self._header.version = parts[1]
230
231
232 if len(parts) >= 3 :
233 if parts[2][0] == "[" and parts[2][-1] == "]" :
234 self._header.date = parts[2][1:-1]
235 else :
236
237
238 self._header.date = parts[2]
239
241 """a reference to the article describing the algorithm
242
243 Save this to put on each blast record object
244 """
245 self._header.reference = self._value
246
248 """the database(s) searched
249
250 Save this to put on each blast record object
251 """
252 self._header.database = self._value
253
255 """the identifier of the query
256
257 Important in old pre 2.2.14 BLAST, for recent versions
258 <Iteration_query-ID> is enough
259 """
260 self._header.query_id = self._value
261
263 """the definition line of the query
264
265 Important in old pre 2.2.14 BLAST, for recent versions
266 <Iteration_query-def> is enough
267 """
268 self._header.query = self._value
269
271 """the length of the query
272
273 Important in old pre 2.2.14 BLAST, for recent versions
274 <Iteration_query-len> is enough
275 """
276 self._header.query_letters = int(self._value)
277
279 """the identifier of the query
280 """
281 self._blast.query_id = self._value
282
284 """the definition line of the query
285 """
286 self._blast.query = self._value
287
289 """the length of the query
290 """
291 self._blast.query_letters = int(self._value)
292
293
294
295
296
297
298
299
300
301
302
304 """hits to the database sequences, one for every sequence
305 """
306 self._blast.num_hits = int(self._value)
307
308
309
310
311
312
313
315 """matrix used (-M)
316 """
317 self._parameters.matrix = self._value
318
320 """expect values cutoff (-e)
321 """
322
323
324
325
326
327
328
329 self._parameters.expect = self._value
330
331
332
333
334
335
337 """match score for nucleotide-nucleotide comparaison (-r)
338 """
339 self._parameters.sc_match = int(self._value)
340
342 """mismatch penalty for nucleotide-nucleotide comparaison (-r)
343 """
344 self._parameters.sc_mismatch = int(self._value)
345
347 """gap existence cost (-G)
348 """
349 self._parameters.gap_penalties = int(self._value)
350
352 """gap extension cose (-E)
353 """
354 self._parameters.gap_penalties = (self._parameters.gap_penalties,
355 int(self._value))
356
358 """filtering options (-F)
359 """
360 self._parameters.filter = self._value
361
362
363
364
365
366
367
368
369
370
371
372
374 self._blast.alignments.append(Record.Alignment())
375 self._blast.descriptions.append(Record.Description())
376 self._blast.multiple_alignment = []
377 self._hit = self._blast.alignments[-1]
378 self._descr = self._blast.descriptions[-1]
379 self._descr.num_alignments = 0
380
382
383 self._blast.multiple_alignment = None
384 self._hit = None
385 self._descr = None
386
388 """identifier of the database sequence
389 """
390 self._hit.hit_id = self._value
391 self._hit.title = self._value + ' '
392
394 """definition line of the database sequence
395 """
396 self._hit.hit_def = self._value
397 self._hit.title += self._value
398 self._descr.title = self._hit.title
399
401 """accession of the database sequence
402 """
403 self._hit.accession = self._value
404 self._descr.accession = self._value
405
407 self._hit.length = int(self._value)
408
409
411
412
413 self._hit.hsps.append(Record.HSP())
414 self._hsp = self._hit.hsps[-1]
415 self._descr.num_alignments += 1
416 self._blast.multiple_alignment.append(Record.MultipleAlignment())
417 self._mult_al = self._blast.multiple_alignment[-1]
418
419
421 """raw score of HSP
422 """
423 self._hsp.score = float(self._value)
424 if self._descr.score == None:
425 self._descr.score = float(self._value)
426
428 """bit score of HSP
429 """
430 self._hsp.bits = float(self._value)
431 if self._descr.bits == None:
432 self._descr.bits = float(self._value)
433
435 """expect value value of the HSP
436 """
437 self._hsp.expect = float(self._value)
438 if self._descr.e == None:
439 self._descr.e = float(self._value)
440
442 """offset of query at the start of the alignment (one-offset)
443 """
444 self._hsp.query_start = int(self._value)
445
447 """offset of query at the end of the alignment (one-offset)
448 """
449 self._hsp.query_end = int(self._value)
450
452 """offset of the database at the start of the alignment (one-offset)
453 """
454 self._hsp.sbjct_start = int(self._value)
455
457 """offset of the database at the end of the alignment (one-offset)
458 """
459 self._hsp.sbjct_end = int(self._value)
460
461
462
463
464
465
466
467
468
469
470
472 """frame of the query if applicable
473 """
474 self._hsp.frame = (int(self._value),)
475
477 """frame of the database sequence if applicable
478 """
479 self._hsp.frame += (int(self._value),)
480
482 """number of identities in the alignment
483 """
484 self._hsp.identities = int(self._value)
485
487 """number of positive (conservative) substitutions in the alignment
488 """
489 self._hsp.positives = int(self._value)
490
492 """number of gaps in the alignment
493 """
494 self._hsp.gaps = int(self._value)
495
497 """length of the alignment
498 """
499 self._hsp.align_length = int(self._value)
500
501
502
503
504
505
507 """alignment string for the query
508 """
509 self._hsp.query = self._value
510
512 """alignment string for the database
513 """
514 self._hsp.sbjct = self._value
515
517 """Formatting middle line as normally seen in BLAST report
518 """
519 self._hsp.match = self._value
520 assert len(self._hsp.match)==len(self._hsp.query)
521 assert len(self._hsp.match)==len(self._hsp.sbjct)
522
523
528
533
538
543
545 """Karlin-Altschul parameter K
546 """
547 self._blast.ka_params = float(self._value)
548
550 """Karlin-Altschul parameter Lambda
551 """
552 self._blast.ka_params = (float(self._value),
553 self._blast.ka_params)
554
556 """Karlin-Altschul parameter H
557 """
558 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
559
560 -def read(handle, debug=0):
561 """Returns a single Blast record (assumes just one query).
562
563 This function is for use when there is one and only one BLAST
564 result in your XML file.
565
566 Use the Bio.Blast.NCBIXML.parse() function if you expect more than
567 one BLAST record (i.e. if you have more than one query sequence).
568
569 """
570 iterator = parse(handle, debug)
571 try :
572 first = iterator.next()
573 except StopIteration :
574 first = None
575 if first is None :
576 raise ValueError("No records found in handle")
577 try :
578 second = iterator.next()
579 except StopIteration :
580 second = None
581 if second is not None :
582 raise ValueError("More than one record found in handle")
583 return first
584
585
586 -def parse(handle, debug=0):
587 """Returns an iterator a Blast record for each query.
588
589 handle - file handle to and XML file to parse
590 debug - integer, amount of debug information to print
591
592 This is a generator function that returns multiple Blast records
593 objects - one for each query sequence given to blast. The file
594 is read incrementally, returning complete records as they are read
595 in.
596
597 Should cope with new BLAST 2.2.14+ which gives a single XML file
598 for mutliple query records.
599
600 Should also cope with XML output from older versions BLAST which
601 gave multiple XML files concatenated together (giving a single file
602 which strictly speaking wasn't valid XML)."""
603 from xml.parsers import expat
604 BLOCK = 1024
605 MARGIN = 10
606 XML_START = "<?xml"
607
608 text = handle.read(BLOCK)
609 pending = ""
610
611 if not text :
612
613 raise ValueError("Your XML file was empty")
614
615 while text :
616
617 if not text.startswith(XML_START) :
618 raise ValueError("Your XML file did not start with %s..." \
619 % XML_START)
620
621 expat_parser = expat.ParserCreate()
622 blast_parser = BlastParser(debug)
623 expat_parser.StartElementHandler = blast_parser.startElement
624 expat_parser.EndElementHandler = blast_parser.endElement
625 expat_parser.CharacterDataHandler = blast_parser.characters
626
627 expat_parser.Parse(text, False)
628 while blast_parser._records:
629 record = blast_parser._records[0]
630 blast_parser._records = blast_parser._records[1:]
631 yield record
632
633 while True :
634
635 text, pending = pending + handle.read(BLOCK), ""
636 if not text:
637
638 expat_parser.Parse("", True)
639 break
640
641
642
643 pending = handle.read(MARGIN)
644
645 if (text+pending).find("\n" + XML_START) == -1 :
646
647 expat_parser.Parse(text, False)
648 while blast_parser._records:
649 record = blast_parser._records[0]
650 blast_parser._records = blast_parser._records[1:]
651 yield record
652 else :
653
654
655
656
657 text, pending = (text+pending).split("\n" + XML_START,1)
658 pending = XML_START + pending
659
660 expat_parser.Parse(text, True)
661 while blast_parser._records:
662 record = blast_parser._records[0]
663 blast_parser._records = blast_parser._records[1:]
664 yield record
665
666
667
668 text, pending = pending, ""
669 break
670
671
672
673
674 assert pending==""
675 assert len(blast_parser._records) == 0
676
677
678 assert text==""
679 assert pending==""
680 assert len(blast_parser._records) == 0
681
682 if __name__ == '__main__':
683 import sys
684 import os
685 handle = open(sys.argv[1])
686 r_list = parse(handle)
687
688 for r in r_list :
689
690 print 'Blast of', r.query
691 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments),
692 reduce(lambda a,b: a+b,
693 [len(a.hsps) for a in r.alignments]))
694
695 for al in r.alignments:
696 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs'
697
698
699 E_VALUE_THRESH = 0.04
700 for alignment in r.alignments:
701 for hsp in alignment.hsps:
702 if hsp.expect < E_VALUE_THRESH:
703 print '*****'
704 print 'sequence', alignment.title
705 print 'length', alignment.length
706 print 'e value', hsp.expect
707 print hsp.query[:75] + '...'
708 print hsp.match[:75] + '...'
709 print hsp.sbjct[:75] + '...'
710