1
2
3
4
5
6
7 from Bio.Alphabet import IUPAC
8 from Bio import File
9 from Bio.ParserSupport import *
10 from Bio import Seq
11 import re
12 from math import sqrt
13 import sys
14 from Bio.Motif import Motif
15
17 """A subclass of Motif used in parsing MEME (and MAST) output.
18
19 This sublcass defines functions and data specific to MEME motifs.
20 This includes the evalue for a motif and the PSSM of the motif.
21
22 Methods:
23 add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = +): create a new instance of the motif with the specified values.
24 add_to_pssm (position): add a new position to the pssm. The position should be a list of nucleotide/amino acid frequencies
25 add_to_logodds (position): add a new position to the log odds matrix. The position should be a tuple of log odds values for the nucleotide/amino acid at that position.
26 compare_motifs (other_motif): returns the maximum correlation between this motif and other_motif
27 """
31
33 if type(number) == int:
34 self.num_occurrences = number
35 else:
36 number = int(number)
37 self.num_occurrences = number
38
44
56
58 if type(evalue) == float:
59 self.evalue = evalue
60 else:
61 evalue = float(evalue)
62 self.evalue = evalue
63
64
66 """A class describing the instances of a MEME motif, and the data thereof.
67 """
76
77
80
83
87
89 pval = float(pval)
90 self.pvalue = pval
91
95
98
101
102
104 """A class for holding the results of a MEME run.
105
106 A MEMERecord is an object that holds the results from running
107 MEME. It implements no methods of its own.
108
109 """
111 """__init__ (self)"""
112 self.motifs = []
113 self.version = ""
114 self.datafile = ""
115 self.command = ""
116 self.alphabet = None
117 self.sequence_names = []
118
120 for m in self.motifs:
121 if m.name == name:
122 return m
123
125 """A parser for the text output of the MEME program.
126 Parses the output into an object of the MEMERecord class.
127
128 Methods:
129 parse (handle): parses the contents of the file handle passed to it.
130
131 Example:
132
133 >>>f = open("meme.output.txt")
134 >>>parser = MEMEParser()
135 >>>meme_record = parser.parse(f)
136 >>>for motif in meme_record.motifs:
137 ... for instance in motif.instances:
138 ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
139
140 """
145
146 - def parse (self, handle):
147 """parse (self, handle)"""
148 self._scanner.feed(handle, self._consumer)
149 return self._consumer.data
150
151
152
154 """Scanner for MEME output.
155
156 Methods:
157 feed
158
159 """
160
161 - def feed (self, handle, consumer):
162 """
163 Feeds in MEME output for scanning. handle should
164 implement the readline method. consumer is
165 a Consumer object that can receive the salient events.
166 """
167 if isinstance(handle, File.UndoHandle):
168 uhandle = handle
169 else:
170 uhandle = File.UndoHandle(handle)
171
172 self._scan_header(uhandle, consumer)
173 self._scan_motifs (uhandle, consumer)
174
176 try:
177 read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version')
178 except ValueError:
179 raise ValueError("Improper input file. File should contain a line starting MEME version.")
180 read_and_call(uhandle, consumer._version, start = 'MEME version')
181 read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET')
182 read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET')
183 read_and_call(uhandle, consumer.noevent, start = '****')
184 read_and_call(uhandle, consumer._datafile, start = 'DATAFILE')
185 read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET')
186 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
187 read_and_call(uhandle, consumer.noevent, start = '----')
188 read_and_call_until(uhandle, consumer._sequence_name, start = '***')
189 read_and_call_until(uhandle, consumer.noevent, start = 'command:')
190 read_and_call(uhandle, consumer._commandline, start = 'command:')
191 read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF 1')
192
194 while 1:
195 read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF')
196 read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value')
197 read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value')
198 read_and_call(uhandle, consumer.noevent, start = '---')
199 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
200 read_and_call(uhandle, consumer.noevent, start = '---')
201 read_and_call_until(uhandle, consumer.add_instance, start = '---')
202 read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix')
203 read_and_call(uhandle, consumer.noevent)
204 read_and_call_until(uhandle, consumer.add_to_logodds, start = '---')
205 read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix')
206 read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix')
207 read_and_call_until(uhandle, consumer.add_to_pssm, start = '---')
208 read_and_call_until(uhandle, consumer.noevent, start = 'Time')
209 read_and_call(uhandle, consumer.noevent, start = 'Time')
210 read_and_call(uhandle, consumer.noevent, blank = 1)
211 read_and_call(uhandle, consumer.noevent, start = '***')
212 read_and_call_while(uhandle, consumer.noevent, blank = 1)
213 read_and_call(uhandle, consumer.noevent, start = '***')
214 line = safe_peekline(uhandle)
215 if line.startswith("SUMMARY OF MOTIFS"):
216 break
217
218
219
221 """
222 Consumer that can receive events from MEME Scanner.
223
224 This is the Consumer object that should be passed to the
225 MEME Scanner.
226 """
227
229 self.current_motif = None
230 self.sequence_names = []
231 self.data = MEMERecord()
232
237
239 line = line.strip()
240 line = line.replace('DATAFILE= ','')
241 self.data.datafile = line
242
251
253 line = line.strip()
254 ls = line.split()
255 self.data.sequence_names.append(ls[0])
256 if len(ls) == 6:
257 self.data.sequence_names.append(ls[3])
258
260 line = line.strip()
261 line = line.replace('command: ','')
262 self.data.command = line
263
274
280
290
293
296
299
300
301
303 """
304 Consumer that can receive events from _MASTScanner.
305
306 A _MASTConsumer parses lines from a mast text output file.
307 The motif match diagrams are parsed using line buffering.
308 Each of the buffering functions have a dummy variable that is
309 required for testing using the Bio.ParserSupport.TaggingConsumer.
310 If this variable isn't there, the TaggingConsumer barfs. In
311 the _MASTScanner, None is passed in the place of this variable.
312 """
314 self.data = MASTRecord()
315 self._current_seq = ""
316 self._line_buffer = []
317 self._buffer_size = 0
318 self._buffered_seq_start = 0
319
324
336
347
371
396
422
428
430 line = line.strip()
431 if not line.startswith('*****'):
432 self._line_buffer.append(line)
433 else:
434 return -1
435
437 """Parses the line buffer to get e-values for each instance of a motif.
438 This buffer parser is the most likely point of failure for the
439 MASTParser.
440 """
441 insts = self.data.get_motif_matches_for_sequence(self._current_seq)
442 if len(insts) > 0:
443
444 fullSeq = self._line_buffer[self._buffer_size-1]
445 pvals = self._line_buffer[1].split()
446 p = 0
447 lpval = len(pvals)
448 while p < lpval:
449 if pvals[p].count('e') > 1:
450
451
452 pvs = []
453 spe = pvals[p].split('e')
454 spe.reverse()
455 dotind = spe[1].find('.')
456 if dotind == -1:
457 thispval = spe[1][-1] + 'e' + spe[0]
458 else:
459 thispval = spe[1][dotind-1:] + 'e' + spe[0]
460 pvs.append(thispval)
461 for spi in range(2,len(spe)):
462 dotind = spe[spi].find('.')
463 prevdotind = spe[spi-1].find('.')
464 if dotind != -1:
465 if prevdotind == -1:
466 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1]
467 else:
468 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1]
469 else:
470 if prevdotind == -1:
471 thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1]
472 else:
473 thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1]
474 pvs.append(thispval)
475 pvs.reverse()
476 if p > 0:
477 pvals = pvals[0:p] + pvs + pvals[p+1:]
478 else:
479 pvals = pvs + pvals[p+1:]
480 lpval = len(pvals)
481 p += 1
482 i = 0
483 if len(pvals) != len(insts):
484 sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n")
485 pvals = []
486
487
488 for i in range(0,len(insts)):
489 inst = insts[i]
490 start = inst.start - self._buffered_seq_start + 1
491 thisSeq = fullSeq[start:start+inst.length]
492 thisSeq = Seq.Seq(thisSeq, self.data.alphabet)
493 inst._sequence(thisSeq)
494 if pvals:
495 inst._pvalue(float(pvals[i]))
496
498 self._line_buffer = []
499 self._buffer_size = 0
500
502 if self._buffer_size == 0:
503 if len(self._line_buffer) > 0:
504 self._buffer_size = len(self._line_buffer)
505 ll = self._line_buffer[self._buffer_size-1].split()
506 self._line_buffer[self._buffer_size-1] = ll[1]
507 self._buffered_seq_start = int(ll[0])
508 else:
509 i = 0
510 for i in range(self._buffer_size, len(self._line_buffer)-1):
511 self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip()
512 ll = self._line_buffer[len(self._line_buffer)-1].split()
513 if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]):
514 self._line_buffer[self._buffer_size-1] += ll[1]
515 else:
516 differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]))
517 self._line_buffer[self._buffer_size-1] += "N"*differ
518 self._line_buffer[self._buffer_size-1] += ll[1]
519 self._line_buffer = self._line_buffer[0:self._buffer_size]
520
522 line = line.strip()
523 if line.find('[') != -1 or line.find('<') != -1:
524 pass
525 elif line.find('e') != -1:
526 pass
527 elif line.find('+') != -1:
528 pass
529
532
533
534
536 """
537 Parser for MAST text output. HTML output cannot be parsed, yet. Returns a MASTRecord
538
539 A MASTParser takes a file handle for a MAST text output file and
540 returns a MASTRecord, containing the hits between motifs and
541 sequences. The parser does some unusual line buffering to parse out
542 match diagrams. Really complex diagrams often lead to an error message
543 and p-values not being parsed for a given line.
544
545 Methods:
546 parse (handle): parses the data from the file handle passed to it.
547
548 Example:
549
550 >>>f = open("mast_file.txt")
551 >>>parser = MASTParser()
552 >>>mast_record = parser.parse(f)
553 >>>for motif in mast_record.motifs:
554 >>> for instance in motif.instances:
555 >>> print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
556 """
560
561 - def parse (self, handle):
562 self._scanner.feed(handle, self._consumer)
563 return self._consumer.data
564
565
566
568 """
569 Scanner for MAST text output.
570
571 """
572 - def feed (self, handle, consumer):
581
583 try:
584 read_and_call_until(uhandle, consumer.noevent, contains = "MAST version")
585 except ValueError:
586 raise ValueError("Improper input file. Does not begin with a line with 'MAST version'")
587 read_and_call(uhandle, consumer._version, contains = 'MAST version')
588 read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS')
589 read_and_call(uhandle, consumer.noevent, start = 'DATABASE')
590 read_and_call(uhandle, consumer.noevent, start = '****')
591 read_and_call(uhandle, consumer._database, contains = 'DATABASE')
592 read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH')
593 read_and_call(uhandle, consumer.noevent, contains = 'MOTIF')
594 read_and_call(uhandle, consumer.noevent, contains = '----')
595 read_and_call_until(uhandle, consumer._add_motif, blank = 1)
596 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
597
599 read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
600 read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
601 read_and_call(uhandle, consumer.noevent, start = '---')
602
603 read_and_call_until(uhandle, consumer.noevent, blank = 1)
604 read_and_call(uhandle, consumer.noevent, blank = 1)
605
607 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:')
608 read_and_call(uhandle, consumer.noevent, start = 'SECTION III:')
609 read_and_call_until(uhandle, consumer.noevent, start = '****')
610 read_and_call(uhandle, consumer.noevent, start = '****')
611 read_and_call_until(uhandle, consumer.noevent, start = '*****')
612 read_and_call(uhandle, consumer.noevent)
613 read_and_call_while(uhandle, consumer.noevent, blank = 1)
614 readMatches = 1
615 while readMatches == 1:
616 if consumer._current_seq:
617 if consumer._buffer_size != 0:
618 consumer._parse_buffer(None)
619 consumer._blank_buffer(None)
620 read_and_call(uhandle, consumer._set_current_seq)
621 read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM')
622 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
623 consumer._add_diagram_from_buffer(None)
624 consumer._blank_buffer(None)
625 read_and_call(uhandle, consumer.noevent, blank = 1)
626 while 1:
627 line = safe_peekline(uhandle)
628 if line.startswith('****'):
629 consumer._parse_buffer(None)
630 readMatches = 0
631 break
632 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
633 read_and_call(uhandle, consumer.noevent, blank = 1)
634 consumer._collapse_buffer(None)
635 if attempt_read_and_call(uhandle, consumer.noevent, blank = 1):
636 break
637 elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'):
638 consumer._parse_buffer(None)
639 consumer._blank_buffer(None)
640 readMatches = 0
641 break
642
643
644
646 """The class for holding the results from a MAST run.
647
648 A MASTRecord holds data about matches between motifs and sequences.
649 The motifs held by the MASTRecord are objects of the class MEMEMotif.
650
651 Methods:
652 get_motif_matches_for_sequence(sequence_name): returns all of the
653 motif matches within a given sequence. The matches are objects of
654 the class MEMEInstance
655 get_motif_matches (motif_name): returns all of the matches for a motif
656 in the sequences searched. The matches returned are of class
657 MEMEInstance
658 get_motif_by_name (motif_name): returns a MEMEMotif with the given
659 name.
660 """
669
672
678
681
690
694
696 self.diagrams[seq] = diagram
697
700
703
706
708 for m in self.motifs:
709 if m.name == name:
710 return m
711