1
2
3
4
5
6 from Bio.Alphabet import IUPAC
7 from Bio import File
8 from Bio.ParserSupport import *
9 from Bio import Seq
10 from Bio.MEME import Motif
11 import re
12
14 """A class for holding the results of a MEME run.
15
16 A MEMERecord is an object that holds the results from running
17 MEME. It implements no methods of its own.
18
19 """
21 """__init__ (self)"""
22 self.motifs = []
23 self.version = ""
24 self.datafile = ""
25 self.command = ""
26 self.alphabet = None
27 self.sequence_names = []
28
30 for m in self.motifs:
31 if m.name == name:
32 return m
33
35 """A parser for the text output of the MEME program.
36 Parses the output into an object of the MEMERecord class.
37
38 Methods:
39 parse (handle): parses the contents of the file handle passed to it.
40
41 Example:
42
43 f = open("meme.output.txt")
44 parser = MEMEParser()
45 meme_record = parser.parse(f)
46 for motif in meme_record.motifs:
47 for instance in motif.instances:
48 print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
49
50 """
55
56 - def parse (self, handle):
57 """parse (self, handle)"""
58 self._scanner.feed(handle, self._consumer)
59 return self._consumer.data
60
61
62
64 """Scanner for MEME output.
65
66 Methods:
67 feed
68
69 """
70
71 - def feed (self, handle, consumer):
72 """
73 Feeds in MEME output for scanning. handle should
74 implement the readline method. consumer is
75 a Consumer object that can receive the salient events.
76 """
77 if isinstance(handle, File.UndoHandle):
78 uhandle = handle
79 else:
80 uhandle = File.UndoHandle(handle)
81
82 self._scan_header(uhandle, consumer)
83 self._scan_motifs (uhandle, consumer)
84
86 try:
87 read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version')
88 except ValueError:
89 raise ValueError("Improper input file. File should contain a line starting MEME version.")
90 read_and_call(uhandle, consumer._version, start = 'MEME version')
91 read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET')
92 read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET')
93 read_and_call(uhandle, consumer.noevent, start = '****')
94 read_and_call(uhandle, consumer._datafile, start = 'DATAFILE')
95 read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET')
96 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
97 read_and_call(uhandle, consumer.noevent, start = '----')
98 read_and_call_until(uhandle, consumer._sequence_name, start = '***')
99 read_and_call_until(uhandle, consumer.noevent, start = 'command:')
100 read_and_call(uhandle, consumer._commandline, start = 'command:')
101 read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF 1')
102
104 while 1:
105 read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF')
106 read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value')
107 read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value')
108 read_and_call(uhandle, consumer.noevent, start = '---')
109 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
110 read_and_call(uhandle, consumer.noevent, start = '---')
111 read_and_call_until(uhandle, consumer.add_instance, start = '---')
112 read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix')
113 read_and_call(uhandle, consumer.noevent)
114 read_and_call_until(uhandle, consumer.add_to_logodds, start = '---')
115 read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix')
116 read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix')
117 read_and_call_until(uhandle, consumer.add_to_pssm, start = '---')
118 read_and_call_until(uhandle, consumer.noevent, start = 'Time')
119 read_and_call(uhandle, consumer.noevent, start = 'Time')
120 read_and_call(uhandle, consumer.noevent, blank = 1)
121 read_and_call(uhandle, consumer.noevent, start = '***')
122 read_and_call_while(uhandle, consumer.noevent, blank = 1)
123 read_and_call(uhandle, consumer.noevent, start = '***')
124 line = safe_peekline(uhandle)
125 if line.startswith("SUMMARY OF MOTIFS"):
126 break
127
128
129
131 """
132 Consumer that can receive events from MEME Scanner.
133
134 This is the Consumer object that should be passed to the
135 MEME Scanner.
136 """
137
139 self.current_motif = None
140 self.sequence_names = []
141 self.data = MEMERecord()
142
147
149 line = line.strip()
150 line = line.replace('DATAFILE= ','')
151 self.data.datafile = line
152
161
163 line = line.strip()
164 ls = line.split()
165 self.data.sequence_names.append(ls[0])
166 if len(ls) == 6:
167 self.data.sequence_names.append(ls[3])
168
170 line = line.strip()
171 line = line.replace('command: ','')
172 self.data.command = line
173
184
190
200
202 line = line.strip()
203 sl = line.split()
204 thisposition = tuple([float(i) for i in sl])
205 self.current_motif.add_to_pssm(thisposition)
206
208 line = line.strip()
209 sl = line.split()
210 thisposition = tuple([float(i) for i in sl])
211 self.current_motif.add_to_logodds(thisposition)
212
215
216
217
219 """
220 Consumer that can receive events from _MASTScanner.
221
222 A _MASTConsumer parses lines from a mast text output file.
223 The motif match diagrams are parsed using line buffering.
224 Each of the buffering functions have a dummy variable that is
225 required for testing using the Bio.ParserSupport.TaggingConsumer.
226 If this variable isn't there, the TaggingConsumer barfs. In
227 the _MASTScanner, None is passed in the place of this variable.
228 """
230 self.data = MASTRecord()
231 self._current_seq = ""
232 self._line_buffer = []
233 self._buffer_size = 0
234 self._buffered_seq_start = 0
235
240
252
263
287
312
338
344
346 line = line.strip()
347 if not line.startswith('*****'):
348 self._line_buffer.append(line)
349 else:
350 return -1
351
353 """Parses the line buffer to get e-values for each instance of a motif.
354 This buffer parser is the most likely point of failure for the
355 MASTParser.
356 """
357 insts = self.data.get_motif_matches_for_sequence(self._current_seq)
358 if len(insts) > 0:
359
360 fullSeq = self._line_buffer[self._buffer_size-1]
361 pvals = self._line_buffer[1].split()
362 p = 0
363 lpval = len(pvals)
364 while p < lpval:
365 if pvals[p].count('e') > 1:
366
367
368 pvs = []
369 spe = pvals[p].split('e')
370 spe.reverse()
371 dotind = spe[1].find('.')
372 if dotind == -1:
373 thispval = spe[1][-1] + 'e' + spe[0]
374 else:
375 thispval = spe[1][dotind-1:] + 'e' + spe[0]
376 pvs.append(thispval)
377 for spi in range(2,len(spe)):
378 dotind = spe[spi].find('.')
379 prevdotind = spe[spi-1].find('.')
380 if dotind != -1:
381 if prevdotind == -1:
382 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1]
383 else:
384 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1]
385 else:
386 if prevdotind == -1:
387 thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1]
388 else:
389 thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1]
390 pvs.append(thispval)
391 pvs.reverse()
392 if p > 0:
393 pvals = pvals[0:p] + pvs + pvals[p+1:]
394 else:
395 pvals = pvs + pvals[p+1:]
396 lpval = len(pvals)
397 p += 1
398 i = 0
399 if len(pvals) != len(insts):
400 sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n")
401 pvals = []
402
403
404 for i in range(0,len(insts)):
405 inst = insts[i]
406 start = inst.start - self._buffered_seq_start + 1
407 thisSeq = fullSeq[start:start+inst.length]
408 thisSeq = Seq.Seq(thisSeq, self.data.alphabet)
409 inst._sequence(thisSeq)
410 if pvals:
411 inst._pvalue(float(pvals[i]))
412
414 self._line_buffer = []
415 self._buffer_size = 0
416
418 if self._buffer_size == 0:
419 if len(self._line_buffer) > 0:
420 self._buffer_size = len(self._line_buffer)
421 ll = self._line_buffer[self._buffer_size-1].split()
422 self._line_buffer[self._buffer_size-1] = ll[1]
423 self._buffered_seq_start = int(ll[0])
424 else:
425 i = 0
426 for i in range(self._buffer_size, len(self._line_buffer)-1):
427 self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip()
428 ll = self._line_buffer[len(self._line_buffer)-1].split()
429 if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]):
430 self._line_buffer[self._buffer_size-1] += ll[1]
431 else:
432 differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]))
433 self._line_buffer[self._buffer_size-1] += "N"*differ
434 self._line_buffer[self._buffer_size-1] += ll[1]
435 self._line_buffer = self._line_buffer[0:self._buffer_size]
436
438 line = line.strip()
439 if line.find('[') != -1 or line.find('<') != -1:
440 pass
441 elif line.find('e') != -1:
442 pass
443 elif line.find('+') != -1:
444 pass
445
448
449
450
452 """
453 Parser for MAST text output. HTML output cannot be parsed, yet. Returns a MASTRecord
454
455 A MASTParser takes a file handle for a MAST text output file and
456 returns a MASTRecord, containing the hits between motifs and
457 sequences. The parser does some unusual line buffering to parse out
458 match diagrams. Really complex diagrams often lead to an error message
459 and p-values not being parsed for a given line.
460
461 Methods:
462 parse (handle): parses the data from the file handle passed to it.
463
464 Example:
465
466 f = open("mast_file.txt")
467 parser = MASTParser()
468 mast_record = parser.parse(f)
469 for motif in mast_record.motifs:
470 for instance in motif.instances:
471 print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
472 """
476
477 - def parse (self, handle):
478 self._scanner.feed(handle, self._consumer)
479 return self._consumer.data
480
481
482
484 """
485 Scanner for MAST text output.
486
487 """
488 - def feed (self, handle, consumer):
497
499 try:
500 read_and_call_until(uhandle, consumer.noevent, contains = "MAST version")
501 except ValueError:
502 raise ValueError("Improper input file. Does not begin with a line with 'MAST version'")
503 read_and_call(uhandle, consumer._version, contains = 'MAST version')
504 read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS')
505 read_and_call(uhandle, consumer.noevent, start = 'DATABASE')
506 read_and_call(uhandle, consumer.noevent, start = '****')
507 read_and_call(uhandle, consumer._database, contains = 'DATABASE')
508 read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH')
509 read_and_call(uhandle, consumer.noevent, contains = 'MOTIF')
510 read_and_call(uhandle, consumer.noevent, contains = '----')
511 read_and_call_until(uhandle, consumer._add_motif, blank = 1)
512 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
513
515 read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
516 read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
517 read_and_call(uhandle, consumer.noevent, start = '---')
518
519 read_and_call_until(uhandle, consumer.noevent, blank = 1)
520 read_and_call(uhandle, consumer.noevent, blank = 1)
521
523 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:')
524 read_and_call(uhandle, consumer.noevent, start = 'SECTION III:')
525 read_and_call_until(uhandle, consumer.noevent, start = '****')
526 read_and_call(uhandle, consumer.noevent, start = '****')
527 read_and_call_until(uhandle, consumer.noevent, start = '*****')
528 read_and_call(uhandle, consumer.noevent)
529 read_and_call_while(uhandle, consumer.noevent, blank = 1)
530 readMatches = 1
531 while readMatches == 1:
532 if consumer._current_seq:
533 if consumer._buffer_size != 0:
534 consumer._parse_buffer(None)
535 consumer._blank_buffer(None)
536 read_and_call(uhandle, consumer._set_current_seq)
537 read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM')
538 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
539 consumer._add_diagram_from_buffer(None)
540 consumer._blank_buffer(None)
541 read_and_call(uhandle, consumer.noevent, blank = 1)
542 while 1:
543 line = safe_peekline(uhandle)
544 if line.startswith('****'):
545 consumer._parse_buffer(None)
546 readMatches = 0
547 break
548 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
549 read_and_call(uhandle, consumer.noevent, blank = 1)
550 consumer._collapse_buffer(None)
551 if attempt_read_and_call(uhandle, consumer.noevent, blank = 1):
552 break
553 elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'):
554 consumer._parse_buffer(None)
555 consumer._blank_buffer(None)
556 readMatches = 0
557 break
558
559
560
562 """The class for holding the results from a MAST run.
563
564 A MASTRecord holds data about matches between motifs and sequences.
565 The motifs held by the MASTRecord are objects of the class MEMEMotif.
566
567 Methods:
568 get_motif_matches_for_sequence(sequence_name): returns all of the
569 motif matches within a given sequence. The matches are objects of
570 the class MEME.Motif.Instance
571 get_motif_matches (motif_name): returns all of the matches for a motif
572 in the sequences searched. The matches returned are of class
573 MEME.Motif.Instance
574 get_motif_by_name (motif_name): returns a MEMEMotif with the given
575 name.
576 """
585
588
594
597
606
610
612 self.diagrams[seq] = diagram
613
616
619
622
624 for m in self.motifs:
625 if m.name == name:
626 return m
627