Package Bio :: Package MEME :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.MEME.Parser

  1  # Copyright 2004 by Jason A. Hackney.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  from string import join 
  7  from Bio.Alphabet import IUPAC 
  8  from Bio import File 
  9  from Bio.ParserSupport import * 
 10  from Bio import Seq 
 11  from Bio.MEME import Motif 
 12  import re 
 13   
14 -class MEMERecord:
15 """A class for holding the results of a MEME run. 16 17 A MEMERecord is an object that holds the results from running 18 MEME. It implements no methods of its own. 19 20 """
21 - def __init__ (self):
22 """__init__ (self)""" 23 self.motifs = [] 24 self.version = "" 25 self.datafile = "" 26 self.command = "" 27 self.alphabet = None 28 self.sequence_names = []
29
30 - def get_motif_by_name (self, name):
31 for m in self.motifs: 32 if m.name == name: 33 return m
34
35 -class MEMEParser (AbstractParser):
36 """A parser for the text output of the MEME program. 37 Parses the output into an object of the MEMERecord class. 38 39 Methods: 40 parse (handle): parses the contents of the file handle passed to it. 41 42 Example: 43 44 f = open("meme.output.txt") 45 parser = MEMEParser() 46 meme_record = parser.parse(f) 47 for motif in meme_record.motifs: 48 for instance in motif.instances: 49 print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 50 51 """
52 - def __init__ (self):
53 """__init__ (self)""" 54 self._scanner = _MEMEScanner() 55 self._consumer = _MEMEConsumer()
56
57 - def parse (self, handle):
58 """parse (self, handle)""" 59 self._scanner.feed(handle, self._consumer) 60 return self._consumer.data
61 62 63
64 -class _MEMEScanner:
65 """Scanner for MEME output. 66 67 Methods: 68 feed 69 70 """ 71
72 - def feed (self, handle, consumer):
73 """ 74 Feeds in MEME output for scanning. handle should 75 implement the readline method. consumer is 76 a Consumer object that can receive the salient events. 77 """ 78 if isinstance(handle, File.UndoHandle): 79 uhandle = handle 80 else: 81 uhandle = File.UndoHandle(handle) 82 83 self._scan_header(uhandle, consumer) 84 self._scan_motifs (uhandle, consumer)
85
86 - def _scan_header(self, uhandle, consumer):
87 try: 88 read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version') 89 except ValueError: 90 raise ValueError, "Improper input file. File should contain a line starting MEME version." 91 read_and_call(uhandle, consumer._version, start = 'MEME version') 92 read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET') 93 read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET') 94 read_and_call(uhandle, consumer.noevent, start = '****') 95 read_and_call(uhandle, consumer._datafile, start = 'DATAFILE') 96 read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET') 97 read_and_call(uhandle, consumer.noevent, start = 'Sequence name') 98 read_and_call(uhandle, consumer.noevent, start = '----') 99 read_and_call_until(uhandle, consumer._sequence_name, start = '***') 100 read_and_call_until(uhandle, consumer.noevent, start = 'command:') 101 read_and_call(uhandle, consumer._commandline, start = 'command:') 102 read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF 1')
103
104 - def _scan_motifs(self, uhandle, consumer):
105 while 1: 106 read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF') 107 read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value') 108 read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value') 109 read_and_call(uhandle, consumer.noevent, start = '---') 110 read_and_call(uhandle, consumer.noevent, start = 'Sequence name') 111 read_and_call(uhandle, consumer.noevent, start = '---') 112 read_and_call_until(uhandle, consumer.add_instance, start = '---') 113 read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix') 114 read_and_call(uhandle, consumer.noevent) 115 read_and_call_until(uhandle, consumer.add_to_logodds, start = '---') 116 read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix') 117 read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix') 118 read_and_call_until(uhandle, consumer.add_to_pssm, start = '---') 119 read_and_call_until(uhandle, consumer.noevent, start = 'Time') 120 read_and_call(uhandle, consumer.noevent, start = 'Time') 121 read_and_call(uhandle, consumer.noevent, blank = 1) 122 read_and_call(uhandle, consumer.noevent, start = '***') 123 read_and_call_while(uhandle, consumer.noevent, blank = 1) 124 read_and_call(uhandle, consumer.noevent, start = '***') 125 line = safe_peekline(uhandle) 126 if line.startswith("SUMMARY OF MOTIFS"): 127 break
128 129 130
131 -class _MEMEConsumer:
132 """ 133 Consumer that can receive events from MEME Scanner. 134 135 This is the Consumer object that should be passed to the 136 MEME Scanner. 137 """ 138
139 - def __init__ (self):
140 self.current_motif = None 141 self.sequence_names = [] 142 self.data = MEMERecord()
143
144 - def _version (self, line):
145 line = line.strip() 146 ls = line.split() 147 self.data.version = ls[2]
148
149 - def _datafile (self, line):
150 line = line.strip() 151 line = line.replace('DATAFILE= ','') 152 self.data.datafile = line
153
154 - def _alphabet (self, line):
155 line = line.strip() 156 line = line.replace('ALPHABET= ','') 157 if line == 'ACGT': 158 al = IUPAC.unambiguous_dna 159 else: 160 al = IUPAC.protein 161 self.data.alphabet = al
162
163 - def _sequence_name (self, line):
164 line = line.strip() 165 ls = line.split() 166 self.data.sequence_names.append(ls[0]) 167 if len(ls) == 6: 168 self.data.sequence_names.append(ls[3])
169
170 - def _commandline (self, line):
171 line = line.strip() 172 line = line.replace('command: ','') 173 self.data.command = line
174
175 - def _add_motif_with_info (self, line):
176 line = line.strip() 177 ls = line.split() 178 motif = Motif.MEMEMotif() 179 motif._length(ls[4]) 180 motif._numoccurrences(ls[7]) 181 motif._evalue(ls[13]) 182 motif._alphabet(self.data.alphabet) 183 self.data.motifs.append(motif) 184 self.current_motif = motif
185
186 - def motif_name (self, line):
187 line = line.strip() 188 ls = line.split() 189 name = join(ls[0:2], ' ') 190 self.current_motif._name(name)
191
192 - def add_instance (self, line):
193 line = line.strip() 194 ls = line.split() 195 if self.data.command.find('revcomp') != -1: 196 seq = Seq.Seq(ls[5], self.data.alphabet) 197 self.current_motif.add_instance_from_values(name = ls[0], sequence = seq, start = ls[2], pvalue = ls[3], strand = ls[1]) 198 else: 199 seq = Seq.Seq(ls[4], self.data.alphabet) 200 self.current_motif.add_instance_from_values(name = ls[0], sequence = seq, start = ls[1], pvalue = ls[2])
201
202 - def add_to_pssm (self, line):
203 line = line.strip() 204 sl = line.split() 205 thisposition = tuple([float(i) for i in sl]) 206 self.current_motif.add_to_pssm(thisposition)
207
208 - def add_to_logodds (self, line):
209 line = line.strip() 210 sl = line.split() 211 thisposition = tuple([float(i) for i in sl]) 212 self.current_motif.add_to_logodds(thisposition)
213
214 - def noevent (self,line):
215 pass
216 217 218
219 -class _MASTConsumer:
220 """ 221 Consumer that can receive events from _MASTScanner. 222 223 A _MASTConsumer parses lines from a mast text output file. 224 The motif match diagrams are parsed using line buffering. 225 Each of the buffering functions have a dummy variable that is 226 required for testing using the Bio.ParserSupport.TaggingConsumer. 227 If this variable isn't there, the TaggingConsumer barfs. In 228 the _MASTScanner, None is passed in the place of this variable. 229 """
230 - def __init__ (self):
231 self.data = MASTRecord() 232 self._current_seq = "" 233 self._line_buffer = [] 234 self._buffer_size = 0 235 self._buffered_seq_start = 0
236
237 - def _version (self, line):
238 line = line.strip() 239 ls = line.split() 240 self.data._version(ls[2])
241
242 - def _database (self, line):
243 line = line.strip() 244 ls = line.split() 245 self.data._database(ls[1]) 246 al = "" 247 if ls[2] == '(nucleotide)': 248 al = IUPAC.unambiguous_dna 249 self.data._alphabet(al) 250 else: 251 al = IUPAC.protein 252 self.data._alphabet(al)
253
254 - def _add_motif (self, line):
255 line = line.strip() 256 ls = line.split() 257 m = Motif.MEMEMotif() 258 m._alphabet(self.data.alphabet) 259 m._length(ls[1]) 260 name = ls[0] 261 m._name(name) 262 m._consensus(ls[2]) 263 self.data._add_motif(m)
264
265 - def _add_match_diagram (self, line):
266 line = line.strip() 267 ls = line.split() 268 self.data._add_diagram_for_sequence(ls[1], self._current_seq) 269 ds = ls[1].split('_') 270 i = 0 271 start = 0 272 for i in range(0,len(ds)): 273 if ds[i].find('[') != -1 or ds[i].find('<') != -1: 274 inst = Motif.Instance() 275 inst._seqname (self._current_seq) 276 inst._start (start) 277 r = re.compile('\d+') 278 mn = r.findall(ds[i])[0] 279 if ds[i].find('-') != -1: 280 inst.strand = '-' 281 else: 282 inst.strand = '+' 283 motif = self.data.get_motif_by_name(mn) 284 motif.add_instance(inst) 285 start += motif.length 286 else: 287 start += int(ds[i])
288
289 - def _add_sequence_match_with_diagram (self, line):
290 line = line.strip() 291 ls = line.split() 292 self.data._add_sequence(ls[0]) 293 self.data._add_diagram_for_sequence(ls[2],ls[0]) 294 ds = ls[2].split('_') 295 i = 0 296 start = 0 297 for i in range(0,len(ds)): 298 if ds[i].find('+') != -1 or ds[i].find('-') != -1: 299 inst = Motif.Instance() 300 inst._seqname (ls[0]) 301 inst._start (start) 302 r = re.compile('\d+') 303 mn = r.findall(ds[i])[0] 304 if ds[i].find('-') != -1: 305 inst.strand = '-' 306 else: 307 inst.strand = '+' 308 motif = self.data.get_motif_by_name(mn) 309 motif.add_instance(inst) 310 start += motif.length 311 else: 312 start += int(ds[i])
313
314 - def _add_diagram_from_buffer (self, dummy):
315 line = "" 316 for l in self._line_buffer: 317 line += l.strip() 318 ls = line.split() 319 self.data._add_diagram_for_sequence(ls[1], self._current_seq) 320 ds = ls[1].split('_') 321 i = 0 322 start = 0 323 for i in range(0,len(ds)): 324 if ds[i].find('[') != -1 or ds[i].find('<') != -1: 325 inst = Motif.Instance() 326 inst._seqname (self._current_seq) 327 inst._start (start) 328 r = re.compile('\d+') 329 mn = r.findall(ds[i])[0] 330 if ds[i].find('-') != -1: 331 inst.strand = '-' 332 else: 333 inst.strand = '+' 334 motif = self.data.get_motif_by_name(mn) 335 motif.add_instance(inst) 336 start += motif.length 337 else: 338 start += int(ds[i])
339
340 - def _set_current_seq (self, line):
341 line = line.strip() 342 self._current_seq = line 343 if not self.data.sequences.count(line): 344 self.data.sequences.append(line)
345
346 - def _add_line_to_buffer (self, line):
347 line = line.strip() 348 if not line.startswith('*****'): 349 self._line_buffer.append(line) 350 else: 351 return -1
352
353 - def _parse_buffer (self, dummy):
354 """Parses the line buffer to get e-values for each instance of a motif. 355 This buffer parser is the most likely point of failure for the 356 MASTParser. 357 """ 358 insts = self.data.get_motif_matches_for_sequence(self._current_seq) 359 if len(insts) > 0: 360 361 fullSeq = self._line_buffer[self._buffer_size-1] 362 pvals = self._line_buffer[1].split() 363 p = 0 364 lpval = len(pvals) 365 while p < lpval: 366 if pvals[p].count('e') > 1: 367 #Break blocks up by e and parse into valid floats. This only 368 #works if there are no e-values greater than 1e-5. 369 pvs = [] 370 spe = pvals[p].split('e') 371 spe.reverse() 372 dotind = spe[1].find('.') 373 if dotind == -1: 374 thispval = spe[1][-1] + 'e' + spe[0] 375 else: 376 thispval = spe[1][dotind-1:] + 'e' + spe[0] 377 pvs.append(thispval) 378 for spi in range(2,len(spe)): 379 dotind = spe[spi].find('.') 380 prevdotind = spe[spi-1].find('.') 381 if dotind != -1: 382 if prevdotind == -1: 383 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1] 384 else: 385 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1] 386 else: 387 if prevdotind == -1: 388 thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1] 389 else: 390 thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1] 391 pvs.append(thispval) 392 pvs.reverse() 393 if p > 0: 394 pvals = pvals[0:p] + pvs + pvals[p+1:] 395 else: 396 pvals = pvs + pvals[p+1:] 397 lpval = len(pvals) 398 p += 1 399 i = 0 400 if len(pvals) != len(insts): 401 sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n") 402 pvals = [] 403 # else: 404 # sys.stderr.write('These are just fine' + self._current_seq + ': ' + self._line_buffer[1] + " to: " + str(pvals) + "\n") 405 for i in range(0,len(insts)): 406 inst = insts[i] 407 start = inst.start - self._buffered_seq_start + 1 408 thisSeq = fullSeq[start:start+inst.length] 409 thisSeq = Seq.Seq(thisSeq, self.data.alphabet) 410 inst._sequence(thisSeq) 411 if pvals: 412 inst._pvalue(float(pvals[i]))
413
414 - def _blank_buffer (self, dummy):
415 self._line_buffer = [] 416 self._buffer_size = 0
417
418 - def _collapse_buffer(self, dummy):
419 if self._buffer_size == 0: 420 if len(self._line_buffer) > 0: 421 self._buffer_size = len(self._line_buffer) 422 ll = self._line_buffer[self._buffer_size-1].split() 423 self._line_buffer[self._buffer_size-1] = ll[1] 424 self._buffered_seq_start = int(ll[0]) 425 else: 426 i = 0 427 for i in range(self._buffer_size, len(self._line_buffer)-1): 428 self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip() 429 ll = self._line_buffer[len(self._line_buffer)-1].split() 430 if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]): 431 self._line_buffer[self._buffer_size-1] += ll[1] 432 else: 433 differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1])) 434 self._line_buffer[self._buffer_size-1] += "N"*differ 435 self._line_buffer[self._buffer_size-1] += ll[1] 436 self._line_buffer = self._line_buffer[0:self._buffer_size]
437
438 - def _add_motif_match (self, line):
439 line = line.strip() 440 if line.find('[') != -1 or line.find('<') != -1: 441 pass 442 elif line.find('e') != -1: 443 pass 444 elif line.find('+') != -1: 445 pass
446
447 - def noevent (self, line):
448 pass
449 450 451
452 -class MASTParser(AbstractParser):
453 """ 454 Parser for MAST text output. HTML output cannot be parsed, yet. Returns a MASTRecord 455 456 A MASTParser takes a file handle for a MAST text output file and 457 returns a MASTRecord, containing the hits between motifs and 458 sequences. The parser does some unusual line buffering to parse out 459 match diagrams. Really complex diagrams often lead to an error message 460 and p-values not being parsed for a given line. 461 462 Methods: 463 parse (handle): parses the data from the file handle passed to it. 464 465 Example: 466 467 f = open("mast_file.txt") 468 parser = MASTParser() 469 mast_record = parser.parse(f) 470 for motif in mast_record.motifs: 471 for instance in motif.instances: 472 print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 473 """
474 - def __init__ (self):
475 self._consumer = _MASTConsumer() 476 self._scanner = _MASTScanner()
477
478 - def parse (self, handle):
479 self._scanner.feed(handle, self._consumer) 480 return self._consumer.data
481 482 483
484 -class _MASTScanner:
485 """ 486 Scanner for MAST text output. 487 488 """
489 - def feed (self, handle, consumer):
490 if isinstance(handle, File.UndoHandle): 491 uhandle = handle 492 else: 493 uhandle = File.UndoHandle(handle) 494 495 self._scan_header(uhandle, consumer) 496 self._scan_matches(uhandle, consumer) 497 self._scan_annotated_matches(uhandle, consumer)
498
499 - def _scan_header (self, uhandle, consumer):
500 try: 501 read_and_call_until(uhandle, consumer.noevent, contains = "MAST version") 502 except ValueError: 503 raise ValueError, "Improper input file. Does not begin with a line with 'MAST version'" 504 read_and_call(uhandle, consumer._version, contains = 'MAST version') 505 read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS') 506 read_and_call(uhandle, consumer.noevent, start = 'DATABASE') 507 read_and_call(uhandle, consumer.noevent, start = '****') 508 read_and_call(uhandle, consumer._database, contains = 'DATABASE') 509 read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH') 510 read_and_call(uhandle, consumer.noevent, contains = 'MOTIF') 511 read_and_call(uhandle, consumer.noevent, contains = '----') 512 read_and_call_until(uhandle, consumer._add_motif, blank = 1) 513 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
514
515 - def _scan_matches (self, uhandle, consumer):
516 read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME') 517 read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME') 518 read_and_call(uhandle, consumer.noevent, start = '---') 519 # read_and_call_until(uhandle, consumer._add_sequence_match_with_diagram, blank = 1) 520 read_and_call_until(uhandle, consumer.noevent, blank = 1) 521 read_and_call(uhandle, consumer.noevent, blank = 1)
522
523 - def _scan_annotated_matches (self, uhandle, consumer):
524 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:') 525 read_and_call(uhandle, consumer.noevent, start = 'SECTION III:') 526 read_and_call_until(uhandle, consumer.noevent, start = '****') 527 read_and_call(uhandle, consumer.noevent, start = '****') 528 read_and_call_until(uhandle, consumer.noevent, start = '*****') 529 read_and_call(uhandle, consumer.noevent) 530 read_and_call_while(uhandle, consumer.noevent, blank = 1) 531 readMatches = 1 532 while readMatches == 1: 533 if consumer._current_seq: 534 if consumer._buffer_size != 0: 535 consumer._parse_buffer(None) 536 consumer._blank_buffer(None) 537 read_and_call(uhandle, consumer._set_current_seq) 538 read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM') 539 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1) 540 consumer._add_diagram_from_buffer(None) 541 consumer._blank_buffer(None) 542 read_and_call(uhandle, consumer.noevent, blank = 1) 543 while 1: 544 line = safe_peekline(uhandle) 545 if line.startswith('****'): 546 consumer._parse_buffer(None) 547 readMatches = 0 548 break 549 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1) 550 read_and_call(uhandle, consumer.noevent, blank = 1) 551 consumer._collapse_buffer(None) 552 if attempt_read_and_call(uhandle, consumer.noevent, blank = 1): 553 break 554 elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'): 555 consumer._parse_buffer(None) 556 consumer._blank_buffer(None) 557 readMatches = 0 558 break
559 560 561
562 -class MASTRecord:
563 """The class for holding the results from a MAST run. 564 565 A MASTRecord holds data about matches between motifs and sequences. 566 The motifs held by the MASTRecord are objects of the class MEMEMotif. 567 568 Methods: 569 get_motif_matches_for_sequence(sequence_name): returns all of the 570 motif matches within a given sequence. The matches are objects of 571 the class MEME.Motif.Instance 572 get_motif_matches (motif_name): returns all of the matches for a motif 573 in the sequences searched. The matches returned are of class 574 MEME.Motif.Instance 575 get_motif_by_name (motif_name): returns a MEMEMotif with the given 576 name. 577 """
578 - def __init__ (self):
579 self.sequences = [] 580 self.version = "" 581 self.matches = [] 582 self.database = "" 583 self.diagrams = {} 584 self.alphabet = None 585 self.motifs = []
586
587 - def _version (self, version):
588 self.version = version
589
590 - def _alphabet (self, alphabet):
591 if alphabet == IUPAC.protein or alphabet == IUPAC.ambiguous_dna or alphabet == IUPAC.unambiguous_dna: 592 self.alphabet = alphabet 593 else: 594 return -1
595
596 - def _database(self, database):
597 self.database = database
598
599 - def get_motif_matches_for_sequence (self, seq):
600 insts = [] 601 for m in self.motifs: 602 for i in m.instances: 603 if i.sequence_name == seq: 604 insts.append(i) 605 insts.sort(lambda x,y: cmp(x.start, y.start)) 606 return insts
607
608 - def get_motif_matches (self, motif):
609 m = self.get_motif_by_name (motif.name) 610 return m.instances
611
612 - def _add_diagram_for_sequence (self, diagram, seq):
613 self.diagrams[seq] = diagram
614
615 - def _add_match (self, match):
616 self.matches.append(match)
617
618 - def _add_sequence (self, sequence):
620
621 - def _add_motif (self, motif):
622 self.motifs.append(motif)
623
624 - def get_motif_by_name (self, name):
625 for m in self.motifs: 626 if m.name == name: 627 return m
628