1
2
3
4
5 """
6 Parser for ACE files output by PHRAP.
7
8 Written by Frank Kauff (fkauff@duke.edu) and
9 Cymon J. Cox (cymon@duke.edu)
10
11 Uses the Biopython Parser interface: ParserSupport.py
12
13 Usage:
14
15 There are two ways of reading an ace file:
16 1) The function 'read' reads the whole file at once;
17 2) The function 'parse' reads the file contig after contig.
18
19 1) Parse whole ace file at once:
20
21 from Bio.Sequencing import Ace
22 acefilerecord=Ace.read(open('my_ace_file.ace'))
23
24 This gives you:
25 acefilerecord.ncontigs (the number of contigs in the ace file)
26 acefilerecord.nreads (the number of reads in the ace file)
27 acefilerecord.contigs[] (one instance of the Contig class for each contig)
28
29 The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
30 for this contig in a list of instances of the Read class, e.g.:
31
32 contig3=acefilerecord.contigs[2]
33 read4=contig3.reads[3]
34 RD_of_read4=read4.rd
35 DS_of_read4=read4.ds
36
37 CT, WA, RT tags from the end of the file can appear anywhere are automatically
38 sorted into the right place.
39
40 see _RecordConsumer for details.
41
42 2) Or you can iterate over the contigs of an ace file one by one in the ususal way:
43
44 from Bio.Sequencing import Ace
45 contigs=Ace.parse(open('my_ace_file.ace'))
46 for contig in contigs:
47 print contig.name
48 ...
49
50 Please note that for memory efficiency, when using the iterator approach, only one
51 contig is kept in memory at once. However, there can be a footer to the ACE file
52 containing WA, CT, RT or WR tags which contain additional meta-data on the contigs.
53 Because the parser doesn't see this data until the final record, it cannot be added to
54 the appropriate records. Instead these tags will be returned with the last contig record.
55 Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags
56 are needed, the 'read' function rather than the 'parse' function might be more appropriate.
57 """
58
59
61 """RD (reads), store a read with its name, sequence etc."""
63 self.name=''
64 self.padded_bases=None
65 self.info_items=None
66 self.read_tags=None
67 self.sequence=''
68
70 """QA (read quality), including which part if any was used as the consensus."""
72 self.qual_clipping_start=None
73 self.qual_clipping_end=None
74 self.align_clipping_start=None
75 self.align_clipping_end=None
76 if line:
77 header=map(eval,line.split()[1:])
78 self.qual_clipping_start=header[0]
79 self.qual_clipping_end=header[1]
80 self.align_clipping_start=header[2]
81 self.align_clipping_end=header[3]
82
84 """DS lines, include file name of a read's chromatogram file."""
86 self.chromat_file=''
87 self.phd_file=''
88 self.time=''
89 self.chem=''
90 self.dye=''
91 self.template=''
92 self.direction=''
93 if line:
94 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION']
95 poss=map(line.find,tags)
96 tagpos=dict(zip(poss,tags))
97 if -1 in tagpos:
98 del tagpos[-1]
99 ps=tagpos.keys()
100 ps.sort()
101 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]):
102 setattr(self,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
103
104
106 """AF lines, define the location of the read within the contig."""
108 self.name=''
109 self.coru=None
110 self.padded_start=None
111 if line:
112 header = line.split()
113 self.name = header[1]
114 self.coru = header[2]
115 self.padded_start = int(header[3])
116
118 """"BS (base segment), which read was chosen as the consensus at each position."""
120 self.name=''
121 self.padded_start=None
122 self.padded_end=None
123 if line:
124 header = line.split()
125 self.padded_start = int(header[1])
126 self.padded_end = int(header[2])
127 self.name = header[3]
128
130 """RT (transient read tags), generated by crossmatch and phrap."""
132 self.name=''
133 self.tag_type=''
134 self.program=''
135 self.padded_start=None
136 self.padded_end=None
137 self.date=''
138 if line:
139 header=line.split()
140 self.name=header[0]
141 self.tag_type=header[1]
142 self.program=header[2]
143 self.padded_start=int(header[3])
144 self.padded_end=int(header[4])
145 self.date=header[5]
146
148 """CT (consensus tags)."""
150 self.name=''
151 self.tag_type=''
152 self.program=''
153 self.padded_start=None
154 self.padded_end=None
155 self.date=''
156 self.notrans=''
157 self.info=[]
158 self.comment=[]
159 if line:
160 header=line.split()
161 self.name = header[0]
162 self.tag_type = header[1]
163 self.program = header[2]
164 self.padded_start = int(header[3])
165 self.padded_end = int(header[4])
166 self.date = header[5]
167 if len(header)==7:
168 self.notrans = header[6]
169
171 """WA (whole assembly tag), holds the assembly program name, version, etc."""
173 self.tag_type=''
174 self.program=''
175 self.date=''
176 self.info=[]
177 if line:
178 header = line.split()
179 self.tag_type = header[0]
180 self.program = header[1]
181 self.date = header[2]
182
196
198 """Holds information about a read supporting an ACE contig."""
212
214 """Holds information about a contig from an ACE record."""
216 self.name = ''
217 self.nbases=None
218 self.nreads=None
219 self.nsegments=None
220 self.uorc=None
221 self.sequence=""
222 self.quality=[]
223 self.af=[]
224 self.bs=[]
225 self.reads=[]
226 self.ct=None
227 self.wa=None
228 if line:
229 header = line.split()
230 self.name = header[1]
231 self.nbases = int(header[2])
232 self.nreads = int(header[3])
233 self.nsegments = int(header[4])
234 self.uorc = header[5]
235
237 """parse(handle)
238
239 where handle is a file-like object.
240
241 This function returns an iterator that allows you to iterate
242 over the ACE file record by record:
243
244 records = parse(handle)
245 for record in records:
246 # do something with the record
247
248 where each record is a Contig object.
249 """
250
251 handle = iter(handle)
252
253 line = ""
254 while True:
255
256 try:
257 while True:
258 if line.startswith('CO'):
259 break
260 line = handle.next()
261 except StopIteration:
262 return
263
264 record = Contig(line)
265
266 for line in handle:
267 line = line.strip()
268 if not line:
269 break
270 record.sequence+=line
271
272 for line in handle:
273 if line.strip():
274 break
275 if not line.startswith("BQ"):
276 raise ValueError("Failed to find BQ line")
277
278 for line in handle:
279 if not line.strip():
280 break
281 record.quality.extend(map(int,line.split()))
282
283 for line in handle:
284 if line.strip():
285 break
286
287 while True:
288 if not line.startswith("AF "):
289 break
290 record.af.append(af(line))
291 try:
292 line = handle.next()
293 except StopIteration:
294 raise ValueError("Unexpected end of AF block")
295
296 while True:
297 if line.strip():
298 break
299 try:
300 line = handle.next()
301 except StopIteration:
302 raise ValueError("Unexpected end of file")
303
304 while True:
305 if not line.startswith("BS "):
306 break
307 record.bs.append(bs(line))
308 try:
309 line = handle.next()
310 except StopIteration:
311 raise ValueError("Failed to find end of BS block")
312
313
314
315
316
317
318
319 while True:
320
321
322 try:
323 while True:
324
325 if line.startswith("RD "):
326 break
327 line = handle.next()
328 except StopIteration:
329 raise ValueError("Failed to find RD line")
330
331 record.reads.append(Reads(line))
332
333 for line in handle:
334 line = line.strip()
335 if not line:
336 break
337 record.reads[-1].rd.sequence+=line
338
339 for line in handle:
340 if line.strip():
341 break
342 if not line.startswith("QA "):
343 raise ValueError("Failed to find QA line")
344 record.reads[-1].qa = qa(line)
345
346
347 for line in handle:
348 if line.strip():
349 break
350 else:
351 break
352
353 if line.startswith("DS "):
354 record.reads[-1].ds = ds(line)
355 line = ""
356
357
358 while True:
359
360 try:
361 while True:
362 if line.strip():
363 break
364 line = handle.next()
365 except StopIteration:
366
367 break
368 if line.startswith("RT{"):
369
370
371
372 if record.reads[-1].rt is None:
373 record.reads[-1].rt=[]
374 for line in handle:
375 line=line.strip()
376 if line=='}': break
377 record.reads[-1].rt.append(rt(line))
378 line = ""
379 elif line.startswith("WR{"):
380 if record.reads[-1].wr is None:
381 record.reads[-1].wr=[]
382 for line in handle:
383 line=line.strip()
384 if line=='}': break
385 record.reads[-1].wr.append(wr(line))
386 line = ""
387 elif line.startswith("WA{"):
388 if record.wa is None:
389 record.wa=[]
390 try:
391 line = handle.next()
392 except StopIteration:
393 raise ValueError("Failed to read WA block")
394 record.wa.append(wa(line))
395 for line in handle:
396 line=line.strip()
397 if line=='}': break
398 record.wa[-1].info.append(line)
399 line = ""
400 elif line.startswith("CT{"):
401 if record.ct is None:
402 record.ct=[]
403 try:
404 line = handle.next()
405 except StopIteration:
406 raise ValueError("Failed to read CT block")
407 record.ct.append(ct(line))
408 for line in handle:
409 line=line.strip()
410 if line=="COMMENT{":
411 for line in handle:
412 line = line.strip()
413 if line.endswith("C}"):
414 break
415 record.ct[-1].comment.append(line)
416 elif line=='}':
417 break
418 else:
419 record.ct[-1].info.append(line)
420 line = ""
421 else:
422 break
423
424 if not line.startswith('RD'):
425 break
426
427 yield record
428
430 """Holds data of an ACE file.
431 """
433 self.ncontigs=None
434 self.nreads=None
435 self.contigs=[]
436 self.wa=None
437
439 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """
440
441 ct=[]
442 rt=[]
443 wr=[]
444
445 for i in range(len(self.contigs)):
446 c = self.contigs[i]
447 if c.wa:
448 if not self.wa:
449 self.wa=[]
450 self.wa.extend(c.wa)
451 if c.ct:
452 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name]
453 map(self.contigs[i].ct.remove,newcts)
454 ct.extend(newcts)
455 for j in range(len(c.reads)):
456 r = c.reads[j]
457 if r.rt:
458 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name]
459 map(self.contigs[i].reads[j].rt.remove,newrts)
460 rt.extend(newrts)
461 if r.wr:
462 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name]
463 map(self.contigs[i].reads[j].wr.remove,newwrs)
464 wr.extend(newwrs)
465
466 for i in range(len(self.contigs)):
467 c = self.contigs[i]
468 for ct_tag in ct:
469 if ct_tag.name==c.name:
470 if self.contigs[i].ct is None:
471 self.contigs[i].ct=[]
472 self.contigs[i].ct.append(ct_tag)
473 if rt or wr:
474 for j in range(len(c.reads)):
475 r = c.reads[j]
476 for rt_tag in rt:
477 if rt_tag.name==r.rd.name:
478 if self.contigs[i].reads[j].rt is None:
479 self.contigs[i].reads[j].rt=[]
480 self.contigs[i].reads[j].rt.append(rt_tag)
481 for wr_tag in wr:
482 if wr_tag.name==r.rd.name:
483 if self.contigs[i].reads[j].wr is None:
484 self.contigs[i].reads[j].wr=[]
485 self.contigs[i].reads[j].wr.append(wr_tag)
486
518
519
520
521 from Bio import File
522 from Bio.ParserSupport import *
523
525 """Iterates over an ACE-file with multiple contigs.
526
527 Methods:
528 next Return the next record from the stream, or None.
529 """
530
531 - def __init__(self, handle, parser=None):
532 """__init__(self, handle, parser=None)
533
534 Create a new iterator. handle is a file-like object. parser
535 is an optional Parser object to change the results into another form.
536 If set to None, then the raw contents of the file will be returned.
537 """
538 import warnings
539 warnings.warn("Ace.Iterator is deprecated. Instead of Ace.Iterator(handle, Ace.RecordParser()), please use Ace.parse(handle)", DeprecationWarning)
540 self._uhandle = File.UndoHandle(handle)
541 self._parser = parser
542
544 """next(self) -> object
545
546 Return the next contig record from the file. If no more records
547 return None.
548 """
549
550 lines = []
551 while 1:
552
553 line=self._uhandle.readline()
554 if not line:
555 return None
556 if line[:2]=='CO':
557 lines.append(line)
558 break
559 while 1:
560 line = self._uhandle.readline()
561 if not line:
562 break
563
564 if lines and line[:2] == 'CO':
565 self._uhandle.saveline(line)
566 break
567 lines.append(line)
568
569 if not lines:
570 return None
571
572 data = ''.join(lines)
573 if self._parser is not None:
574 return self._parser.parse(File.StringHandle(data))
575 return data
576
578 """Iterate over the ACE file record by record."""
579 return iter(self.next, None)
580
582 """Parses ACE file data into a Record object."""
586
587 - def parse(self, handle):
594
596 """Parses full ACE file in list of contigs.
597
598 """
599
601 import warnings
602 warnings.warn("Ace.ACEParser is deprecated. Instead of Ace.ACEParser().parse(handle), please use Ace.read(handle)", DeprecationWarning)
603 self.data=ACEFileRecord()
604
606 firstline=handle.readline()
607
608 if firstline[:2]!='AS':
609 raise ValueError("File does not start with 'AS'.")
610 self.data.ncontigs=eval(firstline.split()[1])
611 self.data.nreads=eval(firstline.split()[2])
612
613 records = parse(handle)
614 self.data.contigs = list(records)
615
616
617
618
619
620
621 self.data.sort()
622 return self.data
623
625 """Scans an ACE-formatted file.
626
627 Methods:
628 feed - Feed one ACE record.
629 """
630 - def feed(self, handle, consumer):
631 """feed(self, handle, consumer)
632
633 Feed in ACE data for scanning. handle is a file-like object
634 containing ACE data. consumer is a Consumer object that will
635 receive events as the ACE data is scanned.
636 """
637 assert isinstance(handle, File.UndoHandle), \
638 "handle must be an UndoHandle"
639 if handle.peekline():
640 self._scan_record(handle, consumer)
641
643 consumer.begin_contig()
644 read_and_call(uhandle,consumer.co_header,start='CO ')
645 consumer.co_data(self._scan_sequence_data(uhandle))
646 read_and_call_while(uhandle,consumer.noevent,blank=1)
647 read_and_call(uhandle,consumer.bq_header,start='BQ')
648 consumer.bq_data(self._scan_bq_data(uhandle, consumer))
649 read_and_call_while(uhandle,consumer.noevent,blank=1)
650 read_and_call_while(uhandle,consumer.af,start='AF ')
651 read_and_call_while(uhandle,consumer.noevent,blank=1)
652 read_and_call_while(uhandle,consumer.bs,start='BS ')
653
654
655
656
657
658
659 while 1:
660
661 read_and_call_until(uhandle,consumer.noevent,start='RD ')
662 read_and_call(uhandle,consumer.rd_header,start='RD ')
663 consumer.rd_data(self._scan_sequence_data(uhandle))
664 read_and_call_while(uhandle,consumer.noevent,blank=1)
665 read_and_call(uhandle,consumer.qa,start='QA ')
666
667 try:
668 read_and_call_while(uhandle,consumer.noevent,blank=1)
669 attempt_read_and_call(uhandle,consumer.ds,start='DS ')
670 except ValueError:
671
672 consumer.end_contig()
673 return
674
675
676 while 1:
677
678 try:
679 read_and_call_while(uhandle,consumer.noevent,blank=1)
680 except ValueError:
681
682 consumer.end_contig()
683 return
684 else:
685 if attempt_read_and_call(uhandle,consumer.rt_start,start='RT'):
686 consumer.rt_data(self._scan_bracket_tags(uhandle))
687 elif attempt_read_and_call(uhandle,consumer.wr_start,start='WR'):
688 consumer.wr_data(self._scan_bracket_tags(uhandle))
689 elif attempt_read_and_call(uhandle,consumer.wa_start,start='WA'):
690 consumer.wa_data(self._scan_bracket_tags(uhandle))
691 elif attempt_read_and_call(uhandle,consumer.ct_start,start='CT'):
692 consumer.ct_data(self._scan_bracket_tags(uhandle))
693 else:
694 line=safe_peekline(uhandle)
695 break
696 if not line.startswith('RD'):
697 consumer.end_contig()
698 break
699
701 """Scans multiple lines of quality data and concatenates them."""
702
703 qual=''
704 while 1:
705 line=uhandle.readline()
706 if is_blank_line(line):
707 uhandle.saveline(line)
708 break
709 qual+=' '+line
710 return qual
711
713 """Scans multiple lines of sequence data and concatenates them."""
714
715 seq=''
716 while 1:
717 line=uhandle.readline()
718 if is_blank_line(line):
719 uhandle.saveline(line)
720 break
721 seq+=line.strip()
722 return seq
723
737
739 """Reads the ace tags into data records."""
740
743
746
749
757
760
763
766
774
782
793
796
798 header=map(eval,line.split()[1:])
799 qadata=qa()
800 qadata.qual_clipping_start=header[0]
801 qadata.qual_clipping_end=header[1]
802 qadata.align_clipping_start=header[2]
803 qadata.align_clipping_end=header[3]
804 self.data.reads[-1].qa=qadata
805
807 dsdata=ds()
808 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION']
809 poss=map(line.find,tags)
810 tagpos=dict(zip(poss,tags))
811 if -1 in tagpos:
812 del tagpos[-1]
813 ps=tagpos.keys()
814 ps.sort()
815 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]):
816 setattr(dsdata,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
817 self.data.reads[-1].ds=dsdata
818
826
840
842 if not line.strip().endswith('{'):
843 raise ValueError('RT tag does not start with RT{')
844 rtdata=rt()
845
846
847 if self.data.reads[-1].rt is None:
848 self.data.reads[-1].rt=[]
849 self.data.reads[-1].rt.append(rtdata)
850
861
869
878
880 if not line.strip().endswith('{'):
881 raise ValueError('WR tag does not start with WR{')
882 wrdata=wr()
883 if self.data.reads[-1].wr is None:
884 self.data.reads[-1].wr=[]
885 self.data.reads[-1].wr.append(wrdata)
886
895
896 if __name__ == "__main__" :
897 print "Quick self test"
898
899 handle = open("../../Tests/Ace/contig1.ace")
900 contigs = parse(handle)
901 for contig in contigs:
902 print contig.name, len(contig.sequence), len(contig.reads)
903 handle.close()
904 print "Done"
905