1
2
3
4
5 """
6 Parser for (new) ACE files output by PHRAP.
7
8 version 1.3, 05/06/2004
9 Written by Frank Kauff (fkauff@duke.edu) and
10 Cymon J. Cox (cymon@duke.edu)
11
12 Uses the Biopython Parser interface: ParserSupport.py
13
14 Usage:
15
16 There are two ways of reading an ace file: The ACEParser() reads
17 the whole file at once and the RecordParser() reads contig after
18 contig.
19
20 1) Parse whole ace file at once:
21
22 from Bio.Sequencing import Ace
23 aceparser=Ace.ACEParser()
24 acefilerecord=aceparser.parse(open('my_ace_file.ace','r'))
25
26 This gives you:
27 acefilerecord.ncontigs (the number of contigs in the ace file)
28 acefilerecord.nreads (the number of reads in the ace file)
29 acefilerecord.contigs[] (one instance of the Contig class for each contig)
30
31 The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
32 for this contig in a list of instances of the Read class, e.g.:
33
34 contig3=acefilerecord.contigs[2]
35 read4=contig3.reads[3]
36 RD_of_read4=read4.rd
37 DS_of_read4=read4.ds
38
39 CT, WA, RT tags from the end of the file can appear anywhere are automatically
40 sorted into the right place.
41
42 see _RecordConsumer for details.
43
44 2) Or you can iterate over the contigs of an ace file one by one in the ususal way:
45
46 from Bio.Sequencing import Ace
47 recordparser=Ace.RecordParser()
48 iterator=Ace.Iterator(open('my_ace_file.ace','r'),recordparser)
49 for contig in iterator :
50 print contig.name
51 ...
52
53 Please note that for memory efficiency, when using the iterator approach, only one
54 contig is kept in memory at once. However, there can be a footer to the ACE file
55 containing WA, CT, RT or WR tags which contain additional meta-data on the contigs.
56 Because the parser doesn't see this data until the final record, it cannot be added to
57 the appropriate records. Instead these tags will be returned with the last contig record.
58 Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags
59 are needed, the ACEParser instead of the RecordParser might be appropriate.
60 """
61 import os
62 from Bio import File
63 from Bio.ParserSupport import *
64 from Bio.Alphabet import IUPAC
65
66
68 """RD (reads), store a read with its name, sequence etc."""
70 self.name=''
71 self.padded_bases=None
72 self.info_items=None
73 self.read_tags=None
74 self.sequence=''
75
77 """QA (read quality), including which part if any was used as the consensus."""
79 self.qual_clipping_start=None
80 self.qual_clipping_end=None
81 self.align_clipping_start=None
82 self.align_clipping_end=None
83
85 """DS lines, include file name of a read's chromatogram file."""
87 self.chromat_file=''
88 self.phd_file=''
89 self.time=''
90 self.chem=''
91 self.dye=''
92 self.template=''
93 self.direction=''
94
96 """AF lines, define the location of the read within the contig."""
98 self.name=''
99 self.coru=None
100 self.padded_start=None
101
103 """"BS (base segment), which read was chosen as the consensus at each position."""
105 self.name=''
106 self.padded_start=None
107 self.padded_end=None
108
110 """RT (transient read tags), generated by crossmatch and phrap."""
112 self.name=''
113 self.tag_type=''
114 self.program=''
115 self.padded_start=None
116 self.padded_end=None
117 self.date=''
118
120 """CT (consensus tags)."""
122 self.name=''
123 self.tag_type=''
124 self.program=''
125 self.padded_start=None
126 self.padded_end=None
127 self.date=''
128 self.notrans=''
129 self.info=[]
130
132 """WA (whole assembly tag), holds the assembly program name, version, etc."""
134 self.tag_type=''
135 self.program=''
136 self.date=''
137 self.info=[]
138
140 """WR lines."""
142 self.name=''
143 self.aligned=''
144 self.program=''
145 self.date=[]
146
148 """Holds information about a read supporting an ACE contig."""
150 self.rd=None
151 self.qa=None
152 self.ds=None
153 self.rt=None
154 self.wr=None
155
157 """Holds information about a contig from an ACE record."""
159 self.name = ''
160 self.nbases=None
161 self.nreads=None
162 self.nsegments=None
163 self.uorc=None
164 self.sequence=None
165 self.quality=None
166 self.af=[]
167 self.bs=[]
168 self.reads=[]
169 self.ct=None
170 self.wa=None
171
173 """Iterates over an ACE-file with multiple contigs.
174
175 Methods:
176 next Return the next record from the stream, or None.
177 """
178
179 - def __init__(self, handle, parser=None):
180 """__init__(self, handle, parser=None)
181
182 Create a new iterator. handle is a file-like object. parser
183 is an optional Parser object to change the results into another form.
184 If set to None, then the raw contents of the file will be returned.
185 """
186 self._uhandle = File.UndoHandle(handle)
187 self._parser = parser
188
190 """next(self) -> object
191
192 Return the next contig record from the file. If no more records
193 return None.
194 """
195
196 lines = []
197 while 1:
198
199 line=self._uhandle.readline()
200 if not line:
201 return None
202 if line[:2]=='CO':
203 lines.append(line)
204 break
205 while 1:
206 line = self._uhandle.readline()
207 if not line:
208 break
209
210 if lines and line[:2] == 'CO':
211 self._uhandle.saveline(line)
212 break
213 lines.append(line)
214
215 if not lines:
216 return None
217
218 data = ''.join(lines)
219 if self._parser is not None:
220 return self._parser.parse(File.StringHandle(data))
221 return data
222
224 """Iterate over the ACE file record by record."""
225 return iter(self.next, None)
226
228 """Parses ACE file data into a Record object."""
232
233 - def parse(self, handle):
240
242 """Holds data of an ACE file.
243 """
245 self.ncontigs=None
246 self.nreads=None
247 self.contigs=[]
248 self.wa=None
249
251 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """
252
253 ct=[]
254 rt=[]
255 wr=[]
256
257 for i in range(len(self.contigs)):
258 c = self.contigs[i]
259 if c.wa:
260 if not self.wa:
261 self.wa=[]
262 self.wa.extend(c.wa)
263 if c.ct:
264 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name]
265 map(self.contigs[i].ct.remove,newcts)
266 ct.extend(newcts)
267 for j in range(len(c.reads)):
268 r = c.reads[j]
269 if r.rt:
270 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name]
271 map(self.contigs[i].reads[j].rt.remove,newrts)
272 rt.extend(newrts)
273 if r.wr:
274 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name]
275 map(self.contigs[i].reads[j].wr.remove,newwrs)
276 wr.extend(newwrs)
277
278 for i in range(len(self.contigs)):
279 c = self.contigs[i]
280 for ct_tag in ct:
281 if ct_tag.name==c.name:
282 if self.contigs[i].ct is None:
283 self.contigs[i].ct=[]
284 self.contigs[i].ct.append(ct_tag)
285 if rt or wr:
286 for j in range(len(c.reads)):
287 r = c.reads[j]
288 for rt_tag in rt:
289 if rt_tag.name==r.rd.name:
290 if self.contigs[i].reads[j].rt is None:
291 self.contigs[i].reads[j].rt=[]
292 self.contigs[i].reads[j].rt.append(rt_tag)
293 for wr_tag in wr:
294 if wr_tag.name==r.rd.name:
295 if self.contigs[i].reads[j].wr is None:
296 self.contigs[i].reads[j].wr=[]
297 self.contigs[i].reads[j].wr.append(wr_tag)
298
300 """Parses full ACE file in list of contigs.
301
302 """
303
306
308 firstline=handle.readline()
309
310 if firstline[:2]!='AS':
311 raise ValueError, "File does not start with 'AS'."
312 self.data.ncontigs=eval(firstline.split()[1])
313 self.data.nreads=eval(firstline.split()[2])
314
315 recparser=RecordParser()
316 iter=Iterator(handle,recparser)
317 while 1:
318 rec=iter.next()
319 if not rec:
320 break
321 self.data.contigs.append(rec)
322
323
324
325
326
327
328 self.data.sort()
329 return self.data
330
332 """Scans an ACE-formatted file.
333
334 Methods:
335 feed - Feed one ACE record.
336 """
337 - def feed(self, handle, consumer):
338 """feed(self, handle, consumer)
339
340 Feed in ACE data for scanning. handle is a file-like object
341 containing ACE data. consumer is a Consumer object that will
342 receive events as the ACE data is scanned.
343 """
344 assert isinstance(handle, File.UndoHandle), \
345 "handle must be an UndoHandle"
346 if handle.peekline():
347 self._scan_record(handle, consumer)
348
350 consumer.begin_contig()
351 read_and_call(uhandle,consumer.co_header,start='CO ')
352 consumer.co_data(self._scan_sequence_data(uhandle))
353 read_and_call_while(uhandle,consumer.noevent,blank=1)
354 read_and_call(uhandle,consumer.bq_header,start='BQ')
355 consumer.bq_data(self._scan_bq_data(uhandle, consumer))
356 read_and_call_while(uhandle,consumer.noevent,blank=1)
357 read_and_call_while(uhandle,consumer.af,start='AF ')
358 read_and_call_while(uhandle,consumer.noevent,blank=1)
359 read_and_call_while(uhandle,consumer.bs,start='BS ')
360
361
362
363
364
365
366 while 1:
367
368 read_and_call_until(uhandle,consumer.noevent,start='RD ')
369 read_and_call(uhandle,consumer.rd_header,start='RD ')
370 consumer.rd_data(self._scan_sequence_data(uhandle))
371 read_and_call_while(uhandle,consumer.noevent,blank=1)
372 read_and_call(uhandle,consumer.qa,start='QA ')
373
374 try:
375 read_and_call_while(uhandle,consumer.noevent,blank=1)
376 attempt_read_and_call(uhandle,consumer.ds,start='DS ')
377 except ValueError:
378
379 consumer.end_contig()
380 return
381
382
383 while 1:
384
385 try:
386 read_and_call_while(uhandle,consumer.noevent,blank=1)
387 except ValueError:
388
389 consumer.end_contig()
390 return
391 else:
392 if attempt_read_and_call(uhandle,consumer.rt_start,start='RT'):
393 consumer.rt_data(self._scan_bracket_tags(uhandle))
394 elif attempt_read_and_call(uhandle,consumer.wr_start,start='WR'):
395 consumer.wr_data(self._scan_bracket_tags(uhandle))
396 elif attempt_read_and_call(uhandle,consumer.wa_start,start='WA'):
397 consumer.wa_data(self._scan_bracket_tags(uhandle))
398 elif attempt_read_and_call(uhandle,consumer.ct_start,start='CT'):
399 consumer.ct_data(self._scan_bracket_tags(uhandle))
400 else:
401 line=safe_peekline(uhandle)
402 break
403 if not line.startswith('RD'):
404 consumer.end_contig()
405 break
406
408 """Scans multiple lines of quality data and concatenates them."""
409
410 qual=''
411 while 1:
412 line=uhandle.readline()
413 if is_blank_line(line):
414 uhandle.saveline(line)
415 break
416 qual+=' '+line
417 return qual
418
420 """Scans multiple lines of sequence data and concatenates them."""
421
422 seq=''
423 while 1:
424 line=uhandle.readline()
425 if is_blank_line(line):
426 uhandle.saveline(line)
427 break
428 seq+=line.strip()
429 return seq
430
444
446 """Reads the ace tags into data records."""
447
450
453
456
464
467
470
473
481
489
500
503
505 header=map(eval,line.split()[1:])
506 qadata=qa()
507 qadata.qual_clipping_start=header[0]
508 qadata.qual_clipping_end=header[1]
509 qadata.align_clipping_start=header[2]
510 qadata.align_clipping_end=header[3]
511 self.data.reads[-1].qa=qadata
512
514 dsdata=ds()
515 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION']
516 poss=map(line.find,tags)
517 tagpos=dict(zip(poss,tags))
518 if tagpos.has_key(-1):
519 del tagpos[-1]
520 ps=tagpos.keys()
521 ps.sort()
522 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]):
523 setattr(dsdata,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
524 self.data.reads[-1].ds=dsdata
525
527 if not line.strip().endswith('{'):
528 print line
529 raise ValueError, 'CT tag does not start with CT{'
530 ctdata=ct()
531 if self.data.ct is None:
532 self.data.ct=[]
533 self.data.ct.append(ctdata)
534
548
550 if not line.strip().endswith('{'):
551 raise ValueError, 'RT tag does not start with RT{'
552 rtdata=rt()
553
554
555 if self.data.reads[-1].rt is None:
556 self.data.reads[-1].rt=[]
557 self.data.reads[-1].rt.append(rtdata)
558
569
571 if not line.strip().endswith('{'):
572 raise ValueError, 'WA tag does not start with WA{'
573 wadata=wa()
574 if self.data.wa is None:
575 self.data.wa=[]
576 self.data.wa.append(wadata)
577
586
588 if not line.strip().endswith('{'):
589 raise ValueError, 'WR tag does not start with WR{'
590 wrdata=wr()
591 if self.data.reads[-1].wr is None:
592 self.data.reads[-1].wr=[]
593 self.data.reads[-1].wr.append(wrdata)
594
603
604 if __name__ == "__main__" :
605 print "Quick self test"
606
607 handle = open("../../Tests/Ace/contig1.ace")
608 recordparser = RecordParser()
609 iterator = Iterator(handle,recordparser)
610 for contig in iterator :
611 print contig.name, len(contig.sequence)
612 handle.close()
613 print "Done"
614