1
2
3
4
5 """
6 Parser for ACE files output by PHRAP.
7
8 Written by Frank Kauff (fkauff@duke.edu) and
9 Cymon J. Cox (cymon@duke.edu)
10
11 Uses the Biopython Parser interface: ParserSupport.py
12
13 Usage:
14
15 There are two ways of reading an ace file:
16 1) The function 'read' reads the whole file at once;
17 2) The function 'parse' reads the file contig after contig.
18
19 1) Parse whole ace file at once:
20
21 from Bio.Sequencing import Ace
22 acefilerecord=Ace.read(open('my_ace_file.ace'))
23
24 This gives you:
25 acefilerecord.ncontigs (the number of contigs in the ace file)
26 acefilerecord.nreads (the number of reads in the ace file)
27 acefilerecord.contigs[] (one instance of the Contig class for each contig)
28
29 The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
30 for this contig in a list of instances of the Read class, e.g.:
31
32 contig3=acefilerecord.contigs[2]
33 read4=contig3.reads[3]
34 RD_of_read4=read4.rd
35 DS_of_read4=read4.ds
36
37 CT, WA, RT tags from the end of the file can appear anywhere are automatically
38 sorted into the right place.
39
40 see _RecordConsumer for details.
41
42 2) Or you can iterate over the contigs of an ace file one by one in the ususal way:
43
44 from Bio.Sequencing import Ace
45 contigs=Ace.parse(open('my_ace_file.ace'))
46 for contig in contigs:
47 print contig.name
48 ...
49
50 Please note that for memory efficiency, when using the iterator approach, only one
51 contig is kept in memory at once. However, there can be a footer to the ACE file
52 containing WA, CT, RT or WR tags which contain additional meta-data on the contigs.
53 Because the parser doesn't see this data until the final record, it cannot be added to
54 the appropriate records. Instead these tags will be returned with the last contig record.
55 Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags
56 are needed, the 'read' function rather than the 'parse' function might be more appropriate.
57 """
58
59
61 """RD (reads), store a read with its name, sequence etc."""
63 self.name=''
64 self.padded_bases=None
65 self.info_items=None
66 self.read_tags=None
67 self.sequence=''
68
70 """QA (read quality), including which part if any was used as the consensus."""
72 self.qual_clipping_start=None
73 self.qual_clipping_end=None
74 self.align_clipping_start=None
75 self.align_clipping_end=None
76 if line:
77 header=map(eval,line.split()[1:])
78 self.qual_clipping_start=header[0]
79 self.qual_clipping_end=header[1]
80 self.align_clipping_start=header[2]
81 self.align_clipping_end=header[3]
82
84 """DS lines, include file name of a read's chromatogram file."""
86 self.chromat_file=''
87 self.phd_file=''
88 self.time=''
89 self.chem=''
90 self.dye=''
91 self.template=''
92 self.direction=''
93 if line:
94 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION']
95 poss=map(line.find,tags)
96 tagpos=dict(zip(poss,tags))
97 if -1 in tagpos:
98 del tagpos[-1]
99 ps=tagpos.keys()
100 ps.sort()
101 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]):
102 setattr(self,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
103
104
106 """AF lines, define the location of the read within the contig."""
108 self.name=''
109 self.coru=None
110 self.padded_start=None
111 if line:
112 header = line.split()
113 self.name = header[1]
114 self.coru = header[2]
115 self.padded_start = int(header[3])
116
118 """"BS (base segment), which read was chosen as the consensus at each position."""
120 self.name=''
121 self.padded_start=None
122 self.padded_end=None
123 if line:
124 header = line.split()
125 self.padded_start = int(header[1])
126 self.padded_end = int(header[2])
127 self.name = header[3]
128
130 """RT (transient read tags), generated by crossmatch and phrap."""
132 self.name=''
133 self.tag_type=''
134 self.program=''
135 self.padded_start=None
136 self.padded_end=None
137 self.date=''
138 if line:
139 header=line.split()
140 self.name=header[0]
141 self.tag_type=header[1]
142 self.program=header[2]
143 self.padded_start=int(header[3])
144 self.padded_end=int(header[4])
145 self.date=header[5]
146
148 """CT (consensus tags)."""
150 self.name=''
151 self.tag_type=''
152 self.program=''
153 self.padded_start=None
154 self.padded_end=None
155 self.date=''
156 self.notrans=''
157 self.info=[]
158 self.comment=[]
159 if line:
160 header=line.split()
161 self.name = header[0]
162 self.tag_type = header[1]
163 self.program = header[2]
164 self.padded_start = int(header[3])
165 self.padded_end = int(header[4])
166 self.date = header[5]
167 if len(header)==7:
168 self.notrans = header[6]
169
171 """WA (whole assembly tag), holds the assembly program name, version, etc."""
173 self.tag_type=''
174 self.program=''
175 self.date=''
176 self.info=[]
177 if line:
178 header = line.split()
179 self.tag_type = header[0]
180 self.program = header[1]
181 self.date = header[2]
182
196
198 """Holds information about a read supporting an ACE contig."""
212
214 """Holds information about a contig from an ACE record."""
216 self.name = ''
217 self.nbases=None
218 self.nreads=None
219 self.nsegments=None
220 self.uorc=None
221 self.sequence=""
222 self.quality=[]
223 self.af=[]
224 self.bs=[]
225 self.reads=[]
226 self.ct=None
227 self.wa=None
228 if line:
229 header = line.split()
230 self.name = header[1]
231 self.nbases = int(header[2])
232 self.nreads = int(header[3])
233 self.nsegments = int(header[4])
234 self.uorc = header[5]
235
237 """parse(handle)
238
239 where handle is a file-like object.
240
241 This function returns an iterator that allows you to iterate
242 over the ACE file record by record:
243
244 records = parse(handle)
245 for record in records:
246 # do something with the record
247
248 where each record is a Contig object.
249 """
250
251 handle = iter(handle)
252
253 line = ""
254 while True:
255
256 try:
257 while True:
258 if line.startswith('CO'):
259 break
260 line = handle.next()
261 except StopIteration:
262 return
263
264 record = Contig(line)
265
266 for line in handle:
267 line = line.strip()
268 if not line:
269 break
270 record.sequence+=line
271
272 for line in handle:
273 if line.strip():
274 break
275 if not line.startswith("BQ"):
276 raise ValueError("Failed to find BQ line")
277
278 for line in handle:
279 if not line.strip():
280 break
281 record.quality.extend(map(int,line.split()))
282
283 for line in handle:
284 if line.strip():
285 break
286
287 while True:
288 if not line.startswith("AF "):
289 break
290 record.af.append(af(line))
291 try:
292 line = handle.next()
293 except StopIteration:
294 raise ValueError("Unexpected end of AF block")
295
296 while True:
297 if line.strip():
298 break
299 try:
300 line = handle.next()
301 except StopIteration:
302 raise ValueError("Unexpected end of file")
303
304 while True:
305 if not line.startswith("BS "):
306 break
307 record.bs.append(bs(line))
308 try:
309 line = handle.next()
310 except StopIteration:
311 raise ValueError("Failed to find end of BS block")
312
313
314
315
316
317
318
319 while True:
320
321
322 try:
323 while True:
324
325 if line.startswith("RD "):
326 break
327 line = handle.next()
328 except StopIteration:
329 raise ValueError("Failed to find RD line")
330
331 record.reads.append(Reads(line))
332
333 for line in handle:
334 line = line.strip()
335 if not line:
336 break
337 record.reads[-1].rd.sequence+=line
338
339 for line in handle:
340 if line.strip():
341 break
342 if not line.startswith("QA "):
343 raise ValueError("Failed to find QA line")
344 record.reads[-1].qa = qa(line)
345
346
347 for line in handle:
348 if line.strip():
349 break
350 else:
351 break
352
353 if line.startswith("DS "):
354 record.reads[-1].ds = ds(line)
355 line = ""
356
357
358 while True:
359
360 try:
361 while True:
362 if line.strip():
363 break
364 line = handle.next()
365 except StopIteration:
366
367 break
368 if line.startswith("RT{"):
369
370
371
372 if record.reads[-1].rt is None:
373 record.reads[-1].rt=[]
374 for line in handle:
375 line=line.strip()
376 if line=='}': break
377 record.reads[-1].rt.append(rt(line))
378 line = ""
379 elif line.startswith("WR{"):
380 if record.reads[-1].wr is None:
381 record.reads[-1].wr=[]
382 for line in handle:
383 line=line.strip()
384 if line=='}': break
385 record.reads[-1].wr.append(wr(line))
386 line = ""
387 elif line.startswith("WA{"):
388 if record.wa is None:
389 record.wa=[]
390 try:
391 line = handle.next()
392 except StopIteration:
393 raise ValueError("Failed to read WA block")
394 record.wa.append(wa(line))
395 for line in handle:
396 line=line.strip()
397 if line=='}': break
398 record.wa[-1].info.append(line)
399 line = ""
400 elif line.startswith("CT{"):
401 if record.ct is None:
402 record.ct=[]
403 try:
404 line = handle.next()
405 except StopIteration:
406 raise ValueError("Failed to read CT block")
407 record.ct.append(ct(line))
408 for line in handle:
409 line=line.strip()
410 if line=="COMMENT{":
411 for line in handle:
412 line = line.strip()
413 if line.endswith("C}"):
414 break
415 record.ct[-1].comment.append(line)
416 elif line=='}':
417 break
418 else:
419 record.ct[-1].info.append(line)
420 line = ""
421 else:
422 break
423
424 if not line.startswith('RD'):
425 break
426
427 yield record
428
430 """Holds data of an ACE file.
431 """
433 self.ncontigs=None
434 self.nreads=None
435 self.contigs=[]
436 self.wa=None
437
439 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """
440
441 ct=[]
442 rt=[]
443 wr=[]
444
445 for i in range(len(self.contigs)):
446 c = self.contigs[i]
447 if c.wa:
448 if not self.wa:
449 self.wa=[]
450 self.wa.extend(c.wa)
451 if c.ct:
452 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name]
453 map(self.contigs[i].ct.remove,newcts)
454 ct.extend(newcts)
455 for j in range(len(c.reads)):
456 r = c.reads[j]
457 if r.rt:
458 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name]
459 map(self.contigs[i].reads[j].rt.remove,newrts)
460 rt.extend(newrts)
461 if r.wr:
462 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name]
463 map(self.contigs[i].reads[j].wr.remove,newwrs)
464 wr.extend(newwrs)
465
466 for i in range(len(self.contigs)):
467 c = self.contigs[i]
468 for ct_tag in ct:
469 if ct_tag.name==c.name:
470 if self.contigs[i].ct is None:
471 self.contigs[i].ct=[]
472 self.contigs[i].ct.append(ct_tag)
473 if rt or wr:
474 for j in range(len(c.reads)):
475 r = c.reads[j]
476 for rt_tag in rt:
477 if rt_tag.name==r.rd.name:
478 if self.contigs[i].reads[j].rt is None:
479 self.contigs[i].reads[j].rt=[]
480 self.contigs[i].reads[j].rt.append(rt_tag)
481 for wr_tag in wr:
482 if wr_tag.name==r.rd.name:
483 if self.contigs[i].reads[j].wr is None:
484 self.contigs[i].reads[j].wr=[]
485 self.contigs[i].reads[j].wr.append(wr_tag)
486
518