1
2
3
4
5
6
7 """Bio.SeqIO support for the "genbank" and "embl" file formats.
8
9 You are expected to use this module via the Bio.SeqIO functions.
10 Note that internally this module calls Bio.GenBank to do the actual
11 parsing of both GenBank and EMBL files.
12
13 See also:
14
15 International Nucleotide Sequence Database Collaboration
16 http://www.insdc.org/
17
18 GenBank
19 http://www.ncbi.nlm.nih.gov/Genbank/
20
21 EMBL Nucleotide Sequence Database
22 http://www.ebi.ac.uk/embl/
23
24 DDBJ (DNA Data Bank of Japan)
25 http://www.ddbj.nig.ac.jp/
26 """
27
28 from Bio.Seq import UnknownSeq
29 from Bio.GenBank.Scanner import GenBankScanner, EmblScanner
30 from Bio import Alphabet
31 from Interfaces import SequentialSequenceWriter
32 from Bio import SeqFeature
33
34
35
36
37
38
39
40
41
43 """Breaks up a Genbank file into SeqRecord objects.
44
45 Every section from the LOCUS line to the terminating // becomes
46 a single SeqRecord with associated annotation and features.
47
48 Note that for genomes or chromosomes, there is typically only
49 one record."""
50
51 return GenBankScanner(debug=0).parse_records(handle)
52
54 """Breaks up an EMBL file into SeqRecord objects.
55
56 Every section from the LOCUS line to the terminating // becomes
57 a single SeqRecord with associated annotation and features.
58
59 Note that for genomes or chromosomes, there is typically only
60 one record."""
61
62 return EmblScanner(debug=0).parse_records(handle)
63
65 """Breaks up a Genbank file into SeqRecord objects for each CDS feature.
66
67 Every section from the LOCUS line to the terminating // can contain
68 many CDS features. These are returned as with the stated amino acid
69 translation sequence (if given).
70 """
71
72 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
73
75 """Breaks up a EMBL file into SeqRecord objects for each CDS feature.
76
77 Every section from the LOCUS line to the terminating // can contain
78 many CDS features. These are returned as with the stated amino acid
79 translation sequence (if given).
80 """
81
82 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
83
85 """Build a GenBank/EMBL position string (PRIVATE).
86
87 Use offset=1 to add one to convert a start position from python counting.
88 """
89 if isinstance(pos, SeqFeature.ExactPosition) :
90 return "%i" % (pos.position+offset)
91 elif isinstance(pos, SeqFeature.WithinPosition) :
92 return "(%i.%i)" % (pos.position + offset,
93 pos.position + pos.extension + offset)
94 elif isinstance(pos, SeqFeature.BetweenPosition) :
95 return "(%i^%i)" % (pos.position + offset,
96 pos.position + pos.extension + offset)
97 elif isinstance(pos, SeqFeature.BeforePosition) :
98 return "<%i" % (pos.position + offset)
99 elif isinstance(pos, SeqFeature.AfterPosition) :
100 return ">%i" % (pos.position + offset)
101 elif isinstance(pos, SeqFeature.OneOfPosition):
102 return "one-of(%s)" \
103 % ",".join([_insdc_feature_position_string(p,offset) \
104 for p in pos.position_choices])
105 elif isinstance(pos, SeqFeature.AbstractPosition) :
106 raise NotImplementedError("Please report this as a bug in Biopython.")
107 else :
108 raise ValueError("Expected a SeqFeature position object.")
109
110
129
174
175
177 HEADER_WIDTH = 12
178 MAX_WIDTH = 80
179 QUALIFIER_INDENT = 21
180
188
216
225
227 """Write the LOCUS line."""
228
229 locus = record.name
230 if not locus or locus == "<unknown name>" :
231 locus = record.id
232 if not locus or locus == "<unknown id>" :
233 locus = self._get_annotation_str(record, "accession", just_first=True)
234 if len(locus) > 16 :
235 raise ValueError("Locus identifier %s is too long" % repr(locus))
236
237 if len(record) > 99999999999 :
238
239
240 raise ValueError("Sequence too long!")
241
242
243 a = Alphabet._get_base_alphabet(record.seq.alphabet)
244 if not isinstance(a, Alphabet.Alphabet) :
245 raise TypeError("Invalid alphabet")
246 elif isinstance(a, Alphabet.ProteinAlphabet) :
247 units = "aa"
248 elif isinstance(a, Alphabet.NucleotideAlphabet) :
249 units = "bp"
250 else :
251
252
253 raise ValueError("Need a Nucleotide or Protein alphabet")
254
255
256
257 if isinstance(a, Alphabet.ProteinAlphabet) :
258 mol_type = ""
259 elif isinstance(a, Alphabet.DNAAlphabet) :
260 mol_type = "DNA"
261 elif isinstance(a, Alphabet.RNAAlphabet) :
262 mol_type = "RNA"
263 else :
264
265
266 raise ValueError("Need a DNA, RNA or Protein alphabet")
267
268 try :
269 division = record.annotations["data_file_division"]
270 except KeyError :
271 division = "UNK"
272 if division not in ["PRI","ROD","MAM","VRT","INV","PLN","BCT",
273 "VRL","PHG","SYN","UNA","EST","PAT","STS",
274 "GSS","HTG","HTC","ENV","CON"] :
275 division = "UNK"
276
277 assert len(units) == 2
278 assert len(division) == 3
279
280
281 line = "LOCUS %s %s %s %s %s 01-JAN-1980\n" \
282 % (locus.ljust(16),
283 str(len(record)).rjust(11),
284 units,
285 mol_type.ljust(6),
286 division)
287 assert len(line) == 79+1, repr(line)
288
289 assert line[12:28].rstrip() == locus, \
290 'LOCUS line does not contain the locus at the expected position:\n' + line
291 assert line[28:29] == " "
292 assert line[29:40].lstrip() == str(len(record)), \
293 'LOCUS line does not contain the length at the expected position:\n' + line
294
295
296 assert line[40:44] in [' bp ', ' aa '] , \
297 'LOCUS line does not contain size units at expected position:\n' + line
298 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
299 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
300 assert line[47:54].strip() == "" \
301 or line[47:54].strip().find('DNA') != -1 \
302 or line[47:54].strip().find('RNA') != -1, \
303 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
304 assert line[54:55] == ' ', \
305 'LOCUS line does not contain space at position 55:\n' + line
306 assert line[55:63].strip() in ['','linear','circular'], \
307 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
308 assert line[63:64] == ' ', \
309 'LOCUS line does not contain space at position 64:\n' + line
310 assert line[67:68] == ' ', \
311 'LOCUS line does not contain space at position 68:\n' + line
312 assert line[70:71] == '-', \
313 'LOCUS line does not contain - at position 71 in date:\n' + line
314 assert line[74:75] == '-', \
315 'LOCUS line does not contain - at position 75 in date:\n' + line
316
317 self.handle.write(line)
318
320 """Get an annotation dictionary entry (as a string).
321
322 Some entries are lists, in which case if just_first=True the first entry
323 is returned. If just_first=False (default) this verifies there is only
324 one entry before returning it."""
325 try :
326 answer = record.annotations[key]
327 except KeyError :
328 return default
329 if isinstance(answer, list) :
330 if not just_first : assert len(answer) == 1
331 return str(answer[0])
332 else :
333 return str(answer)
334
336
337
338 LETTERS_PER_LINE = 60
339 SEQUENCE_INDENT = 9
340
341 if isinstance(record.seq, UnknownSeq) :
342
343
344 return
345
346 data = self._get_seq_string(record)
347 seq_len = len(data)
348 for line_number in range(0,seq_len,LETTERS_PER_LINE):
349 self.handle.write(str(line_number+1).rjust(SEQUENCE_INDENT))
350 for words in range(line_number,min(line_number+LETTERS_PER_LINE,seq_len),10):
351 self.handle.write(" %s" % data[words:words+10])
352 self.handle.write("\n")
353
355 """Write a single record to the output file."""
356 handle = self.handle
357 self._write_the_first_line(record)
358
359 accession = self._get_annotation_str(record, "accession",
360 record.id.split(".",1)[0],
361 just_first=True)
362 acc_with_version = accession
363 if record.id.startswith(accession+".") :
364 try :
365 acc_with_version = "%s.%i" \
366 % (accession, int(record.id.split(".",1)[1]))
367 except ValueError :
368 pass
369 gi = self._get_annotation_str(record, "gi", just_first=True)
370
371 descr = record.description
372 if descr == "<unknown description>" : descr = "."
373 self._write_multi_line("DEFINITION", descr)
374
375 self._write_single_line("ACCESSION", accession)
376 if gi != "." :
377 self._write_single_line("VERSION", "%s GI:%s" % (acc_with_version,gi))
378 else :
379 self._write_single_line("VERSION", "%s" % (acc_with_version))
380
381
382
383
384 self._write_multi_entries("DBLINK", record.dbxrefs)
385
386 try :
387
388 keywords = "; ".join(record.annotations["keywords"])
389 except KeyError :
390 keywords = "."
391 self._write_multi_line("KEYWORDS", keywords)
392
393 if "segment" in record.annotations :
394
395
396 segment = record.annotations["segment"]
397 if isinstance(segment, list) :
398 assert len(segment)==1, segment
399 segment = segment[0]
400 self._write_single_line("SEGMENT", segment)
401
402 self._write_multi_line("SOURCE", \
403 self._get_annotation_str(record, "source"))
404
405 org = self._get_annotation_str(record, "organism")
406 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH :
407 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH-4]+"..."
408 self._write_single_line(" ORGANISM", org)
409 try :
410
411 taxonomy = "; ".join(record.annotations["taxonomy"])
412 except KeyError :
413 taxonomy = "."
414 self._write_multi_line("", taxonomy)
415
416
417 handle.write("FEATURES Location/Qualifiers\n")
418 for feature in record.features :
419 self._write_feature(feature)
420 handle.write("ORIGIN\n")
421 self._write_sequence(record)
422 handle.write("//\n")
423
425 if not value :
426 self.handle.write("%s/%s\n" % (" "*self.QUALIFIER_INDENT, key))
427 return
428
429
430 if quote is None :
431
432 if isinstance(value, int) or isinstance(value, long) :
433 quote = False
434 else :
435 quote = True
436 if quote :
437 line = '%s/%s="%s"' % (" "*self.QUALIFIER_INDENT, key, value)
438 else :
439 line = '%s/%s=%s' % (" "*self.QUALIFIER_INDENT, key, value)
440 if len(line) < self.MAX_WIDTH :
441 self.handle.write(line+"\n")
442 return
443 while line.lstrip() :
444 if len(line) < self.MAX_WIDTH :
445 self.handle.write(line+"\n")
446 return
447
448 for index in range(min(len(line)-1,self.MAX_WIDTH),self.QUALIFIER_INDENT+1,-1) :
449 if line[index]==" " : break
450 if line[index] != " " :
451
452 index = self.MAX_WIDTH
453 self.handle.write(line[:index] + "\n")
454 line = " "*self.QUALIFIER_INDENT + line[index:].lstrip()
455
470
490
491
492 if __name__ == "__main__" :
493 print "Quick self test"
494 import os
495 from StringIO import StringIO
496
517
519 """Check two lists of SeqRecords agree, raises a ValueError if mismatch."""
520 if len(old_list) != len(new_list) :
521 raise ValueError("%i vs %i records" % (len(old_list), len(new_list)))
522 for old, new in zip(old_list, new_list) :
523 if not compare_record(old,new) :
524 return False
525 return True
526
528 """Check two SeqFeatures agree."""
529 if old.type != new.type :
530 raise ValueError("Type %s versus %s" % (old.type, new.type))
531 if old.location.nofuzzy_start != new.location.nofuzzy_start \
532 or old.location.nofuzzy_end != new.location.nofuzzy_end :
533 raise ValueError("%s versus %s:\n%s\nvs:\n%s" \
534 % (old.location, new.location, str(old), str(new)))
535 if old.strand != new.strand :
536 raise ValueError("Different strand:\n%s\nvs:\n%s" % (str(old), str(new)))
537 if old.location.start != new.location.start :
538 raise ValueError("Start %s versus %s:\n%s\nvs:\n%s" \
539 % (old.location.start, new.location.start, str(old), str(new)))
540 if old.location.end != new.location.end :
541 raise ValueError("End %s versus %s:\n%s\nvs:\n%s" \
542 % (old.location.end, new.location.end, str(old), str(new)))
543 if not ignore_sub_features :
544 if len(old.sub_features) != len(new.sub_features) :
545 raise ValueError("Different sub features")
546 for a,b in zip(old.sub_features, new.sub_features) :
547 if not compare_feature(a,b) :
548 return False
549
550
551
552 for key in set(old.qualifiers.keys()).intersection(new.qualifiers.keys()):
553 if key in ["db_xref","protein_id","product","note"] :
554
555 continue
556 if old.qualifiers[key] != new.qualifiers[key] :
557 raise ValueError("Qualifier mis-match for %s:\n%s\n%s" \
558 % (key, old.qualifiers[key], new.qualifiers[key]))
559 return True
560
562 """Check two lists of SeqFeatures agree, raises a ValueError if mismatch."""
563 if len(old_list) != len(new_list) :
564 raise ValueError("%i vs %i features" % (len(old_list), len(new_list)))
565 for old, new in zip(old_list, new_list) :
566
567 if not compare_feature(old,new,ignore_sub_features) :
568 return False
569 return True
570
578
579 for filename in os.listdir("../../Tests/GenBank") :
580 if not filename.endswith(".gbk") and not filename.endswith(".gb") :
581 continue
582 print filename
583
584 handle = open("../../Tests/GenBank/%s" % filename)
585 records = list(GenBankIterator(handle))
586 handle.close()
587
588 check_genbank_writer(records)
589
590 for filename in os.listdir("../../Tests/EMBL") :
591 if not filename.endswith(".embl") :
592 continue
593 print filename
594
595 handle = open("../../Tests/EMBL/%s" % filename)
596 records = list(EmblIterator(handle))
597 handle.close()
598
599 check_genbank_writer(records)
600
601 from Bio import SeqIO
602 for filename in os.listdir("../../Tests/SwissProt") :
603 if not filename.startswith("sp") :
604 continue
605 print filename
606
607 handle = open("../../Tests/SwissProt/%s" % filename)
608 records = list(SeqIO.parse(handle,"swiss"))
609 handle.close()
610
611 check_genbank_writer(records)
612