1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 import sys
28 import os
29 from Bio.Seq import Seq
30 from Bio.SeqRecord import SeqRecord
31 from Bio.Alphabet import generic_alphabet, generic_protein
32
34 """Basic functions for breaking up a GenBank/EMBL file into sub sections.
35
36 The International Nucleotide Sequence Database Collaboration (INSDC)
37 between the DDBJ, EMBL, and GenBank. These organisations all use the
38 same "Feature Table" layout in their plain text flat file formats.
39
40 However, the header and sequence sections of an EMBL file are very
41 different in layout to those produced by GenBank/DDBJ."""
42
43
44 RECORD_START = "XXX"
45 HEADER_WIDTH = 3
46 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
47 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
48 FEATURE_QUALIFIER_INDENT = 0
49 FEATURE_QUALIFIER_SPACER = ""
50 SEQUENCE_HEADERS=["XXX"]
51
59
63
65 """Read in lines until find the ID/LOCUS line, which is returned.
66
67 Any preamble (such as the header used by the NCBI on *.seq.gz archives)
68 will we ignored."""
69 while True :
70 if self.line :
71 line = self.line
72 self.line = ""
73 else :
74 line = self.handle.readline()
75 if not line :
76 if self.debug : print "End of file"
77 return None
78 if line[:self.HEADER_WIDTH]==self.RECORD_START :
79 if self.debug > 1: print "Found the start of a record:\n" + line
80 break
81 line = line.rstrip()
82 if line == "//" :
83 if self.debug > 1: print "Skipping // marking end of last record"
84 elif line == "" :
85 if self.debug > 1: print "Skipping blank line before record"
86 else :
87
88 if self.debug > 1:
89 print "Skipping header line before record:\n" + line
90 self.line = line
91 return line
92
94 """Return list of strings making up the header
95
96 New line characters are removed.
97
98 Assumes you have just read in the ID/LOCUS line.
99 """
100 assert self.line[:self.HEADER_WIDTH]==self.RECORD_START, \
101 "Not at start of record"
102
103 header_lines = []
104 while True :
105 line = self.handle.readline()
106 if not line :
107 raise ValueError("Premature end of line during sequence data")
108 line = line.rstrip()
109 if line in self.FEATURE_START_MARKERS :
110 if self.debug : print "Found header table"
111 break
112
113
114
115 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS :
116 if self.debug : print "Found start of sequence"
117 break
118 if line == "//" :
119 raise ValueError("Premature end of sequence data marker '//' found")
120 header_lines.append(line)
121 self.line = line
122 return header_lines
123
175
177 """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)
178
179 For example given this GenBank feature:
180
181 CDS complement(join(490883..490885,1..879))
182 /locus_tag="NEQ001"
183 /note="conserved hypothetical [Methanococcus jannaschii];
184 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
185 localization signal; IPR002743: Protein of unknown
186 function DUF57"
187 /codon_start=1
188 /transl_table=11
189 /product="hypothetical protein"
190 /protein_id="NP_963295.1"
191 /db_xref="GI:41614797"
192 /db_xref="GeneID:2732620"
193 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
194 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
195 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
196 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
197 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
198 LNSMGFGFVNTKKNSAR"
199
200 Then should give input key="CDS" and the rest of the data as a list of strings
201 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
202 where the leading spaces and trailing newlines have been removed.
203
204 Returns tuple containing: (key as string, location string, qualifiers as list)
205 as follows for this example:
206
207 key = "CDS", string
208 location = "complement(join(490883..490885,1..879))", string
209 qualifiers = list of string tuples:
210
211 [('locus_tag', '"NEQ001"'),
212 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'),
213 ('codon_start', '1'),
214 ('transl_table', '11'),
215 ('product', '"hypothetical protein"'),
216 ('protein_id', '"NP_963295.1"'),
217 ('db_xref', '"GI:41614797"'),
218 ('db_xref', '"GeneID:2732620"'),
219 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')]
220
221 In the above example, the "note" and "translation" were edited for compactness,
222 and they would contain multiple new line characters (displayed above as \n)
223
224 If a qualifier is quoted (in this case, everything except codon_start and
225 transl_table) then the quotes are NOT removed.
226
227 Note that no whitespace is removed.
228 """
229
230 iterator = iter(filter(None, lines))
231 try :
232 line = iterator.next()
233
234 feature_location = line.strip()
235 while feature_location[-1:]=="," :
236
237 feature_location += iterator.next().strip()
238
239 qualifiers=[]
240
241 for line in iterator :
242 if line[0]=="/" :
243
244 i = line.find("=")
245 key = line[1:i]
246 value = line[i+1:]
247 if i==-1 :
248
249 key = line[1:]
250 qualifiers.append((key,None))
251 elif value[0]=='"' :
252
253 if value[-1]!='"' or value!='"' :
254
255 while value[-1] != '"' :
256 value += "\n" + iterator.next()
257 else :
258
259 assert value == '"'
260 if self.debug : print "Quoted line %s:%s" % (key, value)
261
262 qualifiers.append((key,value))
263 else :
264
265
266 qualifiers.append((key,value))
267 else :
268
269 assert len(qualifiers) > 0
270 assert key==qualifiers[-1][0]
271
272 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
273 return (feature_key, feature_location, qualifiers)
274 except StopIteration:
275
276 raise ValueError("Problem with '%s' feature:\n%s" \
277 % (feature_key, "\n".join(lines)))
278
299
301 """Handle the LOCUS/ID line, passing data to the comsumer
302
303 This should be implemented by the EMBL / GenBank specific subclass
304
305 Used by the parse_records() and parse() methods.
306 """
307 pass
308
310 """Handle the header lines (list of strings), passing data to the comsumer
311
312 This should be implemented by the EMBL / GenBank specific subclass
313
314 Used by the parse_records() and parse() methods.
315 """
316 pass
317
318
332
334 """Handle any lines between features and sequence (list of strings), passing data to the consumer
335
336 This should be implemented by the EMBL / GenBank specific subclass
337
338 Used by the parse_records() and parse() methods.
339 """
340 pass
341
342 - def feed(self, handle, consumer, do_features=True) :
343 """Feed a set of data into the consumer.
344
345 This method is intended for use with the "old" code in Bio.GenBank
346
347 Arguments:
348 handle - A handle with the information to parse.
349 consumer - The consumer that should be informed of events.
350 do_features - Boolean, should the features be parsed?
351 Skipping the features can be much faster.
352
353 Return values:
354 true - Passed a record
355 false - Did not find a record
356 """
357
358
359 self.set_handle(handle)
360 if not self.find_start() :
361
362 consumer.data=None
363 return False
364
365
366
367
368
369
370 self._feed_first_line(consumer, self.line)
371 self._feed_header_lines(consumer, self.parse_header())
372
373
374 if do_features :
375 self._feed_feature_table(consumer, self.parse_features(skip=False))
376 else :
377 self.parse_features(skip=True)
378
379
380 misc_lines, sequence_string = self.parse_footer()
381 self._feed_misc_lines(consumer, misc_lines)
382
383 consumer.sequence(sequence_string)
384
385 consumer.record_end("//")
386
387 assert self.line == "//"
388
389
390 return True
391
407
408
410 """Returns a SeqRecord object iterator
411
412 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord
413
414 The SeqRecord objects include SeqFeatures if do_features=True
415
416 This method is intended for use in Bio.SeqIO
417 """
418
419 while True :
420 record = self.parse(handle)
421 if record is None : break
422 assert record.id is not None
423 assert record.name != "<unknown name>"
424 assert record.description != "<unknown description>"
425 yield record
426
430 """Returns SeqRecord object iterator
431
432 Each CDS feature becomes a SeqRecord.
433
434 alphabet - Used for any sequence found in a translation field.
435 tags2id - Tupple of three strings, the feature keys to use
436 for the record id, name and description,
437
438 This method is intended for use in Bio.SeqIO
439 """
440 self.set_handle(handle)
441 while self.find_start() :
442
443 self.parse_header()
444 feature_tuples = self.parse_features()
445
446 while True :
447 line = self.handle.readline()
448 if not line : break
449 if line[:2]=="//" : break
450 self.line = line.rstrip()
451
452
453 for key, location_string, qualifiers in feature_tuples :
454 if key=="CDS" :
455
456
457
458
459
460 record = SeqRecord(seq=None)
461 annotations = record.annotations
462
463
464
465
466 annotations['raw_location'] = location_string.replace(' ','')
467
468 for (qualifier_name, qualifier_data) in qualifiers :
469 if qualifier_data is not None \
470 and qualifier_data[0]=='"' and qualifier_data[-1]=='"' :
471
472 qualifier_data = qualifier_data[1:-1]
473
474 if qualifier_name == "translation" :
475 assert record.seq is None, "Multiple translations!"
476 record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
477 elif qualifier_name == "db_xref" :
478
479 record.dbxrefs.append(qualifier_data)
480 else :
481 if qualifier_data is not None :
482 qualifier_data = qualifier_data.replace("\n"," ").replace(" "," ")
483 try :
484 annotations[qualifier_name] += " " + qualifier_data
485 except KeyError :
486
487 annotations[qualifier_name]= qualifier_data
488
489
490
491 try :
492 record.id = annotations[tags2id[0]]
493 except KeyError :
494 pass
495 try :
496 record.name = annotations[tags2id[1]]
497 except KeyError :
498 pass
499 try :
500 record.description = annotations[tags2id[2]]
501 except KeyError :
502 pass
503
504 yield record
505
507 """For extracting chunks of information in EMBL files"""
508
509 RECORD_START = "ID "
510 HEADER_WIDTH = 5
511 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers","FH"]
512 FEATURE_END_MARKERS = ["XX"]
513 FEATURE_QUALIFIER_INDENT = 21
514 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2)
515 SEQUENCE_HEADERS=["SQ"]
516
550
561
563
564
565
566 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
567 fields = [line[self.HEADER_WIDTH:].split(None,1)[0]]
568 fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";"))
569 fields = [entry.strip() for entry in fields]
570 """
571 The tokens represent:
572 0. Primary accession number
573 (space sep)
574 1. ??? (e.g. standard)
575 (semi-colon)
576 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
577 3. Taxonomic division (e.g. 'PRO')
578 4. Sequence length (e.g. '4639675 BP.')
579 """
580 consumer.locus(fields[0])
581 consumer.residue_type(fields[2])
582 consumer.data_file_division(fields[3])
583 self._feed_seq_length(consumer, fields[4])
584
586
587
588
589 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
590 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
591 assert len(fields) == 7
592 """
593 The tokens represent:
594 0. Primary accession number
595 1. Sequence version number
596 2. Topology: 'circular' or 'linear'
597 3. Molecule type (e.g. 'genomic DNA')
598 4. Data class (e.g. 'STD')
599 5. Taxonomic division (e.g. 'PRO')
600 6. Sequence length (e.g. '4639675 BP.')
601 """
602
603 consumer.locus(fields[0])
604
605
606
607 consumer.accession(fields[0])
608
609
610
611 version_parts = fields[1].split()
612 if len(version_parts)==2 \
613 and version_parts[0]=="SV" \
614 and version_parts[1].isdigit() :
615 consumer.version_suffix(version_parts[1])
616
617
618 consumer.residue_type(" ".join(fields[2:4]))
619
620
621
622 consumer.data_file_division(fields[5])
623
624 self._feed_seq_length(consumer, fields[6])
625
627 length_parts = text.split()
628 assert len(length_parts) == 2
629 assert length_parts[1].upper() in ["BP", "BP."]
630 consumer.size(length_parts[0])
631
633 EMBL_INDENT = self.HEADER_WIDTH
634 EMBL_SPACER = " " * EMBL_INDENT
635 consumer_dict = {
636 'AC' : 'accession',
637 'SV' : 'version',
638 'DE' : 'definition',
639
640
641
642 'RA' : 'authors',
643 'RT' : 'title',
644 'RL' : 'journal',
645 'OS' : 'organism',
646 'OC' : 'taxonomy',
647
648 'CC' : 'comment',
649
650 }
651
652
653 lines = filter(None,lines)
654 line_iter = iter(lines)
655 try :
656 while True :
657 try :
658 line = line_iter.next()
659 except StopIteration :
660 break
661 if not line : break
662 line_type = line[:EMBL_INDENT].strip()
663 data = line[EMBL_INDENT:].strip()
664
665 if line_type == 'XX' :
666 pass
667 elif line_type == 'RN' :
668
669
670 if data[0] == "[" and data[-1] == "]" : data = data[1:-1]
671 consumer.reference_num(data)
672 elif line_type == 'RP' :
673
674
675 assert data.count("-")==1
676 consumer.reference_bases("(bases " + data.replace("-", " to ") + ")")
677 elif line_type == 'RX' :
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693 key, value = data.split(";",1)
694 if value.endswith(".") : value = value[:-1]
695 value = value.strip()
696 if key == "PUBMED" :
697 consumer.pubmed_id(value)
698
699 elif line_type == 'CC' :
700
701 consumer.comment([data])
702 elif line_type == 'DR' :
703
704
705
706
707
708
709
710
711 pass
712 elif line_type in consumer_dict :
713
714 getattr(consumer, consumer_dict[line_type])(data)
715 else :
716 if self.debug :
717 print "Ignoring EMBL header line:\n%s" % line
718 except StopIteration :
719 raise ValueError("Problem with header")
720
724
726 """For extracting chunks of information in GenBank files"""
727
728 RECORD_START = "LOCUS "
729 HEADER_WIDTH = 12
730 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers","FEATURES"]
731 FEATURE_END_MARKERS = []
732 FEATURE_QUALIFIER_INDENT = 21
733 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
734 SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]
735
778
780
781
782
783 GENBANK_INDENT = self.HEADER_WIDTH
784 GENBANK_SPACER = " "*GENBANK_INDENT
785 assert line[0:GENBANK_INDENT] == 'LOCUS ', \
786 'LOCUS line does not start correctly:\n' + line
787
788
789
790 if line[29:33] in [' bp ', ' aa ',' rc '] :
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809 assert line[29:33] in [' bp ', ' aa ',' rc '] , \
810 'LOCUS line does not contain size units at expected position:\n' + line
811 assert line[41:42] == ' ', \
812 'LOCUS line does not contain space at position 42:\n' + line
813 assert line[42:51].strip() in ['','linear','circular'], \
814 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
815 assert line[51:52] == ' ', \
816 'LOCUS line does not contain space at position 52:\n' + line
817 assert line[55:62] == ' ', \
818 'LOCUS line does not contain spaces from position 56 to 62:\n' + line
819 assert line[64:65] == '-', \
820 'LOCUS line does not contain - at position 65 in date:\n' + line
821 assert line[68:69] == '-', \
822 'LOCUS line does not contain - at position 69 in date:\n' + line
823
824 name_and_length_str = line[GENBANK_INDENT:29]
825 while name_and_length_str.find(' ')!=-1 :
826 name_and_length_str = name_and_length_str.replace(' ',' ')
827 name_and_length = name_and_length_str.split(' ')
828 assert len(name_and_length)<=2, \
829 'Cannot parse the name and length in the LOCUS line:\n' + line
830 assert len(name_and_length)!=1, \
831 'Name and length collide in the LOCUS line:\n' + line
832
833
834
835 consumer.locus(name_and_length[0])
836 consumer.size(name_and_length[1])
837
838
839 if line[33:51].strip() == "" and line[29:33] == ' aa ' :
840
841
842
843
844 consumer.residue_type("PROTEIN")
845 else :
846 consumer.residue_type(line[33:51].strip())
847
848 consumer.data_file_division(line[52:55])
849 consumer.date(line[62:73])
850 elif line[40:44] in [' bp ', ' aa ',' rc '] :
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870 assert line[40:44] in [' bp ', ' aa ',' rc '] , \
871 'LOCUS line does not contain size units at expected position:\n' + line
872 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
873 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
874 assert line[47:54].strip() == "" \
875 or line[47:54].strip().find('DNA') != -1 \
876 or line[47:54].strip().find('RNA') != -1, \
877 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
878 assert line[54:55] == ' ', \
879 'LOCUS line does not contain space at position 55:\n' + line
880 assert line[55:63].strip() in ['','linear','circular'], \
881 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
882 assert line[63:64] == ' ', \
883 'LOCUS line does not contain space at position 64:\n' + line
884 assert line[67:68] == ' ', \
885 'LOCUS line does not contain space at position 68:\n' + line
886 assert line[70:71] == '-', \
887 'LOCUS line does not contain - at position 71 in date:\n' + line
888 assert line[74:75] == '-', \
889 'LOCUS line does not contain - at position 75 in date:\n' + line
890
891 name_and_length_str = line[GENBANK_INDENT:40]
892 while name_and_length_str.find(' ')!=-1 :
893 name_and_length_str = name_and_length_str.replace(' ',' ')
894 name_and_length = name_and_length_str.split(' ')
895 assert len(name_and_length)<=2, \
896 'Cannot parse the name and length in the LOCUS line:\n' + line
897 assert len(name_and_length)!=1, \
898 'Name and length collide in the LOCUS line:\n' + line
899
900
901
902 consumer.locus(name_and_length[0])
903 consumer.size(name_and_length[1])
904
905 if line[44:54].strip() == "" and line[40:44] == ' aa ' :
906
907
908
909
910 consumer.residue_type(("PROTEIN " + line[54:63]).strip())
911 else :
912 consumer.residue_type(line[44:63].strip())
913
914 consumer.data_file_division(line[64:67])
915 consumer.date(line[68:79])
916 elif line[GENBANK_INDENT:].strip().count(" ")==0 :
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932 if line[GENBANK_INDENT:].strip() != "" :
933 consumer.locus(line[GENBANK_INDENT:].strip())
934 else :
935
936
937 print >> sys.stderr, "Warning: Minimal LOCUS line found - is this correct?\n" + line
938 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"] :
939
940
941 consumer.locus(line.split()[1])
942 consumer.size(line.split()[2])
943 print >> sys.stderr, "Warning: Malformed LOCUS line found - is this correct?\n" + line
944 else :
945 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
946
947
949
950
951
952
953 GENBANK_INDENT = self.HEADER_WIDTH
954 GENBANK_SPACER = " "*GENBANK_INDENT
955 consumer_dict = {
956 'DEFINITION' : 'definition',
957 'ACCESSION' : 'accession',
958 'NID' : 'nid',
959 'PID' : 'pid',
960 'DBSOURCE' : 'db_source',
961 'KEYWORDS' : 'keywords',
962 'SEGMENT' : 'segment',
963 'SOURCE' : 'source',
964 'AUTHORS' : 'authors',
965 'CONSRTM' : 'consrtm',
966 'PROJECT' : 'project',
967 'DBLINK' : 'dblink',
968 'TITLE' : 'title',
969 'JOURNAL' : 'journal',
970 'MEDLINE' : 'medline_id',
971 'PUBMED' : 'pubmed_id',
972 'REMARK' : 'remark'}
973
974
975
976
977
978
979 lines = filter(None,lines)
980 lines.append("")
981 line_iter = iter(lines)
982 try :
983 line = line_iter.next()
984 while True :
985 if not line : break
986 line_type = line[:GENBANK_INDENT].strip()
987 data = line[GENBANK_INDENT:].strip()
988
989 if line_type == 'VERSION' :
990
991
992
993 while data.find(' ')!=-1:
994 data = data.replace(' ',' ')
995 if data.find(' GI:')==-1 :
996 consumer.version(data)
997 else :
998 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]"
999 consumer.version(data.split(' GI:')[0])
1000 consumer.gi(data.split(' GI:')[1])
1001
1002 line = line_iter.next()
1003 elif line_type == 'REFERENCE' :
1004 if self.debug >1 : print "Found reference [" + data + "]"
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015 data = data.strip()
1016
1017
1018 while True:
1019 line = line_iter.next()
1020 if line[:GENBANK_INDENT] == GENBANK_SPACER :
1021
1022 data += " " + line[GENBANK_INDENT:]
1023 if self.debug >1 : print "Extended reference text [" + data + "]"
1024 else :
1025
1026 break
1027
1028
1029
1030 while data.find(' ')!=-1:
1031 data = data.replace(' ',' ')
1032 if data.find(' ')==-1 :
1033 if self.debug >2 : print 'Reference number \"' + data + '\"'
1034 consumer.reference_num(data)
1035 else :
1036 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"'
1037 consumer.reference_num(data[:data.find(' ')])
1038 consumer.reference_bases(data[data.find(' ')+1:])
1039 elif line_type == 'ORGANISM' :
1040
1041
1042
1043
1044
1045
1046
1047
1048 organism_data = data
1049 lineage_data = ""
1050 while True :
1051 line = line_iter.next()
1052 if line[0:GENBANK_INDENT] == GENBANK_SPACER :
1053 if lineage_data or ";" in line :
1054 lineage_data += " " + line[GENBANK_INDENT:]
1055 else :
1056 organism_data += " " + line[GENBANK_INDENT:].strip()
1057 else :
1058
1059 break
1060 consumer.organism(organism_data)
1061 if lineage_data.strip() == "" and self.debug > 1 :
1062 print "Taxonomy line(s) missing or blank"
1063 consumer.taxonomy(lineage_data.strip())
1064 del organism_data, lineage_data
1065 elif line_type == 'COMMENT' :
1066 if self.debug > 1 : print "Found comment"
1067
1068
1069 comment_list=[]
1070 comment_list.append(data)
1071 while True:
1072 line = line_iter.next()
1073 if line[0:GENBANK_INDENT] == GENBANK_SPACER :
1074 data = line[GENBANK_INDENT:]
1075 comment_list.append(data)
1076 if self.debug > 2 : print "Comment continuation [" + data + "]"
1077 else :
1078
1079 break
1080 consumer.comment(comment_list)
1081 del comment_list
1082 elif line_type in consumer_dict :
1083
1084
1085 while True :
1086 line = line_iter.next()
1087 if line[0:GENBANK_INDENT] == GENBANK_SPACER :
1088 data += ' ' + line[GENBANK_INDENT:]
1089 else :
1090
1091 getattr(consumer, consumer_dict[line_type])(data)
1092
1093 break
1094 else :
1095 if self.debug :
1096 print "Ignoring GenBank header line:\n" % line
1097
1098 line = line_iter.next()
1099 except StopIteration :
1100 raise ValueError("Problem in header")
1101
1142
1143 if __name__ == "__main__" :
1144 from StringIO import StringIO
1145
1146 gbk_example = \
1147 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
1148 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
1149 (AXL2) and Rev7p (REV7) genes, complete cds.
1150 ACCESSION U49845
1151 VERSION U49845.1 GI:1293613
1152 KEYWORDS .
1153 SOURCE Saccharomyces cerevisiae (baker's yeast)
1154 ORGANISM Saccharomyces cerevisiae
1155 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
1156 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
1157 REFERENCE 1 (bases 1 to 5028)
1158 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
1159 TITLE Cloning and sequence of REV7, a gene whose function is required for
1160 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
1161 JOURNAL Yeast 10 (11), 1503-1509 (1994)
1162 PUBMED 7871890
1163 REFERENCE 2 (bases 1 to 5028)
1164 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
1165 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
1166 plasma membrane glycoprotein
1167 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
1168 PUBMED 8846915
1169 REFERENCE 3 (bases 1 to 5028)
1170 AUTHORS Roemer,T.
1171 TITLE Direct Submission
1172 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
1173 Haven, CT, USA
1174 FEATURES Location/Qualifiers
1175 source 1..5028
1176 /organism="Saccharomyces cerevisiae"
1177 /db_xref="taxon:4932"
1178 /chromosome="IX"
1179 /map="9"
1180 CDS <1..206
1181 /codon_start=3
1182 /product="TCP1-beta"
1183 /protein_id="AAA98665.1"
1184 /db_xref="GI:1293614"
1185 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
1186 AEVLLRVDNIIRARPRTANRQHM"
1187 gene 687..3158
1188 /gene="AXL2"
1189 CDS 687..3158
1190 /gene="AXL2"
1191 /note="plasma membrane glycoprotein"
1192 /codon_start=1
1193 /function="required for axial budding pattern of S.
1194 cerevisiae"
1195 /product="Axl2p"
1196 /protein_id="AAA98666.1"
1197 /db_xref="GI:1293615"
1198 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
1199 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
1200 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
1201 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
1202 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
1203 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
1204 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
1205 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
1206 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
1207 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
1208 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
1209 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
1210 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
1211 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
1212 VDFSNKSNVNVGQVKDIHGRIPEML"
1213 gene complement(3300..4037)
1214 /gene="REV7"
1215 CDS complement(3300..4037)
1216 /gene="REV7"
1217 /codon_start=1
1218 /product="Rev7p"
1219 /protein_id="AAA98667.1"
1220 /db_xref="GI:1293616"
1221 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
1222 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
1223 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
1224 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
1225 LISGDDKILNGVYSQYEEGESIFGSLF"
1226 ORIGIN
1227 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
1228 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
1229 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
1230 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
1231 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
1232 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
1233 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
1234 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
1235 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
1236 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
1237 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
1238 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
1239 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
1240 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
1241 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
1242 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
1243 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
1244 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
1245 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
1246 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
1247 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
1248 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
1249 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
1250 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
1251 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
1252 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
1253 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
1254 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
1255 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
1256 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
1257 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
1258 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
1259 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
1260 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
1261 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
1262 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
1263 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
1264 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
1265 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
1266 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
1267 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
1268 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
1269 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
1270 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
1271 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
1272 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
1273 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
1274 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
1275 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
1276 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
1277 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
1278 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
1279 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
1280 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
1281 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
1282 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
1283 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
1284 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
1285 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
1286 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
1287 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
1288 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
1289 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
1290 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
1291 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
1292 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
1293 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
1294 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
1295 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
1296 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
1297 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
1298 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
1299 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
1300 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
1301 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
1302 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
1303 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
1304 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
1305 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
1306 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
1307 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
1308 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
1309 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
1310 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1311 //"""
1312
1313
1314
1315 gbk_example2 = \
1316 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1317 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1318 ACCESSION AAD51968
1319 VERSION AAD51968.1 GI:5805369
1320 DBSOURCE locus AF171097 accession AF171097.1
1321 KEYWORDS .
1322 SOURCE Yersinia enterocolitica
1323 ORGANISM Yersinia enterocolitica
1324 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1325 Enterobacteriaceae; Yersinia.
1326 REFERENCE 1 (residues 1 to 143)
1327 AUTHORS Revell,P.A. and Miller,V.L.
1328 TITLE A chromosomally encoded regulator is required for expression of the
1329 Yersinia enterocolitica inv gene and for virulence
1330 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1331 MEDLINE 20138369
1332 PUBMED 10672189
1333 REFERENCE 2 (residues 1 to 143)
1334 AUTHORS Revell,P.A. and Miller,V.L.
1335 TITLE Direct Submission
1336 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1337 University School of Medicine, Campus Box 8230, 660 South Euclid,
1338 St. Louis, MO 63110, USA
1339 COMMENT Method: conceptual translation.
1340 FEATURES Location/Qualifiers
1341 source 1..143
1342 /organism="Yersinia enterocolitica"
1343 /mol_type="unassigned DNA"
1344 /strain="JB580v"
1345 /serotype="O:8"
1346 /db_xref="taxon:630"
1347 Protein 1..143
1348 /product="transcriptional regulator RovA"
1349 /name="regulates inv expression"
1350 CDS 1..143
1351 /gene="rovA"
1352 /coded_by="AF171097.1:380..811"
1353 /note="regulator of virulence"
1354 /transl_table=11
1355 ORIGIN
1356 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1357 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1358 121 deiellsgli dklerniiql qsk
1359 //
1360 """
1361
1362 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
1363 XX
1364 AC X56734; S46826;
1365 XX
1366 DT 12-SEP-1991 (Rel. 29, Created)
1367 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
1368 XX
1369 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
1370 XX
1371 KW beta-glucosidase.
1372 XX
1373 OS Trifolium repens (white clover)
1374 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
1375 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
1376 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
1377 XX
1378 RN [5]
1379 RP 1-1859
1380 RX PUBMED; 1907511.
1381 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
1382 RT "Nucleotide and derived amino acid sequence of the cyanogenic
1383 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
1384 RL Plant Mol. Biol. 17(2):209-219(1991).
1385 XX
1386 RN [6]
1387 RP 1-1859
1388 RA Hughes M.A.;
1389 RT ;
1390 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
1391 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
1392 RL Upon Tyne, NE2 4HH, UK
1393 XX
1394 FH Key Location/Qualifiers
1395 FH
1396 FT source 1..1859
1397 FT /organism="Trifolium repens"
1398 FT /mol_type="mRNA"
1399 FT /clone_lib="lambda gt10"
1400 FT /clone="TRE361"
1401 FT /tissue_type="leaves"
1402 FT /db_xref="taxon:3899"
1403 FT CDS 14..1495
1404 FT /product="beta-glucosidase"
1405 FT /EC_number="3.2.1.21"
1406 FT /note="non-cyanogenic"
1407 FT /db_xref="GOA:P26204"
1408 FT /db_xref="InterPro:IPR001360"
1409 FT /db_xref="InterPro:IPR013781"
1410 FT /db_xref="UniProtKB/Swiss-Prot:P26204"
1411 FT /protein_id="CAA40058.1"
1412 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
1413 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
1414 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
1415 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
1416 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
1417 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
1418 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
1419 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
1420 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
1421 FT mRNA 1..1859
1422 FT /experiment="experimental evidence, no additional details
1423 FT recorded"
1424 XX
1425 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
1426 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
1427 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
1428 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
1429 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
1430 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
1431 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
1432 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
1433 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
1434 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
1435 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
1436 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
1437 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
1438 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
1439 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
1440 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
1441 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
1442 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
1443 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
1444 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
1445 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
1446 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
1447 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
1448 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
1449 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
1450 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
1451 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
1452 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
1453 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
1454 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
1455 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
1456 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
1457 //
1458 """
1459
1460 print "GenBank CDS Iteration"
1461 print "====================="
1462
1463 g = GenBankScanner()
1464 for record in g.parse_cds_features(StringIO(gbk_example)) :
1465 print record
1466
1467 g = GenBankScanner()
1468 for record in g.parse_cds_features(StringIO(gbk_example2),
1469 tags2id=('gene','locus_tag','product')) :
1470 print record
1471
1472 g = GenBankScanner()
1473 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2),
1474 tags2id=('gene','locus_tag','product')) :
1475 print record
1476
1477 print
1478 print "GenBank Iteration"
1479 print "================="
1480 g = GenBankScanner()
1481 for record in g.parse_records(StringIO(gbk_example),do_features=False) :
1482 print record.id, record.name, record.description
1483 print record.seq
1484
1485 g = GenBankScanner()
1486 for record in g.parse_records(StringIO(gbk_example),do_features=True) :
1487 print record.id, record.name, record.description
1488 print record.seq
1489
1490 g = GenBankScanner()
1491 for record in g.parse_records(StringIO(gbk_example2),do_features=False) :
1492 print record.id, record.name, record.description
1493 print record.seq
1494
1495 g = GenBankScanner()
1496 for record in g.parse_records(StringIO(gbk_example2),do_features=True) :
1497 print record.id, record.name, record.description
1498 print record.seq
1499
1500 print
1501 print "EMBL CDS Iteration"
1502 print "=================="
1503
1504 e = EmblScanner()
1505 for record in e.parse_cds_features(StringIO(embl_example)) :
1506 print record
1507
1508 print
1509 print "EMBL Iteration"
1510 print "=============="
1511 e = EmblScanner()
1512 for record in e.parse_records(StringIO(embl_example),do_features=True) :
1513 print record.id, record.name, record.description
1514 print record.seq
1515