1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 import warnings
28 import os
29 from Bio.Seq import Seq
30 from Bio.SeqRecord import SeqRecord
31 from Bio.Alphabet import generic_alphabet, generic_protein
32
34 """Basic functions for breaking up a GenBank/EMBL file into sub sections.
35
36 The International Nucleotide Sequence Database Collaboration (INSDC)
37 between the DDBJ, EMBL, and GenBank. These organisations all use the
38 same "Feature Table" layout in their plain text flat file formats.
39
40 However, the header and sequence sections of an EMBL file are very
41 different in layout to those produced by GenBank/DDBJ."""
42
43
44 RECORD_START = "XXX"
45 HEADER_WIDTH = 3
46 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
47 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
48 FEATURE_QUALIFIER_INDENT = 0
49 FEATURE_QUALIFIER_SPACER = ""
50 SEQUENCE_HEADERS=["XXX"]
51
59
63
65 """Read in lines until find the ID/LOCUS line, which is returned.
66
67 Any preamble (such as the header used by the NCBI on *.seq.gz archives)
68 will we ignored."""
69 while True :
70 if self.line :
71 line = self.line
72 self.line = ""
73 else :
74 line = self.handle.readline()
75 if not line :
76 if self.debug : print "End of file"
77 return None
78 if line[:self.HEADER_WIDTH]==self.RECORD_START :
79 if self.debug > 1: print "Found the start of a record:\n" + line
80 break
81 line = line.rstrip()
82 if line == "//" :
83 if self.debug > 1: print "Skipping // marking end of last record"
84 elif line == "" :
85 if self.debug > 1: print "Skipping blank line before record"
86 else :
87
88 if self.debug > 1:
89 print "Skipping header line before record:\n" + line
90 self.line = line
91 return line
92
94 """Return list of strings making up the header
95
96 New line characters are removed.
97
98 Assumes you have just read in the ID/LOCUS line.
99 """
100 assert self.line[:self.HEADER_WIDTH]==self.RECORD_START, \
101 "Not at start of record"
102
103 header_lines = []
104 while True :
105 line = self.handle.readline()
106 if not line :
107 raise ValueError("Premature end of line during sequence data")
108 line = line.rstrip()
109 if line in self.FEATURE_START_MARKERS :
110 if self.debug : print "Found header table"
111 break
112
113
114
115 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS :
116 if self.debug : print "Found start of sequence"
117 break
118 if line == "//" :
119 raise ValueError("Premature end of sequence data marker '//' found")
120 header_lines.append(line)
121 self.line = line
122 return header_lines
123
175
177 """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)
178
179 For example given this GenBank feature:
180
181 CDS complement(join(490883..490885,1..879))
182 /locus_tag="NEQ001"
183 /note="conserved hypothetical [Methanococcus jannaschii];
184 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
185 localization signal; IPR002743: Protein of unknown
186 function DUF57"
187 /codon_start=1
188 /transl_table=11
189 /product="hypothetical protein"
190 /protein_id="NP_963295.1"
191 /db_xref="GI:41614797"
192 /db_xref="GeneID:2732620"
193 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
194 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
195 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
196 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
197 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
198 LNSMGFGFVNTKKNSAR"
199
200 Then should give input key="CDS" and the rest of the data as a list of strings
201 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
202 where the leading spaces and trailing newlines have been removed.
203
204 Returns tuple containing: (key as string, location string, qualifiers as list)
205 as follows for this example:
206
207 key = "CDS", string
208 location = "complement(join(490883..490885,1..879))", string
209 qualifiers = list of string tuples:
210
211 [('locus_tag', '"NEQ001"'),
212 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'),
213 ('codon_start', '1'),
214 ('transl_table', '11'),
215 ('product', '"hypothetical protein"'),
216 ('protein_id', '"NP_963295.1"'),
217 ('db_xref', '"GI:41614797"'),
218 ('db_xref', '"GeneID:2732620"'),
219 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')]
220
221 In the above example, the "note" and "translation" were edited for compactness,
222 and they would contain multiple new line characters (displayed above as \n)
223
224 If a qualifier is quoted (in this case, everything except codon_start and
225 transl_table) then the quotes are NOT removed.
226
227 Note that no whitespace is removed.
228 """
229
230 iterator = iter(filter(None, lines))
231 try :
232 line = iterator.next()
233
234 feature_location = line.strip()
235 while feature_location[-1:]=="," :
236
237 feature_location += iterator.next().strip()
238
239 qualifiers=[]
240
241 for line in iterator :
242 if line[0]=="/" :
243
244 i = line.find("=")
245 key = line[1:i]
246 value = line[i+1:]
247 if i==-1 :
248
249 key = line[1:]
250 qualifiers.append((key,None))
251 elif value[0]=='"' :
252
253 if value[-1]!='"' or value!='"' :
254
255 while value[-1] != '"' :
256 value += "\n" + iterator.next()
257 else :
258
259 assert value == '"'
260 if self.debug : print "Quoted line %s:%s" % (key, value)
261
262 qualifiers.append((key,value))
263 else :
264
265
266 qualifiers.append((key,value))
267 else :
268
269 assert len(qualifiers) > 0
270 assert key==qualifiers[-1][0]
271
272 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
273 return (feature_key, feature_location, qualifiers)
274 except StopIteration:
275
276 raise ValueError("Problem with '%s' feature:\n%s" \
277 % (feature_key, "\n".join(lines)))
278
299
301 """Handle the LOCUS/ID line, passing data to the comsumer
302
303 This should be implemented by the EMBL / GenBank specific subclass
304
305 Used by the parse_records() and parse() methods.
306 """
307 pass
308
310 """Handle the header lines (list of strings), passing data to the comsumer
311
312 This should be implemented by the EMBL / GenBank specific subclass
313
314 Used by the parse_records() and parse() methods.
315 """
316 pass
317
318
332
334 """Handle any lines between features and sequence (list of strings), passing data to the consumer
335
336 This should be implemented by the EMBL / GenBank specific subclass
337
338 Used by the parse_records() and parse() methods.
339 """
340 pass
341
342 - def feed(self, handle, consumer, do_features=True) :
343 """Feed a set of data into the consumer.
344
345 This method is intended for use with the "old" code in Bio.GenBank
346
347 Arguments:
348 handle - A handle with the information to parse.
349 consumer - The consumer that should be informed of events.
350 do_features - Boolean, should the features be parsed?
351 Skipping the features can be much faster.
352
353 Return values:
354 true - Passed a record
355 false - Did not find a record
356 """
357
358
359 self.set_handle(handle)
360 if not self.find_start() :
361
362 consumer.data=None
363 return False
364
365
366
367
368
369
370 self._feed_first_line(consumer, self.line)
371 self._feed_header_lines(consumer, self.parse_header())
372
373
374 if do_features :
375 self._feed_feature_table(consumer, self.parse_features(skip=False))
376 else :
377 self.parse_features(skip=True)
378
379
380 misc_lines, sequence_string = self.parse_footer()
381 self._feed_misc_lines(consumer, misc_lines)
382
383 consumer.sequence(sequence_string)
384
385 consumer.record_end("//")
386
387 assert self.line == "//"
388
389
390 return True
391
392 - def parse(self, handle, do_features=True) :
407
408
410 """Returns a SeqRecord object iterator
411
412 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord
413
414 The SeqRecord objects include SeqFeatures if do_features=True
415
416 This method is intended for use in Bio.SeqIO
417 """
418
419 while True :
420 record = self.parse(handle, do_features)
421 if record is None : break
422 assert record.id is not None
423 assert record.name != "<unknown name>"
424 assert record.description != "<unknown description>"
425 yield record
426
430 """Returns SeqRecord object iterator
431
432 Each CDS feature becomes a SeqRecord.
433
434 alphabet - Used for any sequence found in a translation field.
435 tags2id - Tupple of three strings, the feature keys to use
436 for the record id, name and description,
437
438 This method is intended for use in Bio.SeqIO
439 """
440 self.set_handle(handle)
441 while self.find_start() :
442
443 self.parse_header()
444 feature_tuples = self.parse_features()
445
446 while True :
447 line = self.handle.readline()
448 if not line : break
449 if line[:2]=="//" : break
450 self.line = line.rstrip()
451
452
453 for key, location_string, qualifiers in feature_tuples :
454 if key=="CDS" :
455
456
457
458
459
460 record = SeqRecord(seq=None)
461 annotations = record.annotations
462
463
464
465
466 annotations['raw_location'] = location_string.replace(' ','')
467
468 for (qualifier_name, qualifier_data) in qualifiers :
469 if qualifier_data is not None \
470 and qualifier_data[0]=='"' and qualifier_data[-1]=='"' :
471
472 qualifier_data = qualifier_data[1:-1]
473
474 if qualifier_name == "translation" :
475 assert record.seq is None, "Multiple translations!"
476 record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
477 elif qualifier_name == "db_xref" :
478
479 record.dbxrefs.append(qualifier_data)
480 else :
481 if qualifier_data is not None :
482 qualifier_data = qualifier_data.replace("\n"," ").replace(" "," ")
483 try :
484 annotations[qualifier_name] += " " + qualifier_data
485 except KeyError :
486
487 annotations[qualifier_name]= qualifier_data
488
489
490
491 try :
492 record.id = annotations[tags2id[0]]
493 except KeyError :
494 pass
495 try :
496 record.name = annotations[tags2id[1]]
497 except KeyError :
498 pass
499 try :
500 record.description = annotations[tags2id[2]]
501 except KeyError :
502 pass
503
504 yield record
505
507 """For extracting chunks of information in EMBL files"""
508
509 RECORD_START = "ID "
510 HEADER_WIDTH = 5
511 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers","FH"]
512 FEATURE_END_MARKERS = ["XX"]
513 FEATURE_QUALIFIER_INDENT = 21
514 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2)
515 SEQUENCE_HEADERS=["SQ"]
516
550
561
563
564
565
566 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
567 fields = [line[self.HEADER_WIDTH:].split(None,1)[0]]
568 fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";"))
569 fields = [entry.strip() for entry in fields]
570 """
571 The tokens represent:
572 0. Primary accession number
573 (space sep)
574 1. ??? (e.g. standard)
575 (semi-colon)
576 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
577 3. Taxonomic division (e.g. 'PRO')
578 4. Sequence length (e.g. '4639675 BP.')
579 """
580 consumer.locus(fields[0])
581 consumer.residue_type(fields[2])
582 consumer.data_file_division(fields[3])
583 self._feed_seq_length(consumer, fields[4])
584
586
587
588
589 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
590 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
591 assert len(fields) == 7
592 """
593 The tokens represent:
594 0. Primary accession number
595 1. Sequence version number
596 2. Topology: 'circular' or 'linear'
597 3. Molecule type (e.g. 'genomic DNA')
598 4. Data class (e.g. 'STD')
599 5. Taxonomic division (e.g. 'PRO')
600 6. Sequence length (e.g. '4639675 BP.')
601 """
602
603 consumer.locus(fields[0])
604
605
606
607 consumer.accession(fields[0])
608
609
610
611 version_parts = fields[1].split()
612 if len(version_parts)==2 \
613 and version_parts[0]=="SV" \
614 and version_parts[1].isdigit() :
615 consumer.version_suffix(version_parts[1])
616
617
618 consumer.residue_type(" ".join(fields[2:4]))
619
620
621
622 consumer.data_file_division(fields[5])
623
624 self._feed_seq_length(consumer, fields[6])
625
627 length_parts = text.split()
628 assert len(length_parts) == 2
629 assert length_parts[1].upper() in ["BP", "BP."]
630 consumer.size(length_parts[0])
631
633 EMBL_INDENT = self.HEADER_WIDTH
634 EMBL_SPACER = " " * EMBL_INDENT
635 consumer_dict = {
636 'AC' : 'accession',
637 'SV' : 'version',
638 'DE' : 'definition',
639
640
641
642 'RA' : 'authors',
643 'RT' : 'title',
644 'RL' : 'journal',
645 'OS' : 'organism',
646 'OC' : 'taxonomy',
647
648 'CC' : 'comment',
649
650 }
651
652
653 lines = filter(None,lines)
654 line_iter = iter(lines)
655 try :
656 while True :
657 try :
658 line = line_iter.next()
659 except StopIteration :
660 break
661 if not line : break
662 line_type = line[:EMBL_INDENT].strip()
663 data = line[EMBL_INDENT:].strip()
664
665 if line_type == 'XX' :
666 pass
667 elif line_type == 'RN' :
668
669
670 if data[0] == "[" and data[-1] == "]" : data = data[1:-1]
671 consumer.reference_num(data)
672 elif line_type == 'RP' :
673
674
675 assert data.count("-")==1
676 consumer.reference_bases("(bases " + data.replace("-", " to ") + ")")
677 elif line_type == 'RX' :
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693 key, value = data.split(";",1)
694 if value.endswith(".") : value = value[:-1]
695 value = value.strip()
696 if key == "PUBMED" :
697 consumer.pubmed_id(value)
698
699 elif line_type == 'CC' :
700
701 consumer.comment([data])
702 elif line_type == 'DR' :
703
704
705
706
707
708
709
710
711 pass
712 elif line_type in consumer_dict :
713
714 getattr(consumer, consumer_dict[line_type])(data)
715 else :
716 if self.debug :
717 print "Ignoring EMBL header line:\n%s" % line
718 except StopIteration :
719 raise ValueError("Problem with header")
720
724
726 """For extracting chunks of information in GenBank files"""
727
728 RECORD_START = "LOCUS "
729 HEADER_WIDTH = 12
730 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers","FEATURES"]
731 FEATURE_END_MARKERS = []
732 FEATURE_QUALIFIER_INDENT = 21
733 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
734 SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]
735
778
780
781
782
783 GENBANK_INDENT = self.HEADER_WIDTH
784 GENBANK_SPACER = " "*GENBANK_INDENT
785 assert line[0:GENBANK_INDENT] == 'LOCUS ', \
786 'LOCUS line does not start correctly:\n' + line
787
788
789
790 if line[29:33] in [' bp ', ' aa ',' rc '] :
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809 assert line[29:33] in [' bp ', ' aa ',' rc '] , \
810 'LOCUS line does not contain size units at expected position:\n' + line
811 assert line[41:42] == ' ', \
812 'LOCUS line does not contain space at position 42:\n' + line
813 assert line[42:51].strip() in ['','linear','circular'], \
814 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
815 assert line[51:52] == ' ', \
816 'LOCUS line does not contain space at position 52:\n' + line
817 assert line[55:62] == ' ', \
818 'LOCUS line does not contain spaces from position 56 to 62:\n' + line
819 if line[62:73].strip() :
820 assert line[64:65] == '-', \
821 'LOCUS line does not contain - at position 65 in date:\n' + line
822 assert line[68:69] == '-', \
823 'LOCUS line does not contain - at position 69 in date:\n' + line
824
825 name_and_length_str = line[GENBANK_INDENT:29]
826 while name_and_length_str.find(' ')!=-1 :
827 name_and_length_str = name_and_length_str.replace(' ',' ')
828 name_and_length = name_and_length_str.split(' ')
829 assert len(name_and_length)<=2, \
830 'Cannot parse the name and length in the LOCUS line:\n' + line
831 assert len(name_and_length)!=1, \
832 'Name and length collide in the LOCUS line:\n' + line
833
834
835
836 consumer.locus(name_and_length[0])
837 consumer.size(name_and_length[1])
838
839
840 if line[33:51].strip() == "" and line[29:33] == ' aa ' :
841
842
843
844
845 consumer.residue_type("PROTEIN")
846 else :
847 consumer.residue_type(line[33:51].strip())
848
849 consumer.data_file_division(line[52:55])
850 if line[62:73].strip() :
851 consumer.date(line[62:73])
852 elif line[40:44] in [' bp ', ' aa ',' rc '] :
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872 assert line[40:44] in [' bp ', ' aa ',' rc '] , \
873 'LOCUS line does not contain size units at expected position:\n' + line
874 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
875 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
876 assert line[47:54].strip() == "" \
877 or line[47:54].strip().find('DNA') != -1 \
878 or line[47:54].strip().find('RNA') != -1, \
879 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
880 assert line[54:55] == ' ', \
881 'LOCUS line does not contain space at position 55:\n' + line
882 assert line[55:63].strip() in ['','linear','circular'], \
883 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
884 assert line[63:64] == ' ', \
885 'LOCUS line does not contain space at position 64:\n' + line
886 assert line[67:68] == ' ', \
887 'LOCUS line does not contain space at position 68:\n' + line
888 if line[68:79].strip() :
889 assert line[70:71] == '-', \
890 'LOCUS line does not contain - at position 71 in date:\n' + line
891 assert line[74:75] == '-', \
892 'LOCUS line does not contain - at position 75 in date:\n' + line
893
894 name_and_length_str = line[GENBANK_INDENT:40]
895 while name_and_length_str.find(' ')!=-1 :
896 name_and_length_str = name_and_length_str.replace(' ',' ')
897 name_and_length = name_and_length_str.split(' ')
898 assert len(name_and_length)<=2, \
899 'Cannot parse the name and length in the LOCUS line:\n' + line
900 assert len(name_and_length)!=1, \
901 'Name and length collide in the LOCUS line:\n' + line
902
903
904
905 consumer.locus(name_and_length[0])
906 consumer.size(name_and_length[1])
907
908 if line[44:54].strip() == "" and line[40:44] == ' aa ' :
909
910
911
912
913 consumer.residue_type(("PROTEIN " + line[54:63]).strip())
914 else :
915 consumer.residue_type(line[44:63].strip())
916
917 consumer.data_file_division(line[64:67])
918 if line[68:79].strip() :
919 consumer.date(line[68:79])
920 elif line[GENBANK_INDENT:].strip().count(" ")==0 :
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936 if line[GENBANK_INDENT:].strip() != "" :
937 consumer.locus(line[GENBANK_INDENT:].strip())
938 else :
939
940
941 warnings.warn("Minimal LOCUS line found - is this correct?\n" + line)
942 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"] :
943
944
945 consumer.locus(line.split()[1])
946 consumer.size(line.split()[2])
947 warnings.warn("Malformed LOCUS line found - is this correct?\n" + line)
948 else :
949 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
950
951
953
954
955
956
957 GENBANK_INDENT = self.HEADER_WIDTH
958 GENBANK_SPACER = " "*GENBANK_INDENT
959 consumer_dict = {
960 'DEFINITION' : 'definition',
961 'ACCESSION' : 'accession',
962 'NID' : 'nid',
963 'PID' : 'pid',
964 'DBSOURCE' : 'db_source',
965 'KEYWORDS' : 'keywords',
966 'SEGMENT' : 'segment',
967 'SOURCE' : 'source',
968 'AUTHORS' : 'authors',
969 'CONSRTM' : 'consrtm',
970 'PROJECT' : 'project',
971 'DBLINK' : 'dblink',
972 'TITLE' : 'title',
973 'JOURNAL' : 'journal',
974 'MEDLINE' : 'medline_id',
975 'PUBMED' : 'pubmed_id',
976 'REMARK' : 'remark'}
977
978
979
980
981
982
983 lines = filter(None,lines)
984 lines.append("")
985 line_iter = iter(lines)
986 try :
987 line = line_iter.next()
988 while True :
989 if not line : break
990 line_type = line[:GENBANK_INDENT].strip()
991 data = line[GENBANK_INDENT:].strip()
992
993 if line_type == 'VERSION' :
994
995
996
997 while data.find(' ')!=-1:
998 data = data.replace(' ',' ')
999 if data.find(' GI:')==-1 :
1000 consumer.version(data)
1001 else :
1002 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]"
1003 consumer.version(data.split(' GI:')[0])
1004 consumer.gi(data.split(' GI:')[1])
1005
1006 line = line_iter.next()
1007 elif line_type == 'REFERENCE' :
1008 if self.debug >1 : print "Found reference [" + data + "]"
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019 data = data.strip()
1020
1021
1022 while True:
1023 line = line_iter.next()
1024 if line[:GENBANK_INDENT] == GENBANK_SPACER :
1025
1026 data += " " + line[GENBANK_INDENT:]
1027 if self.debug >1 : print "Extended reference text [" + data + "]"
1028 else :
1029
1030 break
1031
1032
1033
1034 while data.find(' ')!=-1:
1035 data = data.replace(' ',' ')
1036 if data.find(' ')==-1 :
1037 if self.debug >2 : print 'Reference number \"' + data + '\"'
1038 consumer.reference_num(data)
1039 else :
1040 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"'
1041 consumer.reference_num(data[:data.find(' ')])
1042 consumer.reference_bases(data[data.find(' ')+1:])
1043 elif line_type == 'ORGANISM' :
1044
1045
1046
1047
1048
1049
1050
1051
1052 organism_data = data
1053 lineage_data = ""
1054 while True :
1055 line = line_iter.next()
1056 if line[0:GENBANK_INDENT] == GENBANK_SPACER :
1057 if lineage_data or ";" in line :
1058 lineage_data += " " + line[GENBANK_INDENT:]
1059 else :
1060 organism_data += " " + line[GENBANK_INDENT:].strip()
1061 else :
1062
1063 break
1064 consumer.organism(organism_data)
1065 if lineage_data.strip() == "" and self.debug > 1 :
1066 print "Taxonomy line(s) missing or blank"
1067 consumer.taxonomy(lineage_data.strip())
1068 del organism_data, lineage_data
1069 elif line_type == 'COMMENT' :
1070 if self.debug > 1 : print "Found comment"
1071
1072
1073 comment_list=[]
1074 comment_list.append(data)
1075 while True:
1076 line = line_iter.next()
1077 if line[0:GENBANK_INDENT] == GENBANK_SPACER :
1078 data = line[GENBANK_INDENT:]
1079 comment_list.append(data)
1080 if self.debug > 2 : print "Comment continuation [" + data + "]"
1081 else :
1082
1083 break
1084 consumer.comment(comment_list)
1085 del comment_list
1086 elif line_type in consumer_dict :
1087
1088
1089 while True :
1090 line = line_iter.next()
1091 if line[0:GENBANK_INDENT] == GENBANK_SPACER :
1092 data += ' ' + line[GENBANK_INDENT:]
1093 else :
1094
1095 getattr(consumer, consumer_dict[line_type])(data)
1096
1097 break
1098 else :
1099 if self.debug :
1100 print "Ignoring GenBank header line:\n" % line
1101
1102 line = line_iter.next()
1103 except StopIteration :
1104 raise ValueError("Problem in header")
1105
1146
1147 if __name__ == "__main__" :
1148 from StringIO import StringIO
1149
1150 gbk_example = \
1151 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
1152 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
1153 (AXL2) and Rev7p (REV7) genes, complete cds.
1154 ACCESSION U49845
1155 VERSION U49845.1 GI:1293613
1156 KEYWORDS .
1157 SOURCE Saccharomyces cerevisiae (baker's yeast)
1158 ORGANISM Saccharomyces cerevisiae
1159 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
1160 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
1161 REFERENCE 1 (bases 1 to 5028)
1162 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
1163 TITLE Cloning and sequence of REV7, a gene whose function is required for
1164 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
1165 JOURNAL Yeast 10 (11), 1503-1509 (1994)
1166 PUBMED 7871890
1167 REFERENCE 2 (bases 1 to 5028)
1168 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
1169 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
1170 plasma membrane glycoprotein
1171 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
1172 PUBMED 8846915
1173 REFERENCE 3 (bases 1 to 5028)
1174 AUTHORS Roemer,T.
1175 TITLE Direct Submission
1176 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
1177 Haven, CT, USA
1178 FEATURES Location/Qualifiers
1179 source 1..5028
1180 /organism="Saccharomyces cerevisiae"
1181 /db_xref="taxon:4932"
1182 /chromosome="IX"
1183 /map="9"
1184 CDS <1..206
1185 /codon_start=3
1186 /product="TCP1-beta"
1187 /protein_id="AAA98665.1"
1188 /db_xref="GI:1293614"
1189 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
1190 AEVLLRVDNIIRARPRTANRQHM"
1191 gene 687..3158
1192 /gene="AXL2"
1193 CDS 687..3158
1194 /gene="AXL2"
1195 /note="plasma membrane glycoprotein"
1196 /codon_start=1
1197 /function="required for axial budding pattern of S.
1198 cerevisiae"
1199 /product="Axl2p"
1200 /protein_id="AAA98666.1"
1201 /db_xref="GI:1293615"
1202 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
1203 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
1204 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
1205 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
1206 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
1207 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
1208 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
1209 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
1210 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
1211 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
1212 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
1213 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
1214 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
1215 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
1216 VDFSNKSNVNVGQVKDIHGRIPEML"
1217 gene complement(3300..4037)
1218 /gene="REV7"
1219 CDS complement(3300..4037)
1220 /gene="REV7"
1221 /codon_start=1
1222 /product="Rev7p"
1223 /protein_id="AAA98667.1"
1224 /db_xref="GI:1293616"
1225 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
1226 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
1227 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
1228 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
1229 LISGDDKILNGVYSQYEEGESIFGSLF"
1230 ORIGIN
1231 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
1232 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
1233 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
1234 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
1235 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
1236 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
1237 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
1238 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
1239 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
1240 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
1241 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
1242 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
1243 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
1244 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
1245 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
1246 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
1247 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
1248 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
1249 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
1250 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
1251 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
1252 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
1253 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
1254 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
1255 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
1256 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
1257 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
1258 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
1259 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
1260 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
1261 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
1262 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
1263 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
1264 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
1265 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
1266 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
1267 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
1268 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
1269 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
1270 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
1271 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
1272 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
1273 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
1274 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
1275 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
1276 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
1277 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
1278 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
1279 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
1280 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
1281 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
1282 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
1283 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
1284 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
1285 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
1286 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
1287 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
1288 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
1289 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
1290 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
1291 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
1292 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
1293 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
1294 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
1295 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
1296 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
1297 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
1298 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
1299 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
1300 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
1301 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
1302 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
1303 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
1304 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
1305 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
1306 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
1307 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
1308 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
1309 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
1310 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
1311 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
1312 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
1313 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
1314 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1315 //"""
1316
1317
1318
1319 gbk_example2 = \
1320 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1321 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1322 ACCESSION AAD51968
1323 VERSION AAD51968.1 GI:5805369
1324 DBSOURCE locus AF171097 accession AF171097.1
1325 KEYWORDS .
1326 SOURCE Yersinia enterocolitica
1327 ORGANISM Yersinia enterocolitica
1328 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1329 Enterobacteriaceae; Yersinia.
1330 REFERENCE 1 (residues 1 to 143)
1331 AUTHORS Revell,P.A. and Miller,V.L.
1332 TITLE A chromosomally encoded regulator is required for expression of the
1333 Yersinia enterocolitica inv gene and for virulence
1334 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1335 MEDLINE 20138369
1336 PUBMED 10672189
1337 REFERENCE 2 (residues 1 to 143)
1338 AUTHORS Revell,P.A. and Miller,V.L.
1339 TITLE Direct Submission
1340 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1341 University School of Medicine, Campus Box 8230, 660 South Euclid,
1342 St. Louis, MO 63110, USA
1343 COMMENT Method: conceptual translation.
1344 FEATURES Location/Qualifiers
1345 source 1..143
1346 /organism="Yersinia enterocolitica"
1347 /mol_type="unassigned DNA"
1348 /strain="JB580v"
1349 /serotype="O:8"
1350 /db_xref="taxon:630"
1351 Protein 1..143
1352 /product="transcriptional regulator RovA"
1353 /name="regulates inv expression"
1354 CDS 1..143
1355 /gene="rovA"
1356 /coded_by="AF171097.1:380..811"
1357 /note="regulator of virulence"
1358 /transl_table=11
1359 ORIGIN
1360 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1361 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1362 121 deiellsgli dklerniiql qsk
1363 //
1364 """
1365
1366 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
1367 XX
1368 AC X56734; S46826;
1369 XX
1370 DT 12-SEP-1991 (Rel. 29, Created)
1371 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
1372 XX
1373 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
1374 XX
1375 KW beta-glucosidase.
1376 XX
1377 OS Trifolium repens (white clover)
1378 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
1379 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
1380 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
1381 XX
1382 RN [5]
1383 RP 1-1859
1384 RX PUBMED; 1907511.
1385 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
1386 RT "Nucleotide and derived amino acid sequence of the cyanogenic
1387 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
1388 RL Plant Mol. Biol. 17(2):209-219(1991).
1389 XX
1390 RN [6]
1391 RP 1-1859
1392 RA Hughes M.A.;
1393 RT ;
1394 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
1395 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
1396 RL Upon Tyne, NE2 4HH, UK
1397 XX
1398 FH Key Location/Qualifiers
1399 FH
1400 FT source 1..1859
1401 FT /organism="Trifolium repens"
1402 FT /mol_type="mRNA"
1403 FT /clone_lib="lambda gt10"
1404 FT /clone="TRE361"
1405 FT /tissue_type="leaves"
1406 FT /db_xref="taxon:3899"
1407 FT CDS 14..1495
1408 FT /product="beta-glucosidase"
1409 FT /EC_number="3.2.1.21"
1410 FT /note="non-cyanogenic"
1411 FT /db_xref="GOA:P26204"
1412 FT /db_xref="InterPro:IPR001360"
1413 FT /db_xref="InterPro:IPR013781"
1414 FT /db_xref="UniProtKB/Swiss-Prot:P26204"
1415 FT /protein_id="CAA40058.1"
1416 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
1417 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
1418 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
1419 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
1420 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
1421 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
1422 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
1423 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
1424 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
1425 FT mRNA 1..1859
1426 FT /experiment="experimental evidence, no additional details
1427 FT recorded"
1428 XX
1429 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
1430 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
1431 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
1432 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
1433 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
1434 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
1435 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
1436 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
1437 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
1438 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
1439 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
1440 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
1441 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
1442 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
1443 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
1444 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
1445 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
1446 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
1447 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
1448 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
1449 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
1450 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
1451 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
1452 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
1453 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
1454 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
1455 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
1456 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
1457 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
1458 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
1459 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
1460 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
1461 //
1462 """
1463
1464 print "GenBank CDS Iteration"
1465 print "====================="
1466
1467 g = GenBankScanner()
1468 for record in g.parse_cds_features(StringIO(gbk_example)) :
1469 print record
1470
1471 g = GenBankScanner()
1472 for record in g.parse_cds_features(StringIO(gbk_example2),
1473 tags2id=('gene','locus_tag','product')) :
1474 print record
1475
1476 g = GenBankScanner()
1477 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2),
1478 tags2id=('gene','locus_tag','product')) :
1479 print record
1480
1481 print
1482 print "GenBank Iteration"
1483 print "================="
1484 g = GenBankScanner()
1485 for record in g.parse_records(StringIO(gbk_example),do_features=False) :
1486 print record.id, record.name, record.description
1487 print record.seq
1488
1489 g = GenBankScanner()
1490 for record in g.parse_records(StringIO(gbk_example),do_features=True) :
1491 print record.id, record.name, record.description
1492 print record.seq
1493
1494 g = GenBankScanner()
1495 for record in g.parse_records(StringIO(gbk_example2),do_features=False) :
1496 print record.id, record.name, record.description
1497 print record.seq
1498
1499 g = GenBankScanner()
1500 for record in g.parse_records(StringIO(gbk_example2),do_features=True) :
1501 print record.id, record.name, record.description
1502 print record.seq
1503
1504 print
1505 print "EMBL CDS Iteration"
1506 print "=================="
1507
1508 e = EmblScanner()
1509 for record in e.parse_cds_features(StringIO(embl_example)) :
1510 print record
1511
1512 print
1513 print "EMBL Iteration"
1514 print "=============="
1515 e = EmblScanner()
1516 for record in e.parse_records(StringIO(embl_example),do_features=True) :
1517 print record.id, record.name, record.description
1518 print record.seq
1519