1
2
3
4
5
6
"""Code to work with GenBank formatted files.

Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
the "genbank" or "embl" format names to parse GenBank or EMBL files into
SeqRecord and SeqFeature objects (see the Biopython tutorial for details).

Also, rather than using Bio.GenBank to search or download files from the NCBI,
you are now encouraged to use Bio.Entrez instead (again, see the Biopython
tutorial for details).

Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser
which turns a GenBank file into GenBank-specific Record objects.  This is a
much closer representation to the raw file contents than the SeqRecord
alternative from the FeatureParser (used in Bio.SeqIO).

Classes:
Iterator              Iterate through a file of GenBank entries
ErrorFeatureParser    Catch errors caused during parsing.
FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects.
RecordParser          Parse GenBank data into a Record object.
NCBIDictionary        Access GenBank using a dictionary interface (DEPRECATED).

_BaseGenBankConsumer  A base class for GenBank consumer that implements
                      some helpful functions that are in common between
                      consumers.
_FeatureConsumer      Create SeqFeature objects from info generated by
                      the Scanner
_RecordConsumer       Create a GenBank record object from Scanner info.
_PrintingConsumer     A debugging consumer.

ParserFailureError    Exception indicating a failure in the parser (ie.
                      scanner or consumer)
LocationParserError   Exception indicating a problem with the spark based
                      location parser.

Functions:
search_for            Do a query against GenBank (DEPRECATED).
download_many         Download many GenBank records (DEPRECATED).

17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records.
These are GenBank files that summarize the content of a project, and provide lists of
scaffold and contig files in the project. These will be in annotations['wgs'] and
annotations['wgs_scafld']. These GenBank files do not have sequences. See
http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36

http://is.gd/nNgk
for more details of this format, and an example.
Added by Ying Huang & Iddo Friedberg
"""
56 import cStringIO
57
58
59 from Bio import SeqFeature
60 from Bio.ParserSupport import AbstractConsumer
61 from Bio import Entrez
62
63
64 import LocationParser
65 from utils import FeatureValueCleaner
66 from Scanner import GenBankScanner
67
68
# Indent width used for GenBank header lines, and a run of spaces that wide.
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

# Indent widths used for feature keys and feature qualifiers, and the
# matching runs of spaces.
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
77
79 """Iterator interface to move over a file of GenBank entries one at a time.
80 """
def __init__(self, handle, parser=None):
    """Initialize the iterator.

    Arguments:
    o handle - A handle with GenBank entries to iterate through.
    o parser - An optional parser to pass the entries through before
    returning them. If None, then the raw entry will be returned.
    """
    self.handle = handle
    self._parser = parser
91
def next(self):
    """Return the next GenBank record from the handle.

    Without a parser, the raw text of one entry is returned: every line
    read up to and including the line that is "//" once trailing
    whitespace is stripped.  With a parser, the handle is passed to
    parser.parse() and its result is returned.

    Will return None if we ran out of records, i.e. when readline()
    returns an empty string or the parser raises StopIteration.  Lines
    read before an end of file that is not preceded by "//" are dropped.
    """
    if self._parser is not None:
        try:
            return self._parser.parse(self.handle)
        except StopIteration:
            return None

    lines = []
    while True:
        line = self.handle.readline()
        if not line:
            # End of data before a terminator line: nothing to return.
            return None
        lines.append(line)
        if line.rstrip() == "//":
            return "".join(lines)
109
111 return iter(self.next, None)
112
114 """Failure caused by some kind of problem in the parser.
115 """
116 pass
117
119 """Could not Properly parse out a location from a GenBank file.
120 """
121 pass
122
124 """Parse GenBank files into Seq + Feature objects.
125 """
128 """Initialize a GenBank parser and Feature consumer.
129
130 Arguments:
131 o debug_level - An optional argument that species the amount of
132 debugging information the parser should spit out. By default we have
133 no debugging info (the fastest way to do things), but if you want
134 you can set this as high as two and see exactly where a parse fails.
135 o use_fuzziness - Specify whether or not to use fuzzy representations.
136 The default is 1 (use fuzziness).
137 o feature_cleaner - A class which will be used to clean out the
138 values of features. This class must implement the function
139 clean_value. GenBank.utils has a "standard" cleaner class, which
140 is used by default.
141 """
142 self._scanner = GenBankScanner(debug_level)
143 self.use_fuzziness = use_fuzziness
144 self._cleaner = feature_cleaner
145
def parse(self, handle):
    """Parse the specified handle.

    A new _FeatureConsumer is created for every call from this parser's
    use_fuzziness flag and feature cleaner; the scanner feeds the handle
    to it and the consumer's data attribute is returned.
    """
    self._consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
    self._scanner.feed(handle, self._consumer)
    return self._consumer.data
153
155 """Parse GenBank files into Record objects
156 """
158 """Initialize the parser.
159
160 Arguments:
161 o debug_level - An optional argument that species the amount of
162 debugging information the parser should spit out. By default we have
163 no debugging info (the fastest way to do things), but if you want
164 you can set this as high as two and see exactly where a parse fails.
165 """
166 self._scanner = GenBankScanner(debug_level)
167
def parse(self, handle):
    """Parse the specified handle into a GenBank record.

    A new _RecordConsumer is created for every call; the scanner feeds
    the handle to it and the consumer's data attribute is returned.
    """
    self._consumer = _RecordConsumer()
    self._scanner.feed(handle, self._consumer)
    return self._consumer.data
174
176 """Abstract GenBank consumer providing useful general functions.
177
178 This just helps to eliminate some duplication in things that most
179 GenBank consumers want to do.
180 """
181
182
183
184
185 remove_space_keys = ["translation"]
186
189
191 """Split a string of keywords into a nice clean list.
192 """
193
194 if keyword_string == "" or keyword_string == "." :
195 keywords = ""
196 elif keyword_string[-1] == '.':
197 keywords = keyword_string[:-1]
198 else:
199 keywords = keyword_string
200 keyword_list = keywords.split(';')
201 clean_keyword_list = [x.strip() for x in keyword_list]
202 return clean_keyword_list
203
def _split_accessions(self, accession_string):
    """Split a string of accession numbers into a list.

    Newlines and semicolons are treated like spaces, so the accessions
    may be separated by any mix of the three.  Empty items are dropped.
    """
    # str.split() without an argument already strips every item and
    # discards empty ones, so no extra filtering is required.
    return accession_string.replace("\n", " ").replace(";", " ").split()
212
214 """Split a string with taxonomy info into a list.
215 """
216 if not taxonomy_string or taxonomy_string=="." :
217
218 return []
219
220 if taxonomy_string[-1] == '.':
221 tax_info = taxonomy_string[:-1]
222 else:
223 tax_info = taxonomy_string
224 tax_list = tax_info.split(';')
225 new_tax_list = []
226 for tax_item in tax_list:
227 new_items = tax_item.split("\n")
228 new_tax_list.extend(new_items)
229 while '' in new_tax_list:
230 new_tax_list.remove('')
231 clean_tax_list = [x.strip() for x in new_tax_list]
232
233 return clean_tax_list
234
def _clean_location(self, location_string):
    """Clean whitespace out of a location string.

    The location parser isn't a fan of whitespace, so we clean it out
    before feeding it into the parser.
    """
    # Splitting on runs of whitespace and re-joining removes spaces, tabs
    # and line breaks in a single pass.
    return "".join(location_string.split())
245
247 """Remove any newlines in the passed text, returning the new string.
248 """
249
250 newlines = ["\n", "\r"]
251 for ws in newlines:
252 text = text.replace(ws, "")
253
254 return text
255
257 """Replace multiple spaces in the passed text with single spaces.
258 """
259
260 text_parts = text.split(" ")
261 text_parts = filter(None, text_parts)
262 return ' '.join(text_parts)
263
265 """Remove all spaces from the passed text.
266 """
267 return text.replace(" ", "")
268
def _convert_to_python_numbers(self, start, end):
    """Convert a start and end range to python notation.

    In GenBank, starts and ends are defined in "biological" coordinates,
    where 1 is the first base and [i, j] means to include both i and j.

    In python, 0 is the first base and [i, j] means to include i, but
    not j.

    So, to convert "biological" to python coordinates, we need to
    subtract 1 from the start, and leave the end and things should
    be converted happily.

    Returns the tuple (start - 1, end).
    """
    return start - 1, end
286
288 """Create a SeqRecord object with Features to return.
289
290 Attributes:
291 o use_fuzziness - specify whether or not to parse with fuzziness in
292 feature locations.
293 o feature_cleaner - a class that will be used to provide specialized
294 cleaning-up of feature values.
295 """
def __init__(self, use_fuzziness, feature_cleaner=None):
    """Create the empty SeqRecord and reset all parsing state.

    Arguments:
    o use_fuzziness - stored and consulted when positions are built.
    o feature_cleaner - optional object whose clean_value method is
    applied to qualifier values; None disables cleaning.
    """
    # Imported here rather than at module level, as in the original code.
    from Bio.SeqRecord import SeqRecord
    _BaseGenBankConsumer.__init__(self)
    self.data = SeqRecord(None, id=None)
    self.data.id = None
    self.data.description = ""

    self._use_fuzziness = use_fuzziness
    self._feature_cleaner = feature_cleaner

    # State accumulated while the scanner calls back into the consumer.
    self._seq_type = ''
    self._seq_data = []
    self._current_ref = None
    self._cur_feature = None
    self._cur_qualifier_key = None
    self._cur_qualifier_value = None
    self._expected_size = None
313
def locus(self, locus_name):
    """Set the locus name as the name of the sequence record."""
    self.data.name = locus_name
318
def size(self, content):
    """Record the sequence length.

    content is converted with int(), so a value that is not an integer
    literal raises ValueError.
    """
    self._expected_size = int(content)
322
324 """Record the sequence type so we can choose an appropriate alphabet.
325 """
326 self._seq_type = type
327
330
331 - def date(self, submit_date):
333
343
345 """Set the accession number as the id of the sequence.
346
347 If we have multiple accession numbers, the first one passed is
348 used.
349 """
350 new_acc_nums = self._split_accessions(acc_num)
351
352
353 try :
354
355 for acc in new_acc_nums :
356
357 if acc not in self.data.annotations['accessions'] :
358 self.data.annotations['accessions'].append(acc)
359 except KeyError :
360 self.data.annotations['accessions'] = new_acc_nums
361
362
363 if self.data.id is None:
364 if len(new_acc_nums) > 0:
365
366
367 self.data.id = self.data.annotations['accessions'][0]
368
369 - def wgs(self, content):
371
374
375 - def nid(self, content):
377
378 - def pid(self, content):
380
393
395 """Handle the information from the PROJECT line as a list of projects.
396
397 e.g.
398 PROJECT GenomeProject:28471
399
400 or:
401 PROJECT GenomeProject:13543 GenomeProject:99999
402
403 This is stored as dbxrefs in the SeqRecord to be consistent with the
404 projected switch of this line to DBLINK in future GenBank versions.
405 Note the NCBI plan to replace "GenomeProject:28471" with the shorter
406 "Project:28471" as part of this transition.
407 """
408 content = content.replace("GenomeProject:", "Project:")
409 self.data.dbxrefs.extend([p for p in content.split() if p])
410
412 """Store DBLINK cross references as dbxrefs in our record object.
413
414 This line type is expected to replace the PROJECT line in 2009. e.g.
415
416 During transition:
417
418 PROJECT GenomeProject:28471
419 DBLINK Project:28471
420 Trace Assembly Archive:123456
421
422 Once the project line is dropped:
423
424 DBLINK Project:28471
425 Trace Assembly Archive:123456
426
427 Note GenomeProject -> Project.
428
429 We'll have to see some real examples to be sure, but based on the
430 above example we can expect one reference per line.
431 """
432
433
434 if content.strip() not in self.data.dbxrefs :
435 self.data.dbxrefs.append(content.strip())
436
438 """Set the version to overwrite the id.
439
440 Since the verison provides the same information as the accession
441 number, plus some extra info, we set this as the id if we have
442 a version.
443 """
444
445
446
447
448
449
450
451
452
453
454 assert version.isdigit()
455 self.data.annotations['sequence_version'] = int(version)
456
459
460 - def gi(self, content):
462
465
468
470
471
472 if content == "" :
473 source_info = ""
474 elif content[-1] == '.':
475 source_info = content[:-1]
476 else:
477 source_info = content
478 self.data.annotations['source'] = source_info
479
482
491
493 """Signal the beginning of a new reference object.
494 """
495
496
497 if self._current_ref is not None:
498 self.data.annotations['references'].append(self._current_ref)
499 else:
500 self.data.annotations['references'] = []
501
502 self._current_ref = SeqFeature.Reference()
503
505 """Attempt to determine the sequence region the reference entails.
506
507 Possible types of information we may have to deal with:
508
509 (bases 1 to 86436)
510 (sites)
511 (bases 1 to 105654; 110423 to 111122)
512 1 (residues 1 to 182)
513 """
514
515 ref_base_info = content[1:-1]
516
517 all_locations = []
518
519 if ref_base_info.find('bases') != -1 and \
520 ref_base_info.find('to') != -1:
521
522 ref_base_info = ref_base_info[5:]
523 locations = self._split_reference_locations(ref_base_info)
524 all_locations.extend(locations)
525 elif (ref_base_info.find("residues") >= 0 and
526 ref_base_info.find("to") >= 0):
527 residues_start = ref_base_info.find("residues")
528
529 ref_base_info = ref_base_info[(residues_start + len("residues ")):]
530 locations = self._split_reference_locations(ref_base_info)
531 all_locations.extend(locations)
532
533
534
535 elif (ref_base_info == 'sites' or
536 ref_base_info.strip() == 'bases'):
537 pass
538
539 else:
540 raise ValueError("Could not parse base info %s in record %s" %
541 (ref_base_info, self.data.id))
542
543 self._current_ref.location = all_locations
544
def _split_reference_locations(self, location_string):
    """Get reference locations out of a string of reference information

    The passed string should be of the form:

    1 to 20; 20 to 100

    This splits the information out and returns a list of location objects
    based on the reference locations.

    Each "start to end" pair is converted with
    _convert_to_python_numbers before the FeatureLocation is built.  A
    piece that does not split into exactly two parts around 'to', or
    whose parts are not integers, raises ValueError.
    """
    new_locations = []
    for base_info in location_string.split(';'):
        start, end = base_info.split('to')
        new_start, new_end = self._convert_to_python_numbers(
            int(start.strip()), int(end.strip()))
        new_locations.append(SeqFeature.FeatureLocation(new_start, new_end))
    return new_locations
567
569 if self._current_ref.authors :
570 self._current_ref.authors += ' ' + content
571 else :
572 self._current_ref.authors = content
573
575 if self._current_ref.consrtm :
576 self._current_ref.consrtm += ' ' + content
577 else :
578 self._current_ref.consrtm = content
579
def title(self, content):
    """Add text to the title of the current reference.

    The first call stores content as the title; later calls append it
    after a single space.
    """
    if self._current_ref.title:
        self._current_ref.title += ' ' + content
    else:
        self._current_ref.title = content
585
587 if self._current_ref.journal :
588 self._current_ref.journal += ' ' + content
589 else :
590 self._current_ref.journal = content
591
594
597
599 """Deal with a reference comment."""
600 if self._current_ref.comment :
601 self._current_ref.comment += ' ' + content
602 else :
603 self._current_ref.comment = content
604
610
612 """Get ready for the feature table when we reach the FEATURE line.
613 """
614 self.start_feature_table()
615
def start_feature_table(self):
    """Indicate we've got to the start of the feature table.

    A reference that is still being built is appended to the record's
    'references' annotation and the current reference is cleared.
    """
    if self._current_ref is not None:
        self.data.annotations['references'].append(self._current_ref)
        self._current_ref = None
623
def _add_feature(self):
    """Utility function to add a feature to the SeqRecord.

    This does all of the appropriate checking to make sure we haven't
    left any info behind, and that we are only adding info if it
    exists.
    """
    # Test against None explicitly (as the Record consumer's _add_feature
    # does) so the check does not depend on the feature's truth value.
    if self._cur_feature is not None:
        # Flush a pending qualifier onto the feature before storing it.
        self._add_qualifier()

        self._cur_qualifier_key = ''
        self._cur_qualifier_value = ''
        self.data.features.append(self._cur_feature)
639
653
def location(self, content):
    """Parse out location information from the location string.

    This uses a comprehensive but slow spark based parser to do the
    parsing, and then translates the results of the parse into appropriate
    Location objects.

    The result is applied to the current feature.  Raises
    LocationParserError, carrying the cleaned location text, when the
    location parser exits with SystemExit.
    """
    location_line = self._clean_location(content)

    # A location containing 'replace' is reduced to the text between
    # index 8 (the length of 'replace(') and the first comma.
    # NOTE(review): with no comma present find() returns -1 and the slice
    # just drops the final character -- confirm such input cannot occur.
    if location_line.find('replace') != -1:
        comma_pos = location_line.find(',')
        location_line = location_line[8:comma_pos]

    try:
        parse_info = LocationParser.parse(LocationParser.scan(location_line))
    except SystemExit:
        # Turn the parser's SystemExit into an ordinary exception.
        raise LocationParserError(location_line)

    self._set_location_info(parse_info, self._cur_feature)
690
def _set_function(self, function, cur_feature):
    """Set the location information based on a function.

    This handles all of the location functions: 'complement', 'join',
    'order', 'one-of', 'bond' and 'gap'.

    Arguments:
    o function - A LocationParser.Function object specifying the
    function we are acting on.
    o cur_feature - The feature to add information to.

    Raises ValueError for any other function name.
    """
    assert isinstance(function, LocationParser.Function), \
        "Expected a Function object, got %s" % function

    if function.name == "complement":
        # Mark the strand first, then let the wrapped location(s) fill in
        # the rest of the feature.
        cur_feature.strand = -1
        for inner_info in function.args:
            self._set_location_info(inner_info, cur_feature)
    elif function.name in ("join", "order", "one-of", "bond"):
        self._set_ordering_info(function, cur_feature)
    elif function.name == "gap":
        assert len(function.args) == 1, \
            "Unexpected number of arguments in gap %s" % function.args
        position = self._get_position(function.args[0].local_location)
        cur_feature.location = SeqFeature.PositionGap(position)
    else:
        raise ValueError("Unexpected function name: %s" % function.name)
729
def _set_ordering_info(self, function, cur_feature):
    """Parse a join or order and all of the information in it.

    This deals with functions that order a bunch of locations,
    specifically 'join' and 'order'. The inner locations are
    added as subfeatures of the top level feature

    The parent's location then runs from the start of the first
    sub-feature to the end of the last one.
    """
    cur_feature.location_operator = function.name
    for inner_element in function.args:
        # Each sub-feature copies the parent's type, operator, reference
        # details and strand before its own location is parsed.
        new_sub_feature = SeqFeature.SeqFeature()
        new_sub_feature.type = cur_feature.type
        new_sub_feature.location_operator = function.name
        new_sub_feature.ref = cur_feature.ref
        new_sub_feature.ref_db = cur_feature.ref_db
        new_sub_feature.strand = cur_feature.strand

        self._set_location_info(inner_element, new_sub_feature)
        cur_feature.sub_features.append(new_sub_feature)

    feature_start = cur_feature.sub_features[0].location.start
    feature_end = cur_feature.sub_features[-1].location.end
    cur_feature.location = SeqFeature.FeatureLocation(feature_start,
                                                      feature_end)
768
def _set_location_info(self, parse_info, cur_feature):
    """Set the location information for a feature from the parse info.

    Arguments:
    o parse_info - The classes generated by the LocationParser.
    o cur_feature - The feature to add the information to.

    None is accepted and ignored; anything other than an
    AbsoluteLocation or a Function raises ValueError.
    """
    if parse_info is None:
        return

    if isinstance(parse_info, LocationParser.AbsoluteLocation):
        self._set_location(parse_info, cur_feature)
    elif isinstance(parse_info, LocationParser.Function):
        self._set_function(parse_info, cur_feature)
    else:
        raise ValueError("Could not parse location info: %s"
                         % parse_info)
791
def _set_location(self, location, cur_feature):
    """Set the location information for a feature.

    Arguments:
    o location - An AbsoluteLocation object specifying the info
    about the location.
    o cur_feature - The feature to add the information to.
    """
    # A path means the location refers to another record: keep its
    # accession and database on the feature.
    if location.path is not None:
        cur_feature.ref = location.path.accession
        cur_feature.ref_db = location.path.database

    cur_feature.location = self._get_location(location.local_location)
807
def _get_location(self, range_info):
    """Return a (possibly fuzzy) location from a Range object.

    Arguments:
    o range_info - A location range (ie. something like 67..100). This
    may also be a single position (ie 27).

    This returns a FeatureLocation object.
    If parser.use_fuzziness is set at one, the positions for the
    end points will possibly be fuzzy.
    """
    if isinstance(range_info, LocationParser.Between) \
       and range_info.low.val + 1 == range_info.high.val:
        # A Between whose bounds are adjacent: both ends of the location
        # are the (unshifted) low position.
        pos = self._get_position(range_info.low)
        return SeqFeature.FeatureLocation(pos, pos)

    if not isinstance(range_info, LocationParser.Range):
        # A single position: shift it back by one for python counting.
        pos = self._get_position(range_info)
        pos.position = pos.position - 1
        return SeqFeature.FeatureLocation(pos, pos)

    # A genuine range: convert both end points together.
    start_pos = self._get_position(range_info.low)
    end_pos = self._get_position(range_info.high)
    start_pos.position, end_pos.position = \
        self._convert_to_python_numbers(start_pos.position,
                                        end_pos.position)
    return SeqFeature.FeatureLocation(start_pos, end_pos)
846
def _get_position(self, position):
    """Return a (possibly fuzzy) position for a single coordinate.

    Arguments:
    o position - This is a LocationParser.* object that specifies
    a single coordinate. We will examine the object to determine
    the fuzziness of the position.

    This is used with _get_location to parse out a location of any
    end_point of arbitrary fuzziness.

    With fuzziness switched off the result is reduced to an
    ExactPosition built from the fuzzy object's position attribute.
    Raises ValueError for an unrecognised LocationParser object.
    """
    if isinstance(position, LocationParser.Integer):
        final_pos = SeqFeature.ExactPosition(position.val)
    elif isinstance(position, LocationParser.LowBound):
        final_pos = SeqFeature.AfterPosition(position.base.val)
    elif isinstance(position, LocationParser.HighBound):
        final_pos = SeqFeature.BeforePosition(position.base.val)
    elif isinstance(position, LocationParser.Between):
        # Built from the low value and the distance up to the high value.
        final_pos = SeqFeature.BetweenPosition(
            position.low.val, position.high.val - position.low.val)
    elif isinstance(position, LocationParser.TwoBound):
        final_pos = SeqFeature.WithinPosition(
            position.low.val, position.high.val - position.low.val)
    elif isinstance(position, LocationParser.Function) and \
            position.name == "one-of":
        position_choices = []
        for arg in position.args:
            assert isinstance(arg, LocationParser.AbsoluteLocation), \
                "Unhandled Location type %r" % arg
            assert arg.path is None, "Unhandled path in location"
            # Do not rebind the 'position' parameter while iterating.
            position_choices.append(self._get_position(arg.local_location))
        final_pos = SeqFeature.OneOfPosition(position_choices)
    else:
        raise ValueError("Unexpected LocationParser object %r" %
                         position)

    if self._use_fuzziness:
        return final_pos
    # The coordinate is held in the 'position' attribute, the same one
    # _get_location reads and writes; the former 'final_pos.location'
    # named a different attribute.
    return SeqFeature.ExactPosition(final_pos.position)
907
def _add_qualifier(self):
    """Add a qualifier to the current feature without loss of info.

    Qualifier values are kept in a list per key, so when the same key
    occurs several times every value is retained in order of arrival.
    The pieces collected for the value are joined without a separator
    and, when a feature cleaner is set, passed through its clean_value.
    Nothing happens when no qualifier key is pending.
    """
    if not self._cur_qualifier_key:
        return

    key = self._cur_qualifier_key
    value = "".join(self._cur_qualifier_value)
    if self._feature_cleaner is not None:
        value = self._feature_cleaner.clean_value(key, value)

    self._cur_feature.qualifiers.setdefault(key, []).append(value)
928
930 """When we get a qualifier key, use it as a dictionary key.
931
932 We receive a list of keys, since you can have valueless keys such as
933 /pseudo which would be passed in with the next key (since no other
934 tags separate them in the file)
935 """
936 for content in content_list:
937
938 self._add_qualifier()
939
940
941 qual_key = content.replace('/', '')
942 qual_key = qual_key.replace('=', '')
943 qual_key = qual_key.strip()
944
945 self._cur_qualifier_key = qual_key
946 self._cur_qualifier_value = []
947
949
950 qual_value = content.replace('"', '')
951
952 self._cur_qualifier_value.append(qual_value)
953
955 """Deal with CONTIG information.
956
957 Most CONTIG descriptions use a join of other externally referenced
958 sequences. Currently this code tries to use the location parser,
959 and represent this as a SeqFeature with sub-features.
960 """
961
962
963 self._add_feature()
964
965 self._cur_feature = SeqFeature.SeqFeature()
966 self._cur_feature.type = "contig"
967
968
969 self.location(content)
970
971
972 self.data.annotations["contig"] = self._cur_feature
973 self._cur_feature = None
974
977
980
983
985 """Add up sequence information as we get it.
986
987 To try and make things speedier, this puts all of the strings
988 into a list of strings, and then uses string.join later to put
989 them together. Supposedly, this is a big time savings
990 """
991 new_seq = content.replace(' ', '')
992 new_seq = new_seq.upper()
993
994 self._seq_data.append(new_seq)
995
1060
1062 """Create a GenBank Record object from scanner generated information.
1063 """
1073
1074 - def wgs(self, content):
1076
1079
1080 - def locus(self, content):
1082
1083 - def size(self, content):
1085
1088
1091
1092 - def date(self, content):
1094
1097
1102
1103 - def nid(self, content):
1105
1106 - def pid(self, content):
1108
1111
1114
1115 - def gi(self, content):
1117
1120
1123
1126
1129
1132
1135
1138
1140 """Grab the reference number and signal the start of a new reference.
1141 """
1142
1143 if self._cur_reference is not None:
1144 self.data.references.append(self._cur_reference)
1145
1146 self._cur_reference = Record.Reference()
1147 self._cur_reference.number = content
1148
1150 self._cur_reference.bases = content
1151
1153 self._cur_reference.authors = content
1154
1156 self._cur_reference.consrtm = content
1157
def title(self, content):
    """Store content as the title of the current reference."""
    self._cur_reference.title = content
1160
1162 self._cur_reference.journal = content
1163
1166
1169
1171 self._cur_reference.remark = content
1172
1175
1179
1182
1184 """Get ready for the feature table when we reach the FEATURE line.
1185 """
1186 self.start_feature_table()
1187
def start_feature_table(self):
    """Signal the start of the feature table.

    A reference that is still being built is appended to the record's
    list of references.
    """
    if self._cur_reference is not None:
        self.data.references.append(self._cur_reference)
1194
1196 """Grab the key of the feature and signal the start of a new feature.
1197 """
1198
1199 self._add_feature()
1200
1201 self._cur_feature = Record.Feature()
1202 self._cur_feature.key = content
1203
def _add_feature(self):
    """Utility function to add a feature to the Record.

    This does all of the appropriate checking to make sure we haven't
    left any info behind, and that we are only adding info if it
    exists.
    """
    if self._cur_feature is None:
        return

    # Attach a qualifier that is still pending before storing the feature.
    if self._cur_qualifier is not None:
        self._cur_feature.qualifiers.append(self._cur_qualifier)

    self._cur_qualifier = None
    self.data.features.append(self._cur_feature)
1219
1222
1224 """Deal with qualifier names
1225
1226 We receive a list of keys, since you can have valueless keys such as
1227 /pseudo which would be passed in with the next key (since no other
1228 tags separate them in the file)
1229 """
1230 for content in content_list:
1231
1232 if content.find("/") != 0:
1233 content = "/%s" % content
1234
1235 if self._cur_qualifier is not None:
1236 self._cur_feature.qualifiers.append(self._cur_qualifier)
1237
1238 self._cur_qualifier = Record.Qualifier()
1239 self._cur_qualifier.key = content
1240
1252
1254 self.data.base_counts = content
1255
1257 self.data.origin = content
1258
1260 """Signal that we have contig information to add to the record.
1261 """
1262 self.data.contig = self._clean_location(content)
1263
1265 """Add sequence information to a list of sequence strings.
1266
1267 This removes spaces in the data and uppercases the sequence, and
1268 then adds it to a list of sequences. Later on we'll join this
1269 list together to make the final sequence. This is faster than
1270 adding on the new string every time.
1271 """
1272 new_seq = content.replace(' ', '')
1273 self._seq_data.append(new_seq.upper())
1274
1276 """Signal the end of the record and do any necessary clean-up.
1277 """
1278
1279
1280 self.data.sequence = "".join(self._seq_data)
1281
1282 self._add_feature()
1283
1284
1286 """Access GenBank using a read-only dictionary interface (DEPRECATED).
1287
1288 This object is deprecated and will be removed in a future release of
1289 Biopython. Please use Bio.Entrez instead as described in the tutorial.
1290 """
1291 VALID_DATABASES = ['nucleotide', 'protein', 'genome']
1292 VALID_FORMATS = ['genbank', 'fasta']
def __init__(self, database, format, parser=None):
    """Initialize an NCBI dictionary to retrieve sequences.

    Create a new Dictionary to access GenBank.  Valid values for
    database are those in VALID_DATABASES ('nucleotide', 'protein' and
    'genome').
    Valid values for format are 'genbank' (for nucleotide genbank and
    protein genpept) and 'fasta'; 'genbank' is stored as 'gb'.
    parser is an optional parser object
    to change the results into another form.  If unspecified, then
    the raw contents of the file will be returned.

    Raises ValueError for an unknown database or format.
    """
    import warnings
    # stacklevel=2 attributes the warning to the code creating the
    # dictionary rather than to this module.
    warnings.warn("Bio.GenBank.NCBIDictionary has been deprecated, and will be"
                  " removed in a future release of Biopython. Please use"
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning, stacklevel=2)

    self.parser = parser
    if database not in self.__class__.VALID_DATABASES:
        raise ValueError("Invalid database %s, should be one of %s" %
                         (database, self.__class__.VALID_DATABASES))
    if format not in self.__class__.VALID_FORMATS:
        raise ValueError("Invalid format %s, should be one of %s" %
                         (format, self.__class__.VALID_FORMATS))

    if format == "genbank":
        format = "gb"
    self.db = database
    self.format = format
1323
1325 raise NotImplementedError("GenBank contains lots of entries")
1327 raise NotImplementedError("This is a read-only dictionary")
1329 raise NotImplementedError("This is a read-only dictionary")
1331 raise NotImplementedError("This is a read-only dictionary")
1333 raise NotImplementedError("You don't need to do this...")
1335 raise NotImplementedError("You don't really want to do this...")
1337 raise NotImplementedError("You don't really want to do this...")
1339 raise NotImplementedError("You don't really want to do this...")
1340
def has_key(self, id):
    """S.has_key(id) -> bool

    True when looking up self[id] succeeds, False when it raises
    KeyError.
    """
    try:
        self[id]
    except KeyError:
        return False
    return True
1348
def get(self, id, failobj=None):
    """Return self[id], or failobj when the lookup raises KeyError."""
    try:
        return self[id]
    except KeyError:
        return failobj
1354
def __getitem__(self, id):
    """Return the GenBank entry specified by the GenBank ID.

    The entry is fetched with Entrez.efetch using this dictionary's
    database and format.  With a parser set, the handle is passed to
    parser.parse() and its result is returned; otherwise the raw text
    read from the handle is returned.

    NOTE(review): nothing in this method raises KeyError or converts
    errors into one, although has_key() and get() only catch KeyError --
    confirm what Entrez.efetch raises for an unknown ID.
    """
    handle = Entrez.efetch(db=self.db, id=id, rettype=self.format)

    if self.parser is not None:
        return self.parser.parse(handle)
    return handle.read()
1365
def search_for(search, database='nucleotide',
               reldate=None, mindate=None, maxdate=None,
               start_id=0, max_ids=50000000):
    """Do an online search at the NCBI, returns a list of IDs (DEPRECATED).

    This function is deprecated and will be removed in a future release of
    Biopython.  Please use Bio.Entrez instead as described in the tutorial.

    Search GenBank and return a list of the GenBank identifiers (gi's)
    that match the criteria.  search is the search string used to
    search the database.  Valid values for database are
    'nucleotide', 'protein', 'popset' and 'genome'.  reldate is
    the number of dates prior to the current date to restrict the
    search.  mindate and maxdate are the dates to restrict the search,
    e.g. 2002/12/20.  start_id is the number to begin retrieval on.
    max_ids specifies the maximum number of id's to retrieve.

    Raises TypeError when mindate or maxdate is given but is not in
    YYYY, YYYY/MM or YYYY/MM/DD form; mindate is checked first.
    """
    import warnings
    # stacklevel=2 attributes the warning to the caller of search_for.
    warnings.warn("Bio.GenBank.search_for has been deprecated, and will be"
                  " removed in a future release of Biopython. Please use"
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning, stacklevel=2)

    # Validate the date strings before going online.
    import re
    _date_re_match = re.compile(r"\d{4}(/\d\d(/\d\d)?)?$").match
    for label, value in (("mindate", mindate), ("maxdate", maxdate)):
        if value is not None and _date_re_match(value) is None:
            raise TypeError(
                "%s is not in YYYY/MM/DD format (month and "
                "day are optional): %r" % (label, value))

    handle = Entrez.esearch(database, search, retmode="xml",
                            retstart=start_id, retmax=max_ids,
                            mindate=mindate, maxdate=maxdate,
                            reldate=reldate)
    return Entrez.read(handle)["IdList"]
1412
1414 """Download multiple NCBI GenBank records, returned as a handle (DEPRECATED).
1415
1416 This function is deprecated and will be removed in a future release of
1417 Biopython. Please use Bio.Entrez instead as described in the tutorial.
1418
1419 Download many records from GenBank. ids is a list of gis or
1420 accessions.
1421 """
1422 import warnings
1423 warnings.warn("Bio.GenBank.download_many has been deprecated, and will be"\
1424 " removed in a future release of Biopython. Please use"\
1425 " Bio.Entrez instead which is described in the tutorial.",
1426 DeprecationWarning)
1427
1428 if database in ['nucleotide']:
1429 format = 'gb'
1430 elif database in ['protein']:
1431 format = 'gp'
1432 else:
1433 raise ValueError("Unexpected database: %s" % database)
1434
1435 result_handle = Entrez.efetch(database,
1436 id=",".join(ids),
1437 retmode = "text",
1438 rettype = format)
1439 return cStringIO.StringIO(result_handle.read())
1440