Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6   
   7  """Code to work with GenBank formatted files. 
   8   
   9  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  10  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  11  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  12   
  13  Also, rather than using Bio.GenBank to search or download files from the NCBI, 
  14  you are now encouraged to use Bio.Entrez instead (again, see the Biopython 
  15  tutorial for details). 
  16   
  17  Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser 
  18  which turns a GenBank file into GenBank-specific Record objects.  This is a 
   19  much closer representation to the raw file contents than the SeqRecord 
  20  alternative from the FeatureParser (used in Bio.SeqIO). 
  21   
  22  Classes: 
  23  Iterator              Iterate through a file of GenBank entries 
  24  ErrorFeatureParser    Catch errors caused during parsing. 
  25  FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  26  RecordParser          Parse GenBank data into a Record object. 
  27  NCBIDictionary        Access GenBank using a dictionary interface (DEPRECATED). 
  28   
  29  _BaseGenBankConsumer  A base class for GenBank consumer that implements 
  30                        some helpful functions that are in common between 
  31                        consumers. 
  32  _FeatureConsumer      Create SeqFeature objects from info generated by 
  33                        the Scanner 
  34  _RecordConsumer       Create a GenBank record object from Scanner info. 
  35  _PrintingConsumer     A debugging consumer. 
  36   
  37  ParserFailureError    Exception indicating a failure in the parser (ie. 
  38                        scanner or consumer) 
   39  LocationParserError   Exception indicating a problem with the spark based 
  40                        location parser. 
  41   
  42  Functions: 
  43  search_for            Do a query against GenBank (DEPRECATED). 
  44  download_many         Download many GenBank records (DEPRECATED). 
  45   
  46  17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  47  These are GenBank files that summarize the content of a project, and provide lists of 
  48  scaffold and contig files in the project. These will be in annotations['wgs'] and 
  49  annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  50  http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  51   
  52  http://is.gd/nNgk 
  53  for more details of this format, and an example. 
  54  Added by Ying Huang & Iddo Friedberg 
  55  """ 
  56  import cStringIO 
  57   
  58  # other Biopython stuff 
  59  from Bio import SeqFeature 
  60  from Bio.ParserSupport import AbstractConsumer 
  61  from Bio import Entrez 
  62   
  63  # other Bio.GenBank stuff 
  64  import LocationParser 
  65  from utils import FeatureValueCleaner 
  66  from Scanner import GenBankScanner 
  67   
#Constants used to parse GenBank header lines.
#GENBANK_INDENT is the width of the keyword field at the start of each
#header line; GENBANK_SPACER is the matching run of spaces (used to
#recognise continuation lines).
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

#Constants for parsing GenBank feature table lines.
#Feature keys are indented FEATURE_KEY_INDENT columns; qualifiers and
#locations are indented FEATURE_QUALIFIER_INDENT columns.
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
  77   
class Iterator:
    """Step through a handle of GenBank entries one record at a time.

    Each call to next() hands back either the raw text of one entry, or
    (if a parser was supplied) whatever that parser produces from it.
    """
    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # No parser - return the raw record text, reading up to and
        # including the terminating "//" line.
        lines = []
        while True:
            line = self.handle.readline()
            if not line:
                # Premature end of file (or simply no more records)
                return None
            lines.append(line)
            if line.rstrip() == "//":
                return "".join(lines)

    def __iter__(self):
        # Repeatedly call next() until it signals exhaustion with None.
        return iter(self.next, None)
112
class ParserFailureError(Exception):
    """Raised when the parser (ie. scanner or consumer) fails."""
    pass
117
class LocationParserError(Exception):
    """Raised when a location from a GenBank file cannot be parsed."""
    pass
122
class FeatureParser:
    """Parse GenBank files into Seq + Feature objects.
    """
    # Sentinel so we can create a fresh default cleaner per instance.
    # The original signature used feature_cleaner=FeatureValueCleaner(),
    # a mutable default evaluated once at class definition time and
    # silently shared by every FeatureParser instance.
    _DEFAULT_CLEANER = object()

    def __init__(self, debug_level = 0, use_fuzziness = 1,
                 feature_cleaner = _DEFAULT_CLEANER):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        o use_fuzziness - Specify whether or not to use fuzzy representations.
        The default is 1 (use fuzziness).
        o feature_cleaner - A class which will be used to clean out the
        values of features. This class must implement the function
        clean_value. GenBank.utils has a "standard" cleaner class, which
        is used by default (a new instance per parser). Pass None to
        disable cleaning, as before.
        """
        if feature_cleaner is FeatureParser._DEFAULT_CLEANER:
            feature_cleaner = FeatureValueCleaner()
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle into a SeqRecord.
        """
        self._consumer = _FeatureConsumer(self.use_fuzziness,
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
153
class RecordParser:
    """Parse GenBank files into GenBank-specific Record objects."""

    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we
        have no debugging info (the fastest way to do things), but if you
        want you can set this as high as two and see exactly where a
        parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank Record object."""
        self._consumer = _RecordConsumer()
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
174
class _BaseGenBankConsumer(AbstractConsumer):
    """Abstract GenBank consumer providing useful general functions.

    This just helps to eliminate some duplication in things that most
    GenBank consumers want to do.
    """
    # Special keys in GenBank records whose values (e.g. translations,
    # which are protein sequences) must have spaces and newlines removed.
    remove_space_keys = ["translation"]

    def __init__(self):
        pass

    def _split_keywords(self, keyword_string):
        """Split a semi-colon separated KEYWORDS string into a clean list."""
        # A lone period (or nothing at all) means no keywords; a trailing
        # period is dropped before splitting.
        if keyword_string == "" or keyword_string == ".":
            keywords = ""
        elif keyword_string.endswith('.'):
            keywords = keyword_string[:-1]
        else:
            keywords = keyword_string
        return [keyword.strip() for keyword in keywords.split(';')]

    def _split_accessions(self, accession_string):
        """Split a string of accession numbers into a list."""
        # Newlines become spaces; EMBL style accessions are ';' separated.
        accession = accession_string.replace("\n", " ").replace(";", " ")
        return [acc.strip() for acc in accession.split() if acc.strip()]

    def _split_taxonomy(self, taxonomy_string):
        """Split a string with taxonomy info into a list."""
        if not taxonomy_string or taxonomy_string == ".":
            # Missing data, no taxonomy
            return []

        if taxonomy_string.endswith('.'):
            tax_info = taxonomy_string[:-1]
        else:
            tax_info = taxonomy_string

        # Entries are ';' separated but may be wrapped over several lines.
        flattened = []
        for tax_item in tax_info.split(';'):
            flattened.extend(tax_item.split("\n"))
        return [tax.strip() for tax in flattened if tax != '']

    def _clean_location(self, location_string):
        """Clean whitespace out of a location string.

        The location parser isn't a fan of whitespace, so we clean it out
        before feeding it into the parser. Splitting on whitespace and
        rejoining avoids importing string (see Bug 2684).
        """
        return ''.join(location_string.split())

    def _remove_newlines(self, text):
        """Return text with any newline or carriage-return removed."""
        return text.replace("\n", "").replace("\r", "")

    def _normalize_spaces(self, text):
        """Replace multiple spaces in the passed text with single spaces."""
        return ' '.join(part for part in text.split(" ") if part)

    def _remove_spaces(self, text):
        """Remove all spaces from the passed text."""
        return text.replace(" ", "")

    def _convert_to_python_numbers(self, start, end):
        """Convert a start and end range to python notation.

        In GenBank, starts and ends are one-based and [i, j] includes
        both endpoints; in python zero-based [i, j) excludes j. So the
        start is shifted back by one and the end is left untouched.
        """
        return start - 1, end
286
287 -class _FeatureConsumer(_BaseGenBankConsumer):
288 """Create a SeqRecord object with Features to return. 289 290 Attributes: 291 o use_fuzziness - specify whether or not to parse with fuzziness in 292 feature locations. 293 o feature_cleaner - a class that will be used to provide specialized 294 cleaning-up of feature values. 295 """
296 - def __init__(self, use_fuzziness, feature_cleaner = None):
297 from Bio.SeqRecord import SeqRecord 298 _BaseGenBankConsumer.__init__(self) 299 self.data = SeqRecord(None, id = None) 300 self.data.id = None 301 self.data.description = "" 302 303 self._use_fuzziness = use_fuzziness 304 self._feature_cleaner = feature_cleaner 305 306 self._seq_type = '' 307 self._seq_data = [] 308 self._current_ref = None 309 self._cur_feature = None 310 self._cur_qualifier_key = None 311 self._cur_qualifier_value = None 312 self._expected_size = None
313
314 - def locus(self, locus_name):
315 """Set the locus name is set as the name of the Sequence. 316 """ 317 self.data.name = locus_name
318
319 - def size(self, content):
320 """Record the sequence length.""" 321 self._expected_size = int(content)
322
323 - def residue_type(self, type):
324 """Record the sequence type so we can choose an appropriate alphabet. 325 """ 326 self._seq_type = type
327
328 - def data_file_division(self, division):
329 self.data.annotations['data_file_division'] = division
330
331 - def date(self, submit_date):
332 self.data.annotations['date'] = submit_date
333
334 - def definition(self, definition):
335 """Set the definition as the description of the sequence. 336 """ 337 if self.data.description : 338 #Append to any existing description 339 #e.g. EMBL files with two DE lines. 340 self.data.description += " " + definition 341 else : 342 self.data.description = definition
343
344 - def accession(self, acc_num):
345 """Set the accession number as the id of the sequence. 346 347 If we have multiple accession numbers, the first one passed is 348 used. 349 """ 350 new_acc_nums = self._split_accessions(acc_num) 351 352 #Also record them ALL in the annotations 353 try : 354 #On the off chance there was more than one accession line: 355 for acc in new_acc_nums : 356 #Prevent repeat entries 357 if acc not in self.data.annotations['accessions'] : 358 self.data.annotations['accessions'].append(acc) 359 except KeyError : 360 self.data.annotations['accessions'] = new_acc_nums 361 362 # if we haven't set the id information yet, add the first acc num 363 if self.data.id is None: 364 if len(new_acc_nums) > 0: 365 #self.data.id = new_acc_nums[0] 366 #Use the FIRST accession as the ID, not the first on this line! 367 self.data.id = self.data.annotations['accessions'][0]
368
369 - def wgs(self, content):
370 self.data.annotations['wgs'] = content.split('-')
371
372 - def add_wgs_scafld(self, content):
373 self.data.annotations.setdefault('wgs_scafld',[]).append(content.split('-'))
374
375 - def nid(self, content):
376 self.data.annotations['nid'] = content
377
378 - def pid(self, content):
379 self.data.annotations['pid'] = content
380
381 - def version(self, version_id):
382 #Want to use the versioned accession as the record.id 383 #This comes from the VERSION line in GenBank files, or the 384 #obsolete SV line in EMBL. For the new EMBL files we need 385 #both the version suffix from the ID line and the accession 386 #from the AC line. 387 if version_id.count(".")==1 and version_id.split(".")[1].isdigit() : 388 self.accession(version_id.split(".")[0]) 389 self.version_suffix(version_id.split(".")[1]) 390 else : 391 #For backwards compatibility... 392 self.data.id = version_id
393
394 - def project(self, content):
395 """Handle the information from the PROJECT line as a list of projects. 396 397 e.g. 398 PROJECT GenomeProject:28471 399 400 or: 401 PROJECT GenomeProject:13543 GenomeProject:99999 402 403 This is stored as dbxrefs in the SeqRecord to be consistent with the 404 projected switch of this line to DBLINK in future GenBank versions. 405 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 406 "Project:28471" as part of this transition. 407 """ 408 content = content.replace("GenomeProject:", "Project:") 409 self.data.dbxrefs.extend([p for p in content.split() if p])
410 436
437 - def version_suffix(self, version):
438 """Set the version to overwrite the id. 439 440 Since the verison provides the same information as the accession 441 number, plus some extra info, we set this as the id if we have 442 a version. 443 """ 444 #e.g. GenBank line: 445 #VERSION U49845.1 GI:1293613 446 #or the obsolete EMBL line: 447 #SV U49845.1 448 #Scanner calls consumer.version("U49845.1") 449 #which then calls consumer.version_suffix(1) 450 # 451 #e.g. EMBL new line: 452 #ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 453 #Scanner calls consumer.version_suffix(1) 454 assert version.isdigit() 455 self.data.annotations['sequence_version'] = int(version)
456
457 - def db_source(self, content):
458 self.data.annotations['db_source'] = content.rstrip()
459
460 - def gi(self, content):
461 self.data.annotations['gi'] = content
462
463 - def keywords(self, content):
464 self.data.annotations['keywords'] = self._split_keywords(content)
465
466 - def segment(self, content):
467 self.data.annotations['segment'] = content
468
469 - def source(self, content):
470 #Note that some software (e.g. VectorNTI) may produce an empty 471 #source (rather than using a dot/period as might be expected). 472 if content == "" : 473 source_info = "" 474 elif content[-1] == '.': 475 source_info = content[:-1] 476 else: 477 source_info = content 478 self.data.annotations['source'] = source_info
479
480 - def organism(self, content):
481 self.data.annotations['organism'] = content
482
483 - def taxonomy(self, content):
484 """Records (another line of) the taxonomy lineage. 485 """ 486 lineage = self._split_taxonomy(content) 487 try : 488 self.data.annotations['taxonomy'].extend(lineage) 489 except KeyError : 490 self.data.annotations['taxonomy'] = lineage
491
492 - def reference_num(self, content):
493 """Signal the beginning of a new reference object. 494 """ 495 # if we have a current reference that hasn't been added to 496 # the list of references, add it. 497 if self._current_ref is not None: 498 self.data.annotations['references'].append(self._current_ref) 499 else: 500 self.data.annotations['references'] = [] 501 502 self._current_ref = SeqFeature.Reference()
503
504 - def reference_bases(self, content):
505 """Attempt to determine the sequence region the reference entails. 506 507 Possible types of information we may have to deal with: 508 509 (bases 1 to 86436) 510 (sites) 511 (bases 1 to 105654; 110423 to 111122) 512 1 (residues 1 to 182) 513 """ 514 # first remove the parentheses or other junk 515 ref_base_info = content[1:-1] 516 517 all_locations = [] 518 # parse if we've got 'bases' and 'to' 519 if ref_base_info.find('bases') != -1 and \ 520 ref_base_info.find('to') != -1: 521 # get rid of the beginning 'bases' 522 ref_base_info = ref_base_info[5:] 523 locations = self._split_reference_locations(ref_base_info) 524 all_locations.extend(locations) 525 elif (ref_base_info.find("residues") >= 0 and 526 ref_base_info.find("to") >= 0): 527 residues_start = ref_base_info.find("residues") 528 # get only the information after "residues" 529 ref_base_info = ref_base_info[(residues_start + len("residues ")):] 530 locations = self._split_reference_locations(ref_base_info) 531 all_locations.extend(locations) 532 533 # make sure if we are not finding information then we have 534 # the string 'sites' or the string 'bases' 535 elif (ref_base_info == 'sites' or 536 ref_base_info.strip() == 'bases'): 537 pass 538 # otherwise raise an error 539 else: 540 raise ValueError("Could not parse base info %s in record %s" % 541 (ref_base_info, self.data.id)) 542 543 self._current_ref.location = all_locations
544
545 - def _split_reference_locations(self, location_string):
546 """Get reference locations out of a string of reference information 547 548 The passed string should be of the form: 549 550 1 to 20; 20 to 100 551 552 This splits the information out and returns a list of location objects 553 based on the reference locations. 554 """ 555 # split possibly multiple locations using the ';' 556 all_base_info = location_string.split(';') 557 558 new_locations = [] 559 for base_info in all_base_info: 560 start, end = base_info.split('to') 561 new_start, new_end = \ 562 self._convert_to_python_numbers(int(start.strip()), 563 int(end.strip())) 564 this_location = SeqFeature.FeatureLocation(new_start, new_end) 565 new_locations.append(this_location) 566 return new_locations
567
568 - def authors(self, content):
569 if self._current_ref.authors : 570 self._current_ref.authors += ' ' + content 571 else : 572 self._current_ref.authors = content
573
574 - def consrtm(self, content):
575 if self._current_ref.consrtm : 576 self._current_ref.consrtm += ' ' + content 577 else : 578 self._current_ref.consrtm = content
579
580 - def title(self, content):
581 if self._current_ref.title : 582 self._current_ref.title += ' ' + content 583 else : 584 self._current_ref.title = content
585
586 - def journal(self, content):
587 if self._current_ref.journal : 588 self._current_ref.journal += ' ' + content 589 else : 590 self._current_ref.journal = content
591
592 - def medline_id(self, content):
593 self._current_ref.medline_id = content
594
595 - def pubmed_id(self, content):
596 self._current_ref.pubmed_id = content
597
598 - def remark(self, content):
599 """Deal with a reference comment.""" 600 if self._current_ref.comment : 601 self._current_ref.comment += ' ' + content 602 else : 603 self._current_ref.comment = content
604
605 - def comment(self, content):
606 try : 607 self.data.annotations['comment'] += "\n" + "\n".join(content) 608 except KeyError : 609 self.data.annotations['comment'] = "\n".join(content)
610
611 - def features_line(self, content):
612 """Get ready for the feature table when we reach the FEATURE line. 613 """ 614 self.start_feature_table()
615
616 - def start_feature_table(self):
617 """Indicate we've got to the start of the feature table. 618 """ 619 # make sure we've added on our last reference object 620 if self._current_ref is not None: 621 self.data.annotations['references'].append(self._current_ref) 622 self._current_ref = None
623
624 - def _add_feature(self):
625 """Utility function to add a feature to the SeqRecord. 626 627 This does all of the appropriate checking to make sure we haven't 628 left any info behind, and that we are only adding info if it 629 exists. 630 """ 631 if self._cur_feature: 632 # if we have a left over qualifier, add it to the qualifiers 633 # on the current feature 634 self._add_qualifier() 635 636 self._cur_qualifier_key = '' 637 self._cur_qualifier_value = '' 638 self.data.features.append(self._cur_feature)
639
640 - def feature_key(self, content):
641 # if we already have a feature, add it on 642 self._add_feature() 643 644 # start a new feature 645 self._cur_feature = SeqFeature.SeqFeature() 646 self._cur_feature.type = content 647 648 # assume positive strand to start with if we have DNA or cDNA 649 # (labelled as mRNA). The complement in the location will 650 # change this later if something is on the reverse strand 651 if self._seq_type.find("DNA") >= 0 or self._seq_type.find("mRNA") >= 0: 652 self._cur_feature.strand = 1
653
654 - def location(self, content):
655 """Parse out location information from the location string. 656 657 This uses a comprehensive but slow spark based parser to do the 658 parsing, and then translates the results of the parse into appropriate 659 Location objects. 660 """ 661 # --- first preprocess the location for the spark parser 662 663 # we need to clean up newlines and other whitespace inside 664 # the location before feeding it to the parser. 665 # locations should have no whitespace whatsoever based on the 666 # grammer 667 location_line = self._clean_location(content) 668 669 # Older records have junk like replace(266,"c") in the 670 # location line. Newer records just replace this with 671 # the number 266 and have the information in a more reasonable 672 # place. So we'll just grab out the number and feed this to the 673 # parser. We shouldn't really be losing any info this way. 674 if location_line.find('replace') != -1: 675 comma_pos = location_line.find(',') 676 location_line = location_line[8:comma_pos] 677 678 # feed everything into the scanner and parser 679 try: 680 parse_info = \ 681 LocationParser.parse(LocationParser.scan(location_line)) 682 # spark raises SystemExit errors when parsing fails 683 except SystemExit: 684 raise LocationParserError(location_line) 685 686 # print "parse_info:", repr(parse_info) 687 688 # add the parser information the current feature 689 self._set_location_info(parse_info, self._cur_feature)
690
691 - def _set_function(self, function, cur_feature):
692 """Set the location information based on a function. 693 694 This handles all of the location functions like 'join', 'complement' 695 and 'order'. 696 697 Arguments: 698 o function - A LocationParser.Function object specifying the 699 function we are acting on. 700 o cur_feature - The feature to add information to. 701 """ 702 assert isinstance(function, LocationParser.Function), \ 703 "Expected a Function object, got %s" % function 704 705 if function.name == "complement": 706 # mark the current feature as being on the opposite strand 707 cur_feature.strand = -1 708 # recursively deal with whatever is left inside the complement 709 for inner_info in function.args: 710 self._set_location_info(inner_info, cur_feature) 711 # deal with functions that have multipe internal segments that 712 # are connected somehow. 713 # join and order are current documented functions. 714 # one-of is something I ran across in old files. Treating it 715 # as a sub sequence feature seems appropriate to me. 716 # bond is some piece of junk I found in RefSeq files. I have 717 # no idea how to interpret it, so I jam it in here 718 elif (function.name == "join" or function.name == "order" or 719 function.name == "one-of" or function.name == "bond"): 720 self._set_ordering_info(function, cur_feature) 721 elif (function.name == "gap"): 722 assert len(function.args) == 1, \ 723 "Unexpected number of arguments in gap %s" % function.args 724 # make the cur information location a gap object 725 position = self._get_position(function.args[0].local_location) 726 cur_feature.location = SeqFeature.PositionGap(position) 727 else: 728 raise ValueError("Unexpected function name: %s" % function.name)
729
730 - def _set_ordering_info(self, function, cur_feature):
731 """Parse a join or order and all of the information in it. 732 733 This deals with functions that order a bunch of locations, 734 specifically 'join' and 'order'. The inner locations are 735 added as subfeatures of the top level feature 736 """ 737 # for each inner element, create a sub SeqFeature within the 738 # current feature, then get the information for this feature 739 cur_feature.location_operator = function.name 740 for inner_element in function.args: 741 new_sub_feature = SeqFeature.SeqFeature() 742 # inherit the type from the parent 743 new_sub_feature.type = cur_feature.type 744 # add the join or order info to the location_operator 745 new_sub_feature.location_operator = function.name 746 # inherit references and strand from the parent feature 747 new_sub_feature.ref = cur_feature.ref 748 new_sub_feature.ref_db = cur_feature.ref_db 749 new_sub_feature.strand = cur_feature.strand 750 751 # set the information for the inner element 752 self._set_location_info(inner_element, new_sub_feature) 753 754 # now add the feature to the sub_features 755 cur_feature.sub_features.append(new_sub_feature) 756 757 # set the location of the top -- this should be a combination of 758 # the start position of the first sub_feature and the end position 759 # of the last sub_feature 760 761 # these positions are already converted to python coordinates 762 # (when the sub_features were added) so they don't need to 763 # be converted again 764 feature_start = cur_feature.sub_features[0].location.start 765 feature_end = cur_feature.sub_features[-1].location.end 766 cur_feature.location = SeqFeature.FeatureLocation(feature_start, 767 feature_end)
768
769 - def _set_location_info(self, parse_info, cur_feature):
770 """Set the location information for a feature from the parse info. 771 772 Arguments: 773 o parse_info - The classes generated by the LocationParser. 774 o cur_feature - The feature to add the information to. 775 """ 776 # base case -- we are out of information 777 if parse_info is None: 778 return 779 # parse a location -- this is another base_case -- we assume 780 # we have no information after a single location 781 elif isinstance(parse_info, LocationParser.AbsoluteLocation): 782 self._set_location(parse_info, cur_feature) 783 return 784 # parse any of the functions (join, complement, etc) 785 elif isinstance(parse_info, LocationParser.Function): 786 self._set_function(parse_info, cur_feature) 787 # otherwise we are stuck and should raise an error 788 else: 789 raise ValueError("Could not parse location info: %s" 790 % parse_info)
791
792 - def _set_location(self, location, cur_feature):
793 """Set the location information for a feature. 794 795 Arguments: 796 o location - An AbsoluteLocation object specifying the info 797 about the location. 798 o cur_feature - The feature to add the information to. 799 """ 800 # check to see if we have a cross reference to another accession 801 # ie. U05344.1:514..741 802 if location.path is not None: 803 cur_feature.ref = location.path.accession 804 cur_feature.ref_db = location.path.database 805 # now get the actual location information 806 cur_feature.location = self._get_location(location.local_location)
807
    def _get_location(self, range_info):
        """Return a (possibly fuzzy) location from a Range object.

        Arguments:
        o range_info - A location range (ie. something like 67..100). This
        may also be a single position (ie 27).

        This returns a FeatureLocation object.
        If parser.use_fuzziness is set at one, the positions for the
        end points will possibly be fuzzy.
        """
        if isinstance(range_info, LocationParser.Between) \
        and range_info.low.val+1 == range_info.high.val:
            #A between location like "67^68" (one based counting) is a
            #special case (note it has zero length). In python slice
            #notation this is 67:67, a zero length slice.  See Bug 2622
            pos = self._get_position(range_info.low)
            return SeqFeature.FeatureLocation(pos, pos)
            #NOTE - We can imagine between locations like "2^4", but this
            #is just "3".  Similarly, "2^5" is just "3..4"
        # check if we just have a single base
        elif not(isinstance(range_info, LocationParser.Range)):
            pos = self._get_position(range_info)
            # move the single position back one to be consistent with how
            # python indexes numbers (starting at 0)
            pos.position = pos.position - 1
            # a single base is represented as a zero-width-style location
            # with identical start and end position objects
            return SeqFeature.FeatureLocation(pos, pos)
        # otherwise we need to get both sides of the range
        else:
            # get *Position objects for the start and end
            start_pos = self._get_position(range_info.low)
            end_pos = self._get_position(range_info.high)

            # shift the start back by one (biological -> python coords);
            # the end stays the same because python ranges exclude it
            start_pos.position, end_pos.position = \
                self._convert_to_python_numbers(start_pos.position,
                                                end_pos.position)

            return SeqFeature.FeatureLocation(start_pos, end_pos)
846
    def _get_position(self, position):
        """Return a (possibly fuzzy) position for a single coordinate.

        Arguments:
        o position - This is a LocationParser.* object that specifies
        a single coordinate. We will examine the object to determine
        the fuzziness of the position.

        This is used with _get_location to parse out a location of any
        end_point of arbitrary fuzziness.

        Returns a SeqFeature position object: the fuzzy class matching
        the input when use_fuzziness is on, otherwise an ExactPosition.
        Raises ValueError for LocationParser objects it does not handle.
        """
        # case 1 -- just a normal number
        if (isinstance(position, LocationParser.Integer)):
            final_pos = SeqFeature.ExactPosition(position.val)
        # case 2 -- we've got a > sign
        elif isinstance(position, LocationParser.LowBound):
            final_pos = SeqFeature.AfterPosition(position.base.val)
        # case 3 -- we've got a < sign
        elif isinstance(position, LocationParser.HighBound):
            final_pos = SeqFeature.BeforePosition(position.base.val)
        # case 4 -- we've got 100^101
        # Is the extension is zero in this example?
        elif isinstance(position, LocationParser.Between):
            #NOTE - We don't *expect* this code to get called!
            #We only expect between locations like 3^4 (consecutive)
            #which are handled in _get_location. We don't expect
            #non consecutive variants like "2^5" as this is just "3..4".
            #Similarly there is no reason to expect composite locations
            #like "(3^4)..6" which should just be "4..6".
            final_pos = SeqFeature.BetweenPosition(position.low.val,
                                 position.high.val-position.low.val)
        # case 5 -- we've got (100.101)
        elif isinstance(position, LocationParser.TwoBound):
            final_pos = SeqFeature.WithinPosition(position.low.val,
                                 position.high.val-position.low.val)
        # case 6 -- we've got a one-of(100, 110) location
        elif isinstance(position, LocationParser.Function) and \
                position.name == "one-of":
            # first convert all of the arguments to positions
            position_choices = []
            for arg in position.args:
                # we only handle AbsoluteLocations with no path
                # right now. Not sure if other cases will pop up
                assert isinstance(arg, LocationParser.AbsoluteLocation), \
                    "Unhandled Location type %r" % arg
                assert arg.path is None, "Unhandled path in location"
                # NOTE: this rebinds the parameter 'position' -- harmless
                # here because the parameter is not referenced again
                # after this elif branch, but easy to trip over.
                position = self._get_position(arg.local_location)
                position_choices.append(position)
            final_pos = SeqFeature.OneOfPosition(position_choices)
        # if it is none of these cases we've got a problem!
        else:
            raise ValueError("Unexpected LocationParser object %r" %
                             position)

        # if we are using fuzziness return what we've got
        if self._use_fuzziness:
            return final_pos
        # otherwise return an ExactPosition equivalent
        # NOTE(review): positions elsewhere in this module are read via
        # a .position attribute; confirm these classes really expose
        # .location -- this branch may be a latent AttributeError when
        # use_fuzziness is disabled.
        else:
            return SeqFeature.ExactPosition(final_pos.location)
907
908 - def _add_qualifier(self):
909 """Add a qualifier to the current feature without loss of info. 910 911 If there are multiple qualifier keys with the same name we 912 would lose some info in the dictionary, so we append a unique 913 number to the end of the name in case of conflicts. 914 """ 915 # if we've got a key from before, add it to the dictionary of 916 # qualifiers 917 if self._cur_qualifier_key: 918 key = self._cur_qualifier_key 919 value = "".join(self._cur_qualifier_value) 920 if self._feature_cleaner is not None: 921 value = self._feature_cleaner.clean_value(key, value) 922 # if the qualifier name exists, append the value 923 if key in self._cur_feature.qualifiers: 924 self._cur_feature.qualifiers[key].append(value) 925 # otherwise start a new list of the key with its values 926 else: 927 self._cur_feature.qualifiers[key] = [value]
928
929 - def feature_qualifier_name(self, content_list):
930 """When we get a qualifier key, use it as a dictionary key. 931 932 We receive a list of keys, since you can have valueless keys such as 933 /pseudo which would be passed in with the next key (since no other 934 tags separate them in the file) 935 """ 936 for content in content_list: 937 # add a qualifier if we've got one 938 self._add_qualifier() 939 940 # remove the / and = from the qualifier if they're present 941 qual_key = content.replace('/', '') 942 qual_key = qual_key.replace('=', '') 943 qual_key = qual_key.strip() 944 945 self._cur_qualifier_key = qual_key 946 self._cur_qualifier_value = []
947
948 - def feature_qualifier_description(self, content):
949 # get rid of the quotes surrounding the qualifier if we've got 'em 950 qual_value = content.replace('"', '') 951 952 self._cur_qualifier_value.append(qual_value)
953
954 - def contig_location(self, content):
955 """Deal with CONTIG information. 956 957 Most CONTIG descriptions use a join of other externally referenced 958 sequences. Currently this code tries to use the location parser, 959 and represent this as a SeqFeature with sub-features. 960 """ 961 # add a last feature if is hasn't been added, 962 # so that we don't overwrite it 963 self._add_feature() 964 # make a feature to add the information to 965 self._cur_feature = SeqFeature.SeqFeature() 966 self._cur_feature.type = "contig" 967 # now set the location on the feature using the standard 968 # location handler 969 self.location(content) 970 # add the contig information to the annotations and get rid 971 # of the feature to prevent it from being added to the feature table 972 self.data.annotations["contig"] = self._cur_feature 973 self._cur_feature = None
974
    def origin_name(self, content):
        # the ORIGIN line name is deliberately not recorded by this consumer
        pass

    def base_count(self, content):
        # BASE COUNT information is deliberately not recorded by this consumer
        pass

    def base_number(self, content):
        # per-line sequence numbering is not needed -- sequence chunks
        # are simply concatenated in the order they arrive
        pass
983
984 - def sequence(self, content):
985 """Add up sequence information as we get it. 986 987 To try and make things speedier, this puts all of the strings 988 into a list of strings, and then uses string.join later to put 989 them together. Supposedly, this is a big time savings 990 """ 991 new_seq = content.replace(' ', '') 992 new_seq = new_seq.upper() 993 994 self._seq_data.append(new_seq)
995
996 - def record_end(self, content):
997 """Clean up when we've finished the record. 998 """ 999 from Bio import Alphabet 1000 from Bio.Alphabet import IUPAC 1001 from Bio.Seq import Seq, UnknownSeq 1002 1003 #Try and append the version number to the accession for the full id 1004 if self.data.id is None : 1005 assert 'accessions' not in self.data.annotations, \ 1006 self.data.annotations['accessions'] 1007 self.data.id = self.data.name #Good fall back? 1008 elif self.data.id.count('.') == 0 : 1009 try : 1010 self.data.id+='.%i' % self.data.annotations['sequence_version'] 1011 except KeyError : 1012 pass 1013 1014 # add the last feature in the table which hasn't been added yet 1015 self._add_feature() 1016 1017 # add the sequence information 1018 # first, determine the alphabet 1019 # we default to an generic alphabet if we don't have a 1020 # seq type or have strange sequence information. 1021 seq_alphabet = Alphabet.generic_alphabet 1022 1023 # now set the sequence 1024 sequence = "".join(self._seq_data) 1025 1026 if self._expected_size is not None \ 1027 and len(sequence) != 0 \ 1028 and self._expected_size != len(sequence) : 1029 raise ValueError("Expected sequence length %i, found %i." \ 1030 % (self._expected_size, len(sequence))) 1031 1032 if self._seq_type: 1033 # mRNA is really also DNA, since it is actually cDNA 1034 if self._seq_type.find('DNA') != -1 or \ 1035 self._seq_type.find('mRNA') != -1: 1036 seq_alphabet = IUPAC.ambiguous_dna 1037 # are there ever really RNA sequences in GenBank? 1038 elif self._seq_type.find('RNA') != -1: 1039 #Even for data which was from RNA, the sequence string 1040 #is usually given as DNA (T not U). Bug 2408 1041 if "T" in sequence and "U" not in sequence: 1042 seq_alphabet = IUPAC.ambiguous_dna 1043 else : 1044 seq_alphabet = IUPAC.ambiguous_rna 1045 elif self._seq_type.find('PROTEIN') != -1 : 1046 seq_alphabet = IUPAC.protein # or extended protein? 
1047 # work around ugly GenBank records which have circular or 1048 # linear but no indication of sequence type 1049 elif self._seq_type in ["circular", "linear"]: 1050 pass 1051 # we have a bug if we get here 1052 else: 1053 raise ValueError("Could not determine alphabet for seq_type %s" 1054 % self._seq_type) 1055 1056 if not sequence and self.__expected_size : 1057 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1058 else : 1059 self.data.seq = Seq(sequence, seq_alphabet)
1060
class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information.
    """
    def __init__(self):
        _BaseGenBankConsumer.__init__(self)
        # Python 2 implicit relative import of Bio.GenBank.Record.
        # Importing the submodule also binds 'Record' as an attribute of
        # the Bio.GenBank package (this module), which is presumably how
        # later methods such as reference_num and feature_key can refer
        # to the module-level name 'Record' -- TODO confirm.
        import Record
        self.data = Record.Record()

        # chunks of sequence text, joined into one string at record_end
        self._seq_data = []
        # objects currently being built up from successive scanner events
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None
1073
    def wgs(self, content):
        # split the WGS line content on '-' (a range such as X-Y
        # becomes a two-element list)
        self.data.wgs = content.split('-')

    def add_wgs_scafld(self, content):
        # WGS_SCAFLD lines can occur more than once, so accumulate them
        self.data.wgs_scafld.append(content.split('-'))

    def locus(self, content):
        # store the locus name verbatim
        self.data.locus = content

    def size(self, content):
        # store the declared sequence size (kept as a string)
        self.data.size = content

    def residue_type(self, content):
        # store the residue type text verbatim
        self.data.residue_type = content

    def data_file_division(self, content):
        # store the GenBank division code verbatim
        self.data.data_file_division = content

    def date(self, content):
        # store the date text verbatim
        self.data.date = content

    def definition(self, content):
        # store the DEFINITION text verbatim
        self.data.definition = content

    def accession(self, content):
        # ACCESSION may list several accessions; keep them in order
        # while dropping duplicates
        for acc in self._split_accessions(content) :
            if acc not in self.data.accession :
                self.data.accession.append(acc)

    def nid(self, content):
        # store the (legacy) NID value verbatim
        self.data.nid = content

    def pid(self, content):
        # store the (legacy) PID value verbatim
        self.data.pid = content

    def version(self, content):
        # store the VERSION value verbatim
        self.data.version = content

    def db_source(self, content):
        # DBSOURCE text; trailing whitespace is not significant
        self.data.db_source = content.rstrip()

    def gi(self, content):
        # store the GI number verbatim
        self.data.gi = content

    def keywords(self, content):
        # split the KEYWORDS text into a list via the base-class helper
        self.data.keywords = self._split_keywords(content)
1120
1121 - def project(self, content):
1122 self.data.projects.extend([p for p in content.split() if p])
1123 1126
    def segment(self, content):
        # store the SEGMENT text verbatim
        self.data.segment = content

    def source(self, content):
        # store the SOURCE text verbatim
        self.data.source = content

    def organism(self, content):
        # store the ORGANISM text verbatim
        self.data.organism = content

    def taxonomy(self, content):
        # split the taxonomy lineage into a list via the base-class helper
        self.data.taxonomy = self._split_taxonomy(content)
1139 - def reference_num(self, content):
1140 """Grab the reference number and signal the start of a new reference. 1141 """ 1142 # check if we have a reference to add 1143 if self._cur_reference is not None: 1144 self.data.references.append(self._cur_reference) 1145 1146 self._cur_reference = Record.Reference() 1147 self._cur_reference.number = content
1148
    def reference_bases(self, content):
        # base span text for the open reference, stored verbatim
        self._cur_reference.bases = content

    def authors(self, content):
        # AUTHORS text, stored verbatim on the open reference
        self._cur_reference.authors = content

    def consrtm(self, content):
        # CONSRTM (consortium) text, stored verbatim
        self._cur_reference.consrtm = content

    def title(self, content):
        # TITLE text, stored verbatim
        self._cur_reference.title = content

    def journal(self, content):
        # JOURNAL text, stored verbatim
        self._cur_reference.journal = content

    def medline_id(self, content):
        # MEDLINE identifier, stored verbatim
        self._cur_reference.medline_id = content

    def pubmed_id(self, content):
        # PUBMED identifier, stored verbatim
        self._cur_reference.pubmed_id = content

    def remark(self, content):
        # REMARK text, stored verbatim
        self._cur_reference.remark = content
1172
    def comment(self, content):
        # join the pieces with newlines and append to any existing text
        # NOTE(review): assumes content is a sequence of lines -- if a
        # plain string were ever passed, join would insert '\n' between
        # individual characters; confirm against the scanner.
        self.data.comment += "\n".join(content)

    def primary_ref_line(self,content):
        """Data for the PRIMARY line"""
        self.data.primary.append(content)

    def primary(self,content):
        # the PRIMARY header line itself carries no data worth recording
        pass
1182
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Signal the start of the feature table.
        """
        # we need to add on the last reference, since no further
        # reference events will arrive once the feature table starts
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)
1194
1195 - def feature_key(self, content):
1196 """Grab the key of the feature and signal the start of a new feature. 1197 """ 1198 # first add on feature information if we've got any 1199 self._add_feature() 1200 1201 self._cur_feature = Record.Feature() 1202 self._cur_feature.key = content
1203
1204 - def _add_feature(self):
1205 """Utility function to add a feature to the Record. 1206 1207 This does all of the appropriate checking to make sure we haven't 1208 left any info behind, and that we are only adding info if it 1209 exists. 1210 """ 1211 if self._cur_feature is not None: 1212 # if we have a left over qualifier, add it to the qualifiers 1213 # on the current feature 1214 if self._cur_qualifier is not None: 1215 self._cur_feature.qualifiers.append(self._cur_qualifier) 1216 1217 self._cur_qualifier = None 1218 self.data.features.append(self._cur_feature)
1219
    def location(self, content):
        # store the location text, tidied up by the _clean_location
        # helper, on the feature currently being built
        self._cur_feature.location = self._clean_location(content)
1222
1223 - def feature_qualifier_name(self, content_list):
1224 """Deal with qualifier names 1225 1226 We receive a list of keys, since you can have valueless keys such as 1227 /pseudo which would be passed in with the next key (since no other 1228 tags separate them in the file) 1229 """ 1230 for content in content_list: 1231 # the record parser keeps the /s -- add them if we don't have 'em 1232 if content.find("/") != 0: 1233 content = "/%s" % content 1234 # add on a qualifier if we've got one 1235 if self._cur_qualifier is not None: 1236 self._cur_feature.qualifiers.append(self._cur_qualifier) 1237 1238 self._cur_qualifier = Record.Qualifier() 1239 self._cur_qualifier.key = content
1240
1241 - def feature_qualifier_description(self, content):
1242 # if we have info then the qualifier key should have a ='s 1243 if self._cur_qualifier.key.find("=") == -1: 1244 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1245 cur_content = self._remove_newlines(content) 1246 # remove all spaces from the value if it is a type where spaces 1247 # are not important 1248 for remove_space_key in self.__class__.remove_space_keys: 1249 if self._cur_qualifier.key.find(remove_space_key) >= 0: 1250 cur_content = self._remove_spaces(cur_content) 1251 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1252
    def base_count(self, content):
        # store the BASE COUNT text verbatim
        self.data.base_counts = content

    def origin_name(self, content):
        # store the text following ORIGIN verbatim
        self.data.origin = content

    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        # unlike the feature consumer, the record consumer just keeps
        # the cleaned-up CONTIG location text
        self.data.contig = self._clean_location(content)
1263
1264 - def sequence(self, content):
1265 """Add sequence information to a list of sequence strings. 1266 1267 This removes spaces in the data and uppercases the sequence, and 1268 then adds it to a list of sequences. Later on we'll join this 1269 list together to make the final sequence. This is faster than 1270 adding on the new string every time. 1271 """ 1272 new_seq = content.replace(' ', '') 1273 self._seq_data.append(new_seq.upper())
1274
1275 - def record_end(self, content):
1276 """Signal the end of the record and do any necessary clean-up. 1277 """ 1278 # add together all of the sequence parts to create the 1279 # final sequence string 1280 self.data.sequence = "".join(self._seq_data) 1281 # add on the last feature 1282 self._add_feature()
1283 1284
class NCBIDictionary:
    """Access GenBank using a read-only dictionary interface (DEPRECATED).

    This object is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.
    """
    # databases this interface will pass through to Entrez.efetch
    VALID_DATABASES = ['nucleotide', 'protein', 'genome']
    # accepted return formats; 'genbank' is translated to Entrez's 'gb'
    VALID_FORMATS = ['genbank', 'fasta']
    def __init__(self, database, format, parser = None):
        """Initialize an NCBI dictionary to retrieve sequences.

        Create a new Dictionary to access GenBank. Valid values for
        database are 'nucleotide' and 'protein'.
        Valid values for format are 'genbank' (for nucleotide genbank and
        protein genpept) and 'fasta'.
        The old 'delay' and 'retmax' options are no longer accepted.
        parser is an optional parser object
        to change the results into another form. If unspecified, then
        the raw contents of the file will be returned.

        Raises ValueError for an unrecognised database or format.
        """
        import warnings
        warnings.warn("Bio.GenBank.NCBIDictionary has been deprecated, and will be"\
                      " removed in a future release of Biopython. Please use"\
                      " Bio.Entrez instead which is described in the tutorial.",
                      DeprecationWarning)

        self.parser = parser
        if database not in self.__class__.VALID_DATABASES:
            raise ValueError("Invalid database %s, should be one of %s" %
                             (database, self.__class__.VALID_DATABASES))
        if format not in self.__class__.VALID_FORMATS:
            raise ValueError("Invalid format %s, should be one of %s" %
                             (format, self.__class__.VALID_FORMATS))

        # Entrez expects 'gb' where this API historically said 'genbank'
        if format=="genbank": format = "gb"
        self.db = database
        self.format = format

    def __len__(self):
        raise NotImplementedError("GenBank contains lots of entries")
    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")
    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")
    def update(self):
        raise NotImplementedError("This is a read-only dictionary")
    def copy(self):
        raise NotImplementedError("You don't need to do this...")
    def keys(self):
        raise NotImplementedError("You don't really want to do this...")
    def items(self):
        raise NotImplementedError("You don't really want to do this...")
    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        # returns 0/1 rather than False/True (historical Python 2 style)
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        # dict-style get: return the record, or failobj when lookup fails
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """Return the GenBank entry specified by the GenBank ID.

        Raises a KeyError if there's an error.
        """
        # NOTE(review): network/Entrez failures may surface as exception
        # types other than KeyError -- confirm before relying on the
        # dict-style contract used by get() and has_key().
        handle = Entrez.efetch(db = self.db, id = id, rettype = self.format)
        # Parse the record if a parser was passed in.
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
1365
def search_for(search, database='nucleotide',
               reldate=None, mindate=None, maxdate=None,
               start_id = 0, max_ids = 50000000):
    """Do an online search at the NCBI, returns a list of IDs (DEPRECATED).

    This function is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.

    Search GenBank and return a list of the GenBank identifiers (gi's)
    that match the criteria.  Arguments:
     - search    - the query string used to search the database.
     - database  - 'nucleotide', 'protein', 'popset' or 'genome'.
     - reldate   - number of days prior to today to restrict the search.
     - mindate / maxdate - date window for the search, e.g. 2002/12/20
       (both should be supplied or neither).
     - start_id  - the result number to begin retrieval on.
     - max_ids   - maximum number of ids to retrieve.

    Raises TypeError when mindate or maxdate is not in YYYY/MM/DD form.
    """
    import warnings
    warnings.warn("Bio.GenBank.search_for has been deprecated, and will be"
                  " removed in a future release of Biopython. Please use"
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning)

    # mindate/maxdate must look like "YYYY", "YYYY/MM" or "YYYY/MM/DD";
    # the check mirrors the one in Bio/EUtils/Datatypes.py
    import re
    date_ok = re.compile(r"\d{4}(/\d\d(/\d\d)?)?$").match
    for field_name, field_value in (("mindate", mindate),
                                    ("maxdate", maxdate)):
        if field_value is not None and date_ok(field_value) is None:
            raise TypeError(
                "%s is not in YYYY/MM/DD format (month and "
                "day are optional): %r" % (field_name, field_value))

    #Bio.Entrez can now ignore None arguments automatically
    handle = Entrez.esearch(database, search, retmode="xml",
                            retstart=start_id, retmax=max_ids,
                            mindate=mindate, maxdate=maxdate,
                            reldate=reldate)
    return Entrez.read(handle)["IdList"]
1412
def download_many(ids, database = 'nucleotide'):
    """Download multiple NCBI GenBank records, returned as a handle (DEPRECATED).

    This function is deprecated and will be removed in a future release of
    Biopython. Please use Bio.Entrez instead as described in the tutorial.

    Download many records from GenBank.  ids is a list of gis or
    accessions; database is 'nucleotide' or 'protein'.

    Raises ValueError for any other database.
    """
    import warnings
    warnings.warn("Bio.GenBank.download_many has been deprecated, and will be"
                  " removed in a future release of Biopython. Please use"
                  " Bio.Entrez instead which is described in the tutorial.",
                  DeprecationWarning)

    # map the database onto the matching Entrez return format
    formats = {'nucleotide': 'gb', 'protein': 'gp'}
    try:
        format = formats[database]
    except KeyError:
        raise ValueError("Unexpected database: %s" % database)
    #TODO - Batch the queries?
    result_handle = Entrez.efetch(database,
                                  id=",".join(ids),
                                  retmode = "text",
                                  rettype = format)
    return cStringIO.StringIO(result_handle.read())
1440