Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007 by Peter Cock.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # This code is NOT intended for direct use.  It provides a basic scanner 
   7  # (for use with a event consumer such as Bio.GenBank._FeatureConsumer) 
   8  # to parse a GenBank or EMBL file (with their shared INSDC feature table). 
   9  # 
  10  # It is used by Bio.GenBank to parse GenBank files 
  11  # It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  12  # 
  13  # Feature Table Documentation: 
  14  # http://www.insdc.org/files/feature_table.html 
  15  # http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  16  # ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  17  # 
  18  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  19  # These are GenBank files that summarize the content of a project, and provide lists of 
  20  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  21  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  22  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  23  # http://is.gd/nNgk 
  24  # for more details of this format, and an example. 
  25  # Added by Ying Huang & Iddo Friedberg 
  26   
  27  import sys 
  28  import os 
  29  from Bio.Seq import Seq 
  30  from Bio.SeqRecord import SeqRecord 
  31  from Bio.Alphabet import generic_alphabet, generic_protein 
  32   
33 -class InsdcScanner :
34 """Basic functions for breaking up a GenBank/EMBL file into sub sections. 35 36 The International Nucleotide Sequence Database Collaboration (INSDC) 37 between the DDBJ, EMBL, and GenBank. These organisations all use the 38 same "Feature Table" layout in their plain text flat file formats. 39 40 However, the header and sequence sections of an EMBL file are very 41 different in layout to those produced by GenBank/DDBJ.""" 42 43 #These constants get redefined with sensible values in the sub classes: 44 RECORD_START = "XXX" # "LOCUS " or "ID " 45 HEADER_WIDTH = 3 # 12 or 5 46 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"] 47 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"] 48 FEATURE_QUALIFIER_INDENT = 0 49 FEATURE_QUALIFIER_SPACER = "" 50 SEQUENCE_HEADERS=["XXX"] #with right hand side spaces removed 51
52 - def __init__(self, debug=0) :
53 assert len(self.RECORD_START)==self.HEADER_WIDTH 54 for marker in self.SEQUENCE_HEADERS : 55 assert marker==marker.rstrip() 56 assert len(self.FEATURE_QUALIFIER_SPACER)==self.FEATURE_QUALIFIER_INDENT 57 self.debug = debug 58 self.line = None
59
60 - def set_handle(self, handle) :
61 self.handle = handle 62 self.line = ""
63
64 - def find_start(self) :
65 """Read in lines until find the ID/LOCUS line, which is returned. 66 67 Any preamble (such as the header used by the NCBI on *.seq.gz archives) 68 will we ignored.""" 69 while True : 70 if self.line : 71 line = self.line 72 self.line = "" 73 else : 74 line = self.handle.readline() 75 if not line : 76 if self.debug : print "End of file" 77 return None 78 if line[:self.HEADER_WIDTH]==self.RECORD_START : 79 if self.debug > 1: print "Found the start of a record:\n" + line 80 break 81 line = line.rstrip() 82 if line == "//" : 83 if self.debug > 1: print "Skipping // marking end of last record" 84 elif line == "" : 85 if self.debug > 1: print "Skipping blank line before record" 86 else : 87 #Ignore any header before the first ID/LOCUS line. 88 if self.debug > 1: 89 print "Skipping header line before record:\n" + line 90 self.line = line 91 return line
92
93 - def parse_header(self) :
94 """Return list of strings making up the header 95 96 New line characters are removed. 97 98 Assumes you have just read in the ID/LOCUS line. 99 """ 100 assert self.line[:self.HEADER_WIDTH]==self.RECORD_START, \ 101 "Not at start of record" 102 103 header_lines = [] 104 while True : 105 line = self.handle.readline() 106 if not line : 107 raise ValueError("Premature end of line during sequence data") 108 line = line.rstrip() 109 if line in self.FEATURE_START_MARKERS : 110 if self.debug : print "Found header table" 111 break 112 #if line[:self.HEADER_WIDTH]==self.FEATURE_START_MARKER[:self.HEADER_WIDTH] : 113 # if self.debug : print "Found header table (?)" 114 # break 115 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS : 116 if self.debug : print "Found start of sequence" 117 break 118 if line == "//" : 119 raise ValueError("Premature end of sequence data marker '//' found") 120 header_lines.append(line) 121 self.line = line 122 return header_lines
123
124 - def parse_features(self, skip=False) :
125 """Return list of tuples for the features (if present) 126 127 Each feature is returned as a tuple (key, location, qualifiers) 128 where key and location are strings (e.g. "CDS" and 129 "complement(join(490883..490885,1..879))") while qualifiers 130 is a list of two string tuples (feature qualifier keys and values). 131 132 Assumes you have already read to the start of the features table. 133 """ 134 if self.line.rstrip() not in self.FEATURE_START_MARKERS : 135 if self.debug : print "Didn't find any feature table" 136 return [] 137 138 while self.line.rstrip() in self.FEATURE_START_MARKERS : 139 self.line = self.handle.readline() 140 141 features = [] 142 line = self.line 143 while True : 144 if not line : 145 raise ValueError("Premature end of line during features table") 146 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS : 147 if self.debug : print "Found start of sequence" 148 break 149 line = line.rstrip() 150 if line == "//" : 151 raise ValueError("Premature end of features table, marker '//' found") 152 if line in self.FEATURE_END_MARKERS : 153 if self.debug : print "Found end of features" 154 line = self.handle.readline() 155 break 156 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "" : 157 raise ValueError("Expected a feature qualifier in line '%s'" % line) 158 159 if skip : 160 line = self.handle.readline() 161 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER : 162 line = self.handle.readline() 163 else : 164 #Build up a list of the lines making up this feature: 165 feature_key = line[2:self.FEATURE_QUALIFIER_INDENT].strip() 166 feature_lines = [line[self.FEATURE_QUALIFIER_INDENT:]] 167 line = self.handle.readline() 168 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \ 169 or line.rstrip() == "" : # cope with blank lines in the midst of a feature 170 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].rstrip()) 171 line = self.handle.readline() 172 features.append(self.parse_feature(feature_key, feature_lines)) 173 self.line = line 174 return features
175
176 - def parse_feature(self, feature_key, lines) :
177 """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers) 178 179 For example given this GenBank feature: 180 181 CDS complement(join(490883..490885,1..879)) 182 /locus_tag="NEQ001" 183 /note="conserved hypothetical [Methanococcus jannaschii]; 184 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear 185 localization signal; IPR002743: Protein of unknown 186 function DUF57" 187 /codon_start=1 188 /transl_table=11 189 /product="hypothetical protein" 190 /protein_id="NP_963295.1" 191 /db_xref="GI:41614797" 192 /db_xref="GeneID:2732620" 193 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK 194 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK 195 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP 196 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE 197 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS 198 LNSMGFGFVNTKKNSAR" 199 200 Then should give input key="CDS" and the rest of the data as a list of strings 201 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"] 202 where the leading spaces and trailing newlines have been removed. 203 204 Returns tuple containing: (key as string, location string, qualifiers as list) 205 as follows for this example: 206 207 key = "CDS", string 208 location = "complement(join(490883..490885,1..879))", string 209 qualifiers = list of string tuples: 210 211 [('locus_tag', '"NEQ001"'), 212 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'), 213 ('codon_start', '1'), 214 ('transl_table', '11'), 215 ('product', '"hypothetical protein"'), 216 ('protein_id', '"NP_963295.1"'), 217 ('db_xref', '"GI:41614797"'), 218 ('db_xref', '"GeneID:2732620"'), 219 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')] 220 221 In the above example, the "note" and "translation" were edited for compactness, 222 and they would contain multiple new line characters (displayed above as \n) 223 224 If a qualifier is quoted (in this case, everything except codon_start and 225 transl_table) then the quotes are NOT removed. 226 227 Note that no whitespace is removed. 228 """ 229 #Skip any blank lines 230 iterator = iter(filter(None, lines)) 231 try : 232 line = iterator.next() 233 234 feature_location = line.strip() 235 while feature_location[-1:]=="," : 236 #Multiline location, still more to come! 237 feature_location += iterator.next().strip() 238 239 qualifiers=[] 240 241 for line in iterator : 242 if line[0]=="/" : 243 #New qualifier 244 i = line.find("=") 245 key = line[1:i] #does not work if i==-1 246 value = line[i+1:] #we ignore 'value' if i==-1 247 if i==-1 : 248 #Qualifier with no key, e.g. /pseudo 249 key = line[1:] 250 qualifiers.append((key,None)) 251 elif value[0]=='"' : 252 #Quoted... 253 if value[-1]!='"' or value!='"' : 254 #No closing quote on the first line... 255 while value[-1] != '"' : 256 value += "\n" + iterator.next() 257 else : 258 #One single line (quoted) 259 assert value == '"' 260 if self.debug : print "Quoted line %s:%s" % (key, value) 261 #DO NOT remove the quotes... 262 qualifiers.append((key,value)) 263 else : 264 #Unquoted 265 #if debug : print "Unquoted line %s:%s" % (key,value) 266 qualifiers.append((key,value)) 267 else : 268 #Unquoted continuation 269 assert len(qualifiers) > 0 270 assert key==qualifiers[-1][0] 271 #if debug : print "Unquoted Cont %s:%s" % (key, line) 272 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line) 273 return (feature_key, feature_location, qualifiers) 274 except StopIteration: 275 #Bummer 276 raise ValueError("Problem with '%s' feature:\n%s" \ 277 % (feature_key, "\n".join(lines)))
278 299
300 - def _feed_first_line(self, consumer, line) :
301 """Handle the LOCUS/ID line, passing data to the comsumer 302 303 This should be implemented by the EMBL / GenBank specific subclass 304 305 Used by the parse_records() and parse() methods. 306 """ 307 pass
308
309 - def _feed_header_lines(self, consumer, lines) :
310 """Handle the header lines (list of strings), passing data to the comsumer 311 312 This should be implemented by the EMBL / GenBank specific subclass 313 314 Used by the parse_records() and parse() methods. 315 """ 316 pass
317 318
319 - def _feed_feature_table(self, consumer, feature_tuples) :
320 """Handle the feature table (list of tuples), passing data to the comsumer 321 322 Used by the parse_records() and parse() methods. 323 """ 324 consumer.start_feature_table() 325 for feature_key, location_string, qualifiers in feature_tuples : 326 consumer.feature_key(feature_key) 327 consumer.location(location_string) 328 for q_key, q_value in qualifiers : 329 consumer.feature_qualifier_name([q_key]) 330 if q_value is not None : 331 consumer.feature_qualifier_description(q_value.replace("\n"," "))
332
333 - def _feed_misc_lines(self, consumer, lines) :
334 """Handle any lines between features and sequence (list of strings), passing data to the consumer 335 336 This should be implemented by the EMBL / GenBank specific subclass 337 338 Used by the parse_records() and parse() methods. 339 """ 340 pass
341
342 - def feed(self, handle, consumer, do_features=True) :
343 """Feed a set of data into the consumer. 344 345 This method is intended for use with the "old" code in Bio.GenBank 346 347 Arguments: 348 handle - A handle with the information to parse. 349 consumer - The consumer that should be informed of events. 350 do_features - Boolean, should the features be parsed? 351 Skipping the features can be much faster. 352 353 Return values: 354 true - Passed a record 355 false - Did not find a record 356 """ 357 #Should work with both EMBL and GenBank files provided the 358 #equivalent Bio.GenBank._FeatureConsumer methods are called... 359 self.set_handle(handle) 360 if not self.find_start() : 361 #Could not find (another) record 362 consumer.data=None 363 return False 364 365 #We use the above class methods to parse the file into a simplified format. 366 #The first line, header lines and any misc lines after the features will be 367 #dealt with by GenBank / EMBL specific derived classes. 368 369 #First line and header: 370 self._feed_first_line(consumer, self.line) 371 self._feed_header_lines(consumer, self.parse_header()) 372 373 #Features (common to both EMBL and GenBank): 374 if do_features : 375 self._feed_feature_table(consumer, self.parse_features(skip=False)) 376 else : 377 self.parse_features(skip=True) # ignore the data 378 379 #Footer and sequence 380 misc_lines, sequence_string = self.parse_footer() 381 self._feed_misc_lines(consumer, misc_lines) 382 383 consumer.sequence(sequence_string) 384 #Calls to consumer.base_number() do nothing anyway 385 consumer.record_end("//") 386 387 assert self.line == "//" 388 389 #And we are done 390 return True
391
392 - def parse(self, handle, do_features=True) :
393 """Returns a SeqRecord (with SeqFeatures if do_features=True) 394 395 See also the method parse_records() for use on multi-record files. 396 """ 397 from Bio.GenBank import _FeatureConsumer 398 from Bio.GenBank.utils import FeatureValueCleaner 399 400 consumer = _FeatureConsumer(use_fuzziness = 1, 401 feature_cleaner = FeatureValueCleaner()) 402 403 if self.feed(handle, consumer) : 404 return consumer.data 405 else : 406 return None
407 408
409 - def parse_records(self, handle, do_features=True) :
410 """Returns a SeqRecord object iterator 411 412 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord 413 414 The SeqRecord objects include SeqFeatures if do_features=True 415 416 This method is intended for use in Bio.SeqIO 417 """ 418 #This is a generator function 419 while True : 420 record = self.parse(handle) 421 if record is None : break 422 assert record.id is not None 423 assert record.name != "<unknown name>" 424 assert record.description != "<unknown description>" 425 yield record
426
427 - def parse_cds_features(self, handle, 428 alphabet=generic_protein, 429 tags2id=('protein_id','locus_tag','product')) :
430 """Returns SeqRecord object iterator 431 432 Each CDS feature becomes a SeqRecord. 433 434 alphabet - Used for any sequence found in a translation field. 435 tags2id - Tupple of three strings, the feature keys to use 436 for the record id, name and description, 437 438 This method is intended for use in Bio.SeqIO 439 """ 440 self.set_handle(handle) 441 while self.find_start() : 442 #Got an EMBL or GenBank record... 443 self.parse_header() # ignore header lines! 444 feature_tuples = self.parse_features() 445 #self.parse_footer() # ignore footer lines! 446 while True : 447 line = self.handle.readline() 448 if not line : break 449 if line[:2]=="//" : break 450 self.line = line.rstrip() 451 452 #Now go though those features... 453 for key, location_string, qualifiers in feature_tuples : 454 if key=="CDS" : 455 #Create SeqRecord 456 #================ 457 #SeqRecord objects cannot be created with annotations, they 458 #must be added afterwards. So create an empty record and 459 #then populate it: 460 record = SeqRecord(seq=None) 461 annotations = record.annotations 462 463 #Should we add a location object to the annotations? 464 #I *think* that only makes sense for SeqFeatures with their 465 #sub features... 466 annotations['raw_location'] = location_string.replace(' ','') 467 468 for (qualifier_name, qualifier_data) in qualifiers : 469 if qualifier_data is not None \ 470 and qualifier_data[0]=='"' and qualifier_data[-1]=='"' : 471 #Remove quotes 472 qualifier_data = qualifier_data[1:-1] 473 #Append the data to the annotation qualifier... 474 if qualifier_name == "translation" : 475 assert record.seq is None, "Multiple translations!" 476 record.seq = Seq(qualifier_data.replace("\n",""), alphabet) 477 elif qualifier_name == "db_xref" : 478 #its a list, possibly empty. Its safe to extend 479 record.dbxrefs.append(qualifier_data) 480 else : 481 if qualifier_data is not None : 482 qualifier_data = qualifier_data.replace("\n"," ").replace(" "," ") 483 try : 484 annotations[qualifier_name] += " " + qualifier_data 485 except KeyError : 486 #Not an addition to existing data, its the first bit 487 annotations[qualifier_name]= qualifier_data 488 489 #Fill in the ID, Name, Description 490 #================================= 491 try : 492 record.id = annotations[tags2id[0]] 493 except KeyError : 494 pass 495 try : 496 record.name = annotations[tags2id[1]] 497 except KeyError : 498 pass 499 try : 500 record.description = annotations[tags2id[2]] 501 except KeyError : 502 pass 503 504 yield record
505
506 -class EmblScanner(InsdcScanner) :
507 """For extracting chunks of information in EMBL files""" 508 509 RECORD_START = "ID " 510 HEADER_WIDTH = 5 511 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers","FH"] 512 FEATURE_END_MARKERS = ["XX"] #XX can also mark the end of many things! 513 FEATURE_QUALIFIER_INDENT = 21 514 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2) 515 SEQUENCE_HEADERS=["SQ"] #Remove trailing spaces 516 550
551 - def _feed_first_line(self, consumer, line) :
552 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 553 if line[self.HEADER_WIDTH:].count(";") == 6 : 554 #Looks like the semi colon separated style introduced in 2006 555 self._feed_first_line_new(consumer, line) 556 elif line[self.HEADER_WIDTH:].count(";") == 3 : 557 #Looks like the pre 2006 style 558 self._feed_first_line_old(consumer, line) 559 else : 560 raise ValueError('Did not recognise the ID line layout:\n' + line)
561
562 - def _feed_first_line_old(self, consumer, line) :
563 #Expects an ID line in the style before 2006, e.g. 564 #ID SC10H5 standard; DNA; PRO; 4870 BP. 565 #ID BSUB9999 standard; circular DNA; PRO; 4214630 BP. 566 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 567 fields = [line[self.HEADER_WIDTH:].split(None,1)[0]] 568 fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";")) 569 fields = [entry.strip() for entry in fields] 570 """ 571 The tokens represent: 572 0. Primary accession number 573 (space sep) 574 1. ??? (e.g. standard) 575 (semi-colon) 576 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA') 577 3. Taxonomic division (e.g. 'PRO') 578 4. Sequence length (e.g. '4639675 BP.') 579 """ 580 consumer.locus(fields[0]) #Should we also call the accession consumer? 581 consumer.residue_type(fields[2]) 582 consumer.data_file_division(fields[3]) 583 self._feed_seq_length(consumer, fields[4])
584
585 - def _feed_first_line_new(self, consumer, line) :
586 #Expects an ID line in the style introduced in 2006, e.g. 587 #ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 588 #ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP. 589 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 590 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")] 591 assert len(fields) == 7 592 """ 593 The tokens represent: 594 0. Primary accession number 595 1. Sequence version number 596 2. Topology: 'circular' or 'linear' 597 3. Molecule type (e.g. 'genomic DNA') 598 4. Data class (e.g. 'STD') 599 5. Taxonomic division (e.g. 'PRO') 600 6. Sequence length (e.g. '4639675 BP.') 601 """ 602 603 consumer.locus(fields[0]) 604 605 #Call the accession consumer now, to make sure we record 606 #something as the record.id, in case there is no AC line 607 consumer.accession(fields[0]) 608 609 #TODO - How to deal with the version field? At the moment the consumer 610 #will try and use this for the ID which isn't ideal for EMBL files. 611 version_parts = fields[1].split() 612 if len(version_parts)==2 \ 613 and version_parts[0]=="SV" \ 614 and version_parts[1].isdigit() : 615 consumer.version_suffix(version_parts[1]) 616 617 #Based on how the old GenBank parser worked, merge these two: 618 consumer.residue_type(" ".join(fields[2:4])) #TODO - Store as two fields? 619 620 #consumer.xxx(fields[4]) #TODO - What should we do with the data class? 621 622 consumer.data_file_division(fields[5]) 623 624 self._feed_seq_length(consumer, fields[6])
625
626 - def _feed_seq_length(self, consumer, text) :
627 length_parts = text.split() 628 assert len(length_parts) == 2 629 assert length_parts[1].upper() in ["BP", "BP."] 630 consumer.size(length_parts[0])
631
632 - def _feed_header_lines(self, consumer, lines) :
633 EMBL_INDENT = self.HEADER_WIDTH 634 EMBL_SPACER = " " * EMBL_INDENT 635 consumer_dict = { 636 'AC' : 'accession', 637 'SV' : 'version', # SV line removed in June 2006, now part of ID line 638 'DE' : 'definition', 639 #'RN' : 'reference_num', 640 #'RP' : 'reference_bases', 641 #'RX' : reference cross reference... DOI or Pubmed 642 'RA' : 'authors', 643 'RT' : 'title', 644 'RL' : 'journal', 645 'OS' : 'organism', 646 'OC' : 'taxonomy', 647 #'DR' : data reference? 648 'CC' : 'comment', 649 #'XX' : splitter 650 } 651 #We have to handle the following specially: 652 #RX (depending on reference type...) 653 lines = filter(None,lines) 654 line_iter = iter(lines) 655 try : 656 while True : 657 try : 658 line = line_iter.next() 659 except StopIteration : 660 break 661 if not line : break 662 line_type = line[:EMBL_INDENT].strip() 663 data = line[EMBL_INDENT:].strip() 664 665 if line_type == 'XX' : 666 pass 667 elif line_type == 'RN' : 668 # Reformat reference numbers for the GenBank based consumer 669 # e.g. '[1]' becomes '1' 670 if data[0] == "[" and data[-1] == "]" : data = data[1:-1] 671 consumer.reference_num(data) 672 elif line_type == 'RP' : 673 # Reformat reference numbers for the GenBank based consumer 674 # e.g. '1-4639675' becomes '(bases 1 to 4639675)' 675 assert data.count("-")==1 676 consumer.reference_bases("(bases " + data.replace("-", " to ") + ")") 677 elif line_type == 'RX' : 678 # EMBL support three reference types at the moment: 679 # - PUBMED PUBMED bibliographic database (NLM) 680 # - DOI Digital Object Identifier (International DOI Foundation) 681 # - AGRICOLA US National Agriculture Library (NAL) of the US Department 682 # of Agriculture (USDA) 683 # 684 # Format: 685 # RX resource_identifier; identifier. 686 # 687 # e.g. 688 # RX DOI; 10.1016/0024-3205(83)90010-3. 689 # RX PUBMED; 264242. 690 # 691 # Currently our reference object only supports PUBMED and MEDLINE 692 # (as these were in GenBank files?). 693 key, value = data.split(";",1) 694 if value.endswith(".") : value = value[:-1] 695 value = value.strip() 696 if key == "PUBMED" : 697 consumer.pubmed_id(value) 698 #TODO - Handle other reference types (here and in BioSQL bindings) 699 elif line_type == 'CC' : 700 # Have to pass a list of strings for this one (not just a string) 701 consumer.comment([data]) 702 elif line_type == 'DR' : 703 # Database Cross-reference, format: 704 # DR database_identifier; primary_identifier; secondary_identifier. 705 # 706 # e.g. 707 # DR MGI; 98599; Tcrb-V4. 708 # 709 # TODO - Data reference... 710 # How should we store the secondary identifier (if present)? Ignore it? 711 pass 712 elif line_type in consumer_dict : 713 #Its a semi-automatic entry! 714 getattr(consumer, consumer_dict[line_type])(data) 715 else : 716 if self.debug : 717 print "Ignoring EMBL header line:\n%s" % line 718 except StopIteration : 719 raise ValueError("Problem with header")
720
721 - def _feed_misc_lines(self, consumer, lines) :
722 #TODO - Should we do something with the information on the SQ line(s)? 723 pass
724
725 -class GenBankScanner(InsdcScanner) :
726 """For extracting chunks of information in GenBank files""" 727 728 RECORD_START = "LOCUS " 729 HEADER_WIDTH = 12 730 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers","FEATURES"] 731 FEATURE_END_MARKERS = [] 732 FEATURE_QUALIFIER_INDENT = 21 733 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 734 SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"] # trailing spaces removed 735 778
779 - def _feed_first_line(self, consumer, line) :
780 ##################################### 781 # LOCUS line # 782 ##################################### 783 GENBANK_INDENT = self.HEADER_WIDTH 784 GENBANK_SPACER = " "*GENBANK_INDENT 785 assert line[0:GENBANK_INDENT] == 'LOCUS ', \ 786 'LOCUS line does not start correctly:\n' + line 787 788 #Have to break up the locus line, and handle the different bits of it. 789 #There are at least two different versions of the locus line... 790 if line[29:33] in [' bp ', ' aa ',' rc '] : 791 #Old... 792 # 793 # Positions Contents 794 # --------- -------- 795 # 00:06 LOCUS 796 # 06:12 spaces 797 # 12:?? Locus name 798 # ??:?? space 799 # ??:29 Length of sequence, right-justified 800 # 29:33 space, bp, space 801 # 33:41 strand type 802 # 41:42 space 803 # 42:51 Blank (implies linear), linear or circular 804 # 51:52 space 805 # 52:55 The division code (e.g. BCT, VRL, INV) 806 # 55:62 space 807 # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 808 # 809 assert line[29:33] in [' bp ', ' aa ',' rc '] , \ 810 'LOCUS line does not contain size units at expected position:\n' + line 811 assert line[41:42] == ' ', \ 812 'LOCUS line does not contain space at position 42:\n' + line 813 assert line[42:51].strip() in ['','linear','circular'], \ 814 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 815 assert line[51:52] == ' ', \ 816 'LOCUS line does not contain space at position 52:\n' + line 817 assert line[55:62] == ' ', \ 818 'LOCUS line does not contain spaces from position 56 to 62:\n' + line 819 assert line[64:65] == '-', \ 820 'LOCUS line does not contain - at position 65 in date:\n' + line 821 assert line[68:69] == '-', \ 822 'LOCUS line does not contain - at position 69 in date:\n' + line 823 824 name_and_length_str = line[GENBANK_INDENT:29] 825 while name_and_length_str.find(' ')!=-1 : 826 name_and_length_str = name_and_length_str.replace(' ',' ') 827 name_and_length = name_and_length_str.split(' ') 828 assert len(name_and_length)<=2, \ 829 'Cannot parse the name and length in the LOCUS line:\n' + line 830 assert len(name_and_length)!=1, \ 831 'Name and length collide in the LOCUS line:\n' + line 832 #Should be possible to split them based on position, if 833 #a clear definition of the standard exists THAT AGREES with 834 #existing files. 835 consumer.locus(name_and_length[0]) 836 consumer.size(name_and_length[1]) 837 #consumer.residue_type(line[33:41].strip()) 838 839 if line[33:51].strip() == "" and line[29:33] == ' aa ' : 840 #Amino acids -> protein (even if there is no residue type given) 841 #We want to use a protein alphabet in this case, rather than a 842 #generic one. Not sure if this is the best way to achieve this, 843 #but it works because the scanner checks for this: 844 consumer.residue_type("PROTEIN") 845 else : 846 consumer.residue_type(line[33:51].strip()) 847 848 consumer.data_file_division(line[52:55]) 849 consumer.date(line[62:73]) 850 elif line[40:44] in [' bp ', ' aa ',' rc '] : 851 #New... 852 # 853 # Positions Contents 854 # --------- -------- 855 # 00:06 LOCUS 856 # 06:12 spaces 857 # 12:?? Locus name 858 # ??:?? space 859 # ??:40 Length of sequence, right-justified 860 # 40:44 space, bp, space 861 # 44:47 Blank, ss-, ds-, ms- 862 # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA 863 # 54:55 space 864 # 55:63 Blank (implies linear), linear or circular 865 # 63:64 space 866 # 64:67 The division code (e.g. BCT, VRL, INV) 867 # 67:68 space 868 # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 869 # 870 assert line[40:44] in [' bp ', ' aa ',' rc '] , \ 871 'LOCUS line does not contain size units at expected position:\n' + line 872 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 873 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 874 assert line[47:54].strip() == "" \ 875 or line[47:54].strip().find('DNA') != -1 \ 876 or line[47:54].strip().find('RNA') != -1, \ 877 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 878 assert line[54:55] == ' ', \ 879 'LOCUS line does not contain space at position 55:\n' + line 880 assert line[55:63].strip() in ['','linear','circular'], \ 881 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 882 assert line[63:64] == ' ', \ 883 'LOCUS line does not contain space at position 64:\n' + line 884 assert line[67:68] == ' ', \ 885 'LOCUS line does not contain space at position 68:\n' + line 886 assert line[70:71] == '-', \ 887 'LOCUS line does not contain - at position 71 in date:\n' + line 888 assert line[74:75] == '-', \ 889 'LOCUS line does not contain - at position 75 in date:\n' + line 890 891 name_and_length_str = line[GENBANK_INDENT:40] 892 while name_and_length_str.find(' ')!=-1 : 893 name_and_length_str = name_and_length_str.replace(' ',' ') 894 name_and_length = name_and_length_str.split(' ') 895 assert len(name_and_length)<=2, \ 896 'Cannot parse the name and length in the LOCUS line:\n' + line 897 assert len(name_and_length)!=1, \ 898 'Name and length collide in the LOCUS line:\n' + line 899 #Should be possible to split them based on position, if 900 #a clear definition of the stand exists THAT AGREES with 901 #existing files. 902 consumer.locus(name_and_length[0]) 903 consumer.size(name_and_length[1]) 904 905 if line[44:54].strip() == "" and line[40:44] == ' aa ' : 906 #Amino acids -> protein (even if there is no residue type given) 907 #We want to use a protein alphabet in this case, rather than a 908 #generic one. Not sure if this is the best way to achieve this, 909 #but it works because the scanner checks for this: 910 consumer.residue_type(("PROTEIN " + line[54:63]).strip()) 911 else : 912 consumer.residue_type(line[44:63].strip()) 913 914 consumer.data_file_division(line[64:67]) 915 consumer.date(line[68:79]) 916 elif line[GENBANK_INDENT:].strip().count(" ")==0 : 917 #Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 918 # 919 #e.g. 920 # 921 # "LOCUS U00096" 922 # 923 #rather than: 924 # 925 # "LOCUS U00096 4639675 bp DNA circular BCT" 926 # 927 # Positions Contents 928 # --------- -------- 929 # 00:06 LOCUS 930 # 06:12 spaces 931 # 12:?? Locus name 932 if line[GENBANK_INDENT:].strip() != "" : 933 consumer.locus(line[GENBANK_INDENT:].strip()) 934 else : 935 #Must just have just "LOCUS ", is this even legitimate? 936 #We should be able to continue parsing... we need real world testcases! 937 print >> sys.stderr, "Warning: Minimal LOCUS line found - is this correct?\n" + line 938 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"] : 939 #Cope with EMBOSS seqret output where it seems the locus id can cause 940 #the other fields to overflow. We just IGNORE the other fields! 941 consumer.locus(line.split()[1]) 942 consumer.size(line.split()[2]) 943 print >> sys.stderr, "Warning: Malformed LOCUS line found - is this correct?\n" + line 944 else : 945 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
946 947
948 - def _feed_header_lines(self, consumer, lines) :
949 #Following dictionary maps GenBank lines to the associated 950 #consumer methods - the special cases like LOCUS where one 951 #genbank line triggers several consumer calls have to be 952 #handled individually. 953 GENBANK_INDENT = self.HEADER_WIDTH 954 GENBANK_SPACER = " "*GENBANK_INDENT 955 consumer_dict = { 956 'DEFINITION' : 'definition', 957 'ACCESSION' : 'accession', 958 'NID' : 'nid', 959 'PID' : 'pid', 960 'DBSOURCE' : 'db_source', 961 'KEYWORDS' : 'keywords', 962 'SEGMENT' : 'segment', 963 'SOURCE' : 'source', 964 'AUTHORS' : 'authors', 965 'CONSRTM' : 'consrtm', 966 'PROJECT' : 'project', 967 'DBLINK' : 'dblink', 968 'TITLE' : 'title', 969 'JOURNAL' : 'journal', 970 'MEDLINE' : 'medline_id', 971 'PUBMED' : 'pubmed_id', 972 'REMARK' : 'remark'} 973 #We have to handle the following specially: 974 #ORIGIN (locus, size, residue_type, data_file_division and date) 975 #COMMENT (comment) 976 #VERSION (version and gi) 977 #REFERENCE (eference_num and reference_bases) 978 #ORGANISM (organism and taxonomy) 979 lines = filter(None,lines) 980 lines.append("") #helps avoid getting StopIteration all the time 981 line_iter = iter(lines) 982 try : 983 line = line_iter.next() 984 while True : 985 if not line : break 986 line_type = line[:GENBANK_INDENT].strip() 987 data = line[GENBANK_INDENT:].strip() 988 989 if line_type == 'VERSION' : 990 #Need to call consumer.version(), and maybe also consumer.gi() as well. 991 #e.g. 992 # VERSION AC007323.5 GI:6587720 993 while data.find(' ')!=-1: 994 data = data.replace(' ',' ') 995 if data.find(' GI:')==-1 : 996 consumer.version(data) 997 else : 998 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]" 999 consumer.version(data.split(' GI:')[0]) 1000 consumer.gi(data.split(' GI:')[1]) 1001 #Read in the next line! 1002 line = line_iter.next() 1003 elif line_type == 'REFERENCE' : 1004 if self.debug >1 : print "Found reference [" + data + "]" 1005 #Need to call consumer.reference_num() and consumer.reference_bases() 1006 #e.g. 1007 # REFERENCE 1 (bases 1 to 86436) 1008 # 1009 #Note that this can be multiline, see Bug 1968, e.g. 1010 # 1011 # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to 1012 # 28259) 1013 # 1014 #For such cases we will call the consumer once only. 1015 data = data.strip() 1016 1017 #Read in the next line, and see if its more of the reference: 1018 while True: 1019 line = line_iter.next() 1020 if line[:GENBANK_INDENT] == GENBANK_SPACER : 1021 #Add this continuation to the data string 1022 data += " " + line[GENBANK_INDENT:] 1023 if self.debug >1 : print "Extended reference text [" + data + "]" 1024 else : 1025 #End of the reference, leave this text in the variable "line" 1026 break 1027 1028 #We now have all the reference line(s) stored in a string, data, 1029 #which we pass to the consumer 1030 while data.find(' ')!=-1: 1031 data = data.replace(' ',' ') 1032 if data.find(' ')==-1 : 1033 if self.debug >2 : print 'Reference number \"' + data + '\"' 1034 consumer.reference_num(data) 1035 else : 1036 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"' 1037 consumer.reference_num(data[:data.find(' ')]) 1038 consumer.reference_bases(data[data.find(' ')+1:]) 1039 elif line_type == 'ORGANISM' : 1040 #Typically the first line is the organism, and subsequent lines 1041 #are the taxonomy lineage. However, given longer and longer 1042 #species names (as more and more strains and sub strains get 1043 #sequenced) the oragnism name can now get wrapped onto multiple 1044 #lines. The NCBI say we have to recognise the lineage line by 1045 #the presense of semi-colon delimited entries. In the long term, 1046 #they are considering adding a new keyword (e.g. LINEAGE). 1047 #See Bug 2591 for details. 1048 organism_data = data 1049 lineage_data = "" 1050 while True : 1051 line = line_iter.next() 1052 if line[0:GENBANK_INDENT] == GENBANK_SPACER : 1053 if lineage_data or ";" in line : 1054 lineage_data += " " + line[GENBANK_INDENT:] 1055 else : 1056 organism_data += " " + line[GENBANK_INDENT:].strip() 1057 else : 1058 #End of organism and taxonomy 1059 break 1060 consumer.organism(organism_data) 1061 if lineage_data.strip() == "" and self.debug > 1 : 1062 print "Taxonomy line(s) missing or blank" 1063 consumer.taxonomy(lineage_data.strip()) 1064 del organism_data, lineage_data 1065 elif line_type == 'COMMENT' : 1066 if self.debug > 1 : print "Found comment" 1067 #This can be multiline, and should call consumer.comment() once 1068 #with a list where each entry is a line. 1069 comment_list=[] 1070 comment_list.append(data) 1071 while True: 1072 line = line_iter.next() 1073 if line[0:GENBANK_INDENT] == GENBANK_SPACER : 1074 data = line[GENBANK_INDENT:] 1075 comment_list.append(data) 1076 if self.debug > 2 : print "Comment continuation [" + data + "]" 1077 else : 1078 #End of the comment 1079 break 1080 consumer.comment(comment_list) 1081 del comment_list 1082 elif line_type in consumer_dict : 1083 #Its a semi-automatic entry! 1084 #Now, this may be a multi line entry... 1085 while True : 1086 line = line_iter.next() 1087 if line[0:GENBANK_INDENT] == GENBANK_SPACER : 1088 data += ' ' + line[GENBANK_INDENT:] 1089 else : 1090 #We now have all the data for this entry: 1091 getattr(consumer, consumer_dict[line_type])(data) 1092 #End of continuation - return to top of loop! 1093 break 1094 else : 1095 if self.debug : 1096 print "Ignoring GenBank header line:\n" % line 1097 #Read in next line 1098 line = line_iter.next() 1099 except StopIteration : 1100 raise ValueError("Problem in header")
1101
1102 - def _feed_misc_lines(self, consumer, lines) :
1103 #Deals with a few misc lines between the features and the sequence 1104 GENBANK_INDENT = self.HEADER_WIDTH 1105 GENBANK_SPACER = " "*GENBANK_INDENT 1106 lines.append("") 1107 line_iter = iter(lines) 1108 try : 1109 for line in line_iter : 1110 if line.find('BASE COUNT')==0 : 1111 line = line[10:].strip() 1112 if line : 1113 if self.debug : print "base_count = " + line 1114 consumer.base_count(line) 1115 if line.find("ORIGIN")==0 : 1116 line = line[6:].strip() 1117 if line : 1118 if self.debug : print "origin_name = " + line 1119 consumer.origin_name(line) 1120 if line.find("WGS ")==0 : 1121 line = line[3:].strip() 1122 consumer.wgs(line) 1123 if line.find("WGS_SCAFLD")==0 : 1124 line = line[10:].strip() 1125 consumer.add_wgs_scafld(line) 1126 if line.find("CONTIG")==0 : 1127 line = line[6:].strip() 1128 contig_location = line 1129 while True : 1130 line = line_iter.next() 1131 if not line : 1132 break 1133 elif line[:GENBANK_INDENT]==GENBANK_SPACER : 1134 #Don't need to preseve the whitespace here. 1135 contig_location += line[GENBANK_INDENT:].rstrip() 1136 else: 1137 raise ValueError('Expected CONTIG continuation line, got:\n' + line) 1138 consumer.contig_location(contig_location) 1139 return 1140 except StopIteration : 1141 raise ValueError("Problem in misc lines before sequence")
1142 1143 if __name__ == "__main__" : 1144 from StringIO import StringIO 1145 1146 gbk_example = \ 1147 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1148 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1149 (AXL2) and Rev7p (REV7) genes, complete cds. 1150 ACCESSION U49845 1151 VERSION U49845.1 GI:1293613 1152 KEYWORDS . 1153 SOURCE Saccharomyces cerevisiae (baker's yeast) 1154 ORGANISM Saccharomyces cerevisiae 1155 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1156 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1157 REFERENCE 1 (bases 1 to 5028) 1158 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1159 TITLE Cloning and sequence of REV7, a gene whose function is required for 1160 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1161 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1162 PUBMED 7871890 1163 REFERENCE 2 (bases 1 to 5028) 1164 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1165 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1166 plasma membrane glycoprotein 1167 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1168 PUBMED 8846915 1169 REFERENCE 3 (bases 1 to 5028) 1170 AUTHORS Roemer,T. 1171 TITLE Direct Submission 1172 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1173 Haven, CT, USA 1174 FEATURES Location/Qualifiers 1175 source 1..5028 1176 /organism="Saccharomyces cerevisiae" 1177 /db_xref="taxon:4932" 1178 /chromosome="IX" 1179 /map="9" 1180 CDS <1..206 1181 /codon_start=3 1182 /product="TCP1-beta" 1183 /protein_id="AAA98665.1" 1184 /db_xref="GI:1293614" 1185 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1186 AEVLLRVDNIIRARPRTANRQHM" 1187 gene 687..3158 1188 /gene="AXL2" 1189 CDS 687..3158 1190 /gene="AXL2" 1191 /note="plasma membrane glycoprotein" 1192 /codon_start=1 1193 /function="required for axial budding pattern of S. 1194 cerevisiae" 1195 /product="Axl2p" 1196 /protein_id="AAA98666.1" 1197 /db_xref="GI:1293615" 1198 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1199 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1200 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1201 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1202 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1203 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1204 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1205 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1206 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1207 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1208 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1209 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1210 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1211 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1212 VDFSNKSNVNVGQVKDIHGRIPEML" 1213 gene complement(3300..4037) 1214 /gene="REV7" 1215 CDS complement(3300..4037) 1216 /gene="REV7" 1217 /codon_start=1 1218 /product="Rev7p" 1219 /protein_id="AAA98667.1" 1220 /db_xref="GI:1293616" 1221 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1222 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1223 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1224 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1225 LISGDDKILNGVYSQYEEGESIFGSLF" 1226 ORIGIN 1227 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1228 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1229 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1230 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1231 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1232 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1233 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1234 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1235 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1236 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1237 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1238 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1239 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1240 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1241 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1242 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1243 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1244 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1245 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1246 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1247 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1248 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1249 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1250 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1251 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1252 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1253 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc 1254 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1255 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1256 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1257 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1258 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1259 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc 1260 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1261 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1262 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1263 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1264 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1265 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1266 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1267 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1268 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1269 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1270 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1271 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1272 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1273 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1274 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1275 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1276 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1277 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1278 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1279 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1280 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1281 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1282 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1283 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1284 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1285 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 1286 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1287 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1288 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1289 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1290 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1291 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1292 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1293 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1294 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1295 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1296 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1297 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1298 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1299 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1300 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg 1301 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1302 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1303 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1304 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1305 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1306 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1307 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1308 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1309 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1310 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1311 //""" 1312 1313 # GenBank format protein (aka GenPept) file from: 1314 # http://www.molecularevolution.org/resources/fileformats/ 1315 gbk_example2 = \ 1316 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1317 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1318 ACCESSION AAD51968 1319 VERSION AAD51968.1 GI:5805369 1320 DBSOURCE locus AF171097 accession AF171097.1 1321 KEYWORDS . 1322 SOURCE Yersinia enterocolitica 1323 ORGANISM Yersinia enterocolitica 1324 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1325 Enterobacteriaceae; Yersinia. 1326 REFERENCE 1 (residues 1 to 143) 1327 AUTHORS Revell,P.A. and Miller,V.L. 1328 TITLE A chromosomally encoded regulator is required for expression of the 1329 Yersinia enterocolitica inv gene and for virulence 1330 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1331 MEDLINE 20138369 1332 PUBMED 10672189 1333 REFERENCE 2 (residues 1 to 143) 1334 AUTHORS Revell,P.A. and Miller,V.L. 1335 TITLE Direct Submission 1336 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1337 University School of Medicine, Campus Box 8230, 660 South Euclid, 1338 St. Louis, MO 63110, USA 1339 COMMENT Method: conceptual translation. 1340 FEATURES Location/Qualifiers 1341 source 1..143 1342 /organism="Yersinia enterocolitica" 1343 /mol_type="unassigned DNA" 1344 /strain="JB580v" 1345 /serotype="O:8" 1346 /db_xref="taxon:630" 1347 Protein 1..143 1348 /product="transcriptional regulator RovA" 1349 /name="regulates inv expression" 1350 CDS 1..143 1351 /gene="rovA" 1352 /coded_by="AF171097.1:380..811" 1353 /note="regulator of virulence" 1354 /transl_table=11 1355 ORIGIN 1356 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1357 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1358 121 deiellsgli dklerniiql qsk 1359 // 1360 """ 1361 1362 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1363 XX 1364 AC X56734; S46826; 1365 XX 1366 DT 12-SEP-1991 (Rel. 29, Created) 1367 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) 1368 XX 1369 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1370 XX 1371 KW beta-glucosidase. 1372 XX 1373 OS Trifolium repens (white clover) 1374 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1375 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1376 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1377 XX 1378 RN [5] 1379 RP 1-1859 1380 RX PUBMED; 1907511. 1381 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1382 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1383 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1384 RL Plant Mol. Biol. 17(2):209-219(1991). 1385 XX 1386 RN [6] 1387 RP 1-1859 1388 RA Hughes M.A.; 1389 RT ; 1390 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1391 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1392 RL Upon Tyne, NE2 4HH, UK 1393 XX 1394 FH Key Location/Qualifiers 1395 FH 1396 FT source 1..1859 1397 FT /organism="Trifolium repens" 1398 FT /mol_type="mRNA" 1399 FT /clone_lib="lambda gt10" 1400 FT /clone="TRE361" 1401 FT /tissue_type="leaves" 1402 FT /db_xref="taxon:3899" 1403 FT CDS 14..1495 1404 FT /product="beta-glucosidase" 1405 FT /EC_number="3.2.1.21" 1406 FT /note="non-cyanogenic" 1407 FT /db_xref="GOA:P26204" 1408 FT /db_xref="InterPro:IPR001360" 1409 FT /db_xref="InterPro:IPR013781" 1410 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1411 FT /protein_id="CAA40058.1" 1412 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1413 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1414 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1415 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1416 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1417 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1418 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1419 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1420 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1421 FT mRNA 1..1859 1422 FT /experiment="experimental evidence, no additional details 1423 FT recorded" 1424 XX 1425 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1426 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1427 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1428 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1429 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1430 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1431 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1432 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1433 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1434 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1435 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1436 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1437 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1438 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1439 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1440 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1441 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1442 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1443 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1444 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1445 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1446 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1447 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 1448 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1449 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1450 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1451 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1452 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 1453 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 1454 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 1455 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 1456 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 1457 // 1458 """ 1459 1460 print "GenBank CDS Iteration" 1461 print "=====================" 1462 1463 g = GenBankScanner() 1464 for record in g.parse_cds_features(StringIO(gbk_example)) : 1465 print record 1466 1467 g = GenBankScanner() 1468 for record in g.parse_cds_features(StringIO(gbk_example2), 1469 tags2id=('gene','locus_tag','product')) : 1470 print record 1471 1472 g = GenBankScanner() 1473 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2), 1474 tags2id=('gene','locus_tag','product')) : 1475 print record 1476 1477 print 1478 print "GenBank Iteration" 1479 print "=================" 1480 g = GenBankScanner() 1481 for record in g.parse_records(StringIO(gbk_example),do_features=False) : 1482 print record.id, record.name, record.description 1483 print record.seq 1484 1485 g = GenBankScanner() 1486 for record in g.parse_records(StringIO(gbk_example),do_features=True) : 1487 print record.id, record.name, record.description 1488 print record.seq 1489 1490 g = GenBankScanner() 1491 for record in g.parse_records(StringIO(gbk_example2),do_features=False) : 1492 print record.id, record.name, record.description 1493 print record.seq 1494 1495 g = GenBankScanner() 1496 for record in g.parse_records(StringIO(gbk_example2),do_features=True) : 1497 print record.id, record.name, record.description 1498 print record.seq 1499 1500 print 1501 print "EMBL CDS Iteration" 1502 print "==================" 1503 1504 e = EmblScanner() 1505 for record in e.parse_cds_features(StringIO(embl_example)) : 1506 print record 1507 1508 print 1509 print "EMBL Iteration" 1510 print "==============" 1511 e = EmblScanner() 1512 for record in e.parse_records(StringIO(embl_example),do_features=True) : 1513 print record.id, record.name, record.description 1514 print record.seq 1515