Package Bio :: Package expressions :: Module genbank
[hide private]
[frames] | no frames]

Source Code for Module Bio.expressions.genbank

  1  """Martel based parser to read GenBank formatted files. 
  2   
  3  This is a huge regular expression for GenBank, built using 
  4  the 'regular expressions on steroids' capabilities of Martel. 
  5   
  6  Documentation for GenBank format that I found: 
  7   
  8  o GenBank/EMBL feature tables are described at: 
  9  http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html 
 10   
 11  o There are also descriptions of different GenBank lines at: 
 12  http://www.ibc.wustl.edu/standards/gbrel.txt 
 13  """ 
 14   
 15  import warnings 
 16  warnings.warn("Bio.expressions was deprecated, as it does not work with recent versions of mxTextTools. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
 17   
 18   
 19  # Martel 
 20  import Martel 
 21  from Martel import RecordReader 
 22   
 23  # identify certain items as important for format converters 
 24  from Bio import Std 
 25   
 26  # --- first set up some helper constants and functions 
 27   
 28  # - useful constants for dealing with the blank space in GenBank documents 
 29  # this is useful since blank space can be significant in GenBank flat files. 
 30  INDENT = 12 
 31  FEATURE_KEY_INDENT = 5 
 32  FEATURE_QUALIFIER_INDENT = 21 
 33   
 34  blank_space = Martel.Spaces() 
 35  small_indent_space = Martel.Str(" " * 2) 
 36  big_indent_space = Martel.Str(" " * FEATURE_KEY_INDENT) 
 37  qualifier_space = Martel.Str(" " * FEATURE_QUALIFIER_INDENT) | \ 
 38                    Martel.Str("\t" + " " * (FEATURE_QUALIFIER_INDENT - 8)) 
 39   
 40  # - useful functions 
def define_block(identifier, block_tag, block_data, std_block_tag=None,
                 std_tag=None):
    """Define a Martel grouping which can parse a block of text.

    Many of the GenBank lines we'll want to process are grouped into
    a block like:

    IDENTIFIER Blah blah blah

    Where blah blah blah can wrap for multiple lines. The expression built
    here is: the identifier, any number of single spaces, the tagged text
    up to the end of the line, and then any number of following lines that
    are either a bare end-of-line or INDENT spaces followed by more tagged
    text.

    Arguments:
    o identifier - The identifier that begins the block (like DEFINITION).
    It must be shorter than INDENT characters.
    o block_tag - A callback tag for the entire block (the name of the
    Martel.Group wrapped around everything).
    o block_data - A callback tag for the data in the block (ie. the
    stuff you are interested in); passed to Martel.UntilEol for every line.
    o std_block_tag - A Bio.Std Martel tag used to register the entire
    block as being a "standard" type of information. Optional.
    o std_tag - A Bio.Std Martel tag used to register just the information
    in the block as being "standard". Optional.

    Returns the Martel expression for the whole block, wrapped by
    std_block_tag when one is given.
    """
    # Sanity check on the grammar definition itself: the identifier has to
    # be shorter than the INDENT-wide keyword column.
    diff = INDENT - len(identifier)
    assert diff > 0, diff

    # if no std_tag info is defined, just make std_tag a no-op function
    if std_tag is None:
        def std_tag(martel_info):
            return martel_info

    first_line = (Martel.Str(identifier) +
                  Martel.Rep(Martel.Str(" ")) +
                  std_tag(Martel.UntilEol(block_data)) +
                  Martel.AnyEol())
    continuation_line = (Martel.Str(" " * INDENT) +
                         std_tag(Martel.UntilEol(block_data)) +
                         Martel.AnyEol())
    block_info = Martel.Group(
        block_tag,
        first_line +
        Martel.Rep(Martel.Alt(Martel.AnyEol(), continuation_line)))

    # tag the info as some standard Martel element if specified
    if std_block_tag is not None:
        block_info = std_block_tag(block_info)

    return block_info


# first line
# LOCUS AC007323 86436 bp DNA PLN 19-JAN-2000
locus = Martel.Group("locus", Martel.Re(r"[\w\-]+"))
size = Martel.Group("size", Martel.Rep1(Martel.Integer()))

# deal with the different kinds of residues we can have
valid_residue_prefixes = ["ss-", "ds-", "ms-"]
valid_residue_types = ["DNA", "RNA", "mRNA", "tRNA", "rRNA", "uRNA",
                       "scRNA", "snRNA", "snoRNA", "PROTEIN"]

residue_prefixes = map(Martel.Str, valid_residue_prefixes)
residue_types = map(Martel.Str, valid_residue_types)

# every part is optional: strand prefix, molecule type, and topology
residue_type = Martel.Group(
    "residue_type",
    Martel.Opt(Martel.Alt(*residue_prefixes)) +
    Martel.Opt(Martel.Alt(*residue_types)) +
    Martel.Opt(Martel.Opt(blank_space) +
               Martel.Alt(Martel.Str("circular"),
                          Martel.Str("linear"))))

date = Martel.Group("date", Martel.Re(r"[-\w]+"))

# the PLN, etc stuff indicates data file divisions
valid_divisions = ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", "RNA",
                   "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", "GSS",
                   "HTG", "HTC", "CON", "ENV"]
divisions = map(Martel.Str, valid_divisions)
data_file_division = Martel.Group("data_file_division",
                                  Martel.Alt(*divisions))

locus_line = Martel.Group(
    "locus_line",
    Martel.Str("LOCUS") +
    blank_space +
    locus +
    blank_space +
    size +
    blank_space +
    Martel.Re("bp|aa") +
    blank_space +
    Martel.Opt(residue_type +
               blank_space) +
    data_file_division +
    blank_space +
    date +
    Martel.AnyEol())

# definition line
# DEFINITION Genomic sequence for Arabidopsis thaliana BAC T25K16 from
# chromosome I, complete sequence.
definition_block = define_block("DEFINITION", "definition_block",
                                "definition", Std.description_block,
                                Std.description)

# accession line
# ACCESSION AC007323
# or
# ACCESSION NC_004353 REGION: 1..1281640
accession = Martel.Group("accession", Martel.Re(r"[\w]+"))

# The two dots are escaped so that only a literal ".." separates the two
# coordinates; an unescaped "." is a wildcard and would accept any two
# characters (or digits) in that position.
region = Martel.Group("region", Martel.Re(r"[\d]+\.\.[\d]+"))

accession_block = Martel.Group(
    "accession_block",
    Martel.Str("ACCESSION") +
    Martel.Rep1(blank_space +
                Martel.Rep1(accession +
                            Martel.Opt(
                                Martel.Opt(Martel.Str(" ")) +
                                Martel.Str("REGION:") +
                                Martel.Opt(Martel.Str(" ")) +
                                region) +
                            Martel.Opt(Martel.Str(" "))) +
                Martel.AnyEol()))


# accession_block = define_block("ACCESSION", "accession_block", "accession")

# NID g44010
nid = Martel.Group("nid", Martel.Re(r"[\w\d]+"))
nid_line = Martel.Group(
    "nid_line",
    Martel.Str("NID") +
    blank_space +
    nid +
    Martel.AnyEol())

# PID g6754304
pid = Martel.Group("pid", Martel.Re(r"[\w\d]+"))
pid_line = Martel.Group(
    "pid_line",
    Martel.Str("PID") +
    blank_space +
    pid +
    Martel.AnyEol())

# version and GI line
# VERSION AC007323.5 GI:6587720
version = Martel.Group(
    "version",
    Std.dbid(Martel.Re(r"[\w\d\.]+"),
             {"type": "primary", "dbname": "genbank"}))

gi = Martel.Group(
    "gi",
    Std.dbid(Martel.Re(r"[\d]+"),
             {"type": "secondary", "dbname": "genbank"}))

# the GI part of the line is optional
version_line = Martel.Group(
    "version_line",
    Martel.Str("VERSION") +
    blank_space +
    version +
    Martel.Opt(blank_space +
               Martel.Str("GI:") +
               gi) +
    Martel.AnyEol())

# DBSOURCE REFSEQ: accession NM_010510.1
db_source_block = define_block("DBSOURCE", "db_source_block", "db_source")

# keywords line
# KEYWORDS antifreeze protein homology; cold-regulated gene; cor6.6 gene;
# KIN1 homology.
keywords_block = define_block("KEYWORDS", "keywords_block", "keywords")

# NOTE(review): the keyword literals in this section are whitespace
# sensitive ("SEGMENT ", " ORGANISM", " AUTHORS", " MEDLINE ", ...);
# confirm their leading/trailing blanks against real GenBank records.

# SEGMENT 1 of 6
segment = Martel.Group(
    "segment",
    Martel.Integer("segment_num")
    + Martel.Str(" of ")
    + Martel.Integer("segment_total"))
segment_line = Martel.Group(
    "segment_line",
    Martel.Str("SEGMENT ")
    + segment
    + Martel.AnyEol())

# SOURCE thale cress.
source_block = define_block("SOURCE", "source_block", "source")

# ORGANISM Arabidopsis thaliana
# Eukaryota; Viridiplantae; Embryophyta; Tracheophyta; Spermatophyta;
# Magnoliophyta; eudicotyledons; core eudicots; Rosidae; eurosids II;
# Brassicales; Brassicaceae; Arabidopsis.
organism = Martel.Group("organism", Martel.ToEol())

# one or more blank-space-led lines following the organism name
taxonomy = Martel.Group(
    "taxonomy",
    Martel.Rep1(blank_space + Martel.ToEol()))

organism_block = Martel.Group(
    "organism_block",
    Martel.Str(" ORGANISM")
    + blank_space
    + organism
    + taxonomy)

# REFERENCE 1 (bases 1 to 86436)
# AUTHORS Thomashow,M.F.
# TITLE Direct Submission
# JOURNAL Submitted (01-FEB-1991) M.F. Thomashow, Dept. Crop and Soil
# Sciences, Dept. Microbiology, Michigan State University, East
# Lansing, Michigan 48824, USA
reference_num = Martel.Group("reference_num", Martel.Re(r"[\d]+"))

# can have normal references, like that shown above, or references like:
# REFERENCE 1 (sites)
# with no base information or even:
# REFERENCE 2 (bases 1 to 105654; 110423 to 111122)
reference_bases = Martel.Group(
    "reference_bases",
    Martel.Str("(")
    + Martel.Re(r"[;\w\d \R]+")
    + Martel.Str(")"))
reference_line = Martel.Group(
    "reference_line",
    Martel.Str("REFERENCE")
    + blank_space
    + reference_num
    + Martel.Opt(blank_space + reference_bases)
    + Martel.AnyEol())

authors_block = define_block(" AUTHORS", "authors_block", "authors")
consrtm_block = define_block(" CONSRTM", "consrtm_block", "consrtm")
title_block = define_block(" TITLE", "title_block", "title")
journal_block = define_block(" JOURNAL", "journal_block", "journal")

# MEDLINE 92119220
medline_line = Martel.Group(
    "medline_line",
    Martel.Str(" MEDLINE ")
    + Martel.Integer("medline_id")
    + Martel.AnyEol())

# PUBMED 10617197
pubmed_line = Martel.Group(
    "pubmed_line",
    Martel.Str(" PUBMED ")
    + Martel.Integer("pubmed_id")
    + Martel.AnyEol())

# REMARK This sequence is of BAC F10O3 from Arabidopsis thaliana chromosome
remark_block = define_block(" REMARK", "remark_block", "remark")

# an entire reference for the sequence; only the reference line itself and
# the journal block are mandatory
reference = Martel.Group(
    "reference",
    reference_line
    + Martel.Opt(authors_block)
    + Martel.Opt(consrtm_block)
    + Martel.Opt(title_block)
    + journal_block
    + Martel.Opt(medline_line)
    + Martel.Opt(pubmed_line)
    + Martel.Opt(remark_block))

# COMMENT On Dec 16, 1999 this sequence version replaced gi:5729683.
comment_block = define_block("COMMENT", "comment_block", "comment")

# PRIMARY
# column-header line of the PRIMARY table
primary_line = Martel.Group(
    "primary_line",
    Martel.Str("PRIMARY")
    + blank_space
    + Martel.Str("TPA_SPAN")
    + blank_space
    + Martel.Str("PRIMARY_IDENTIFIER")
    + blank_space
    + Martel.Str("PRIMARY_SPAN")
    + blank_space
    + Martel.Str("COMP")
    + Martel.ToEol())

# one row of the table: span, identifier, span, and an optional "c"
primary_ref_line = Martel.Group(
    "primary_ref_line",
    blank_space
    + Martel.Re(r"\d+\-\d+")
    + blank_space
    + Martel.Re(r"[\S]+")
    + blank_space
    + Martel.Re(r"\d+\-\d+")
    + Martel.Opt(blank_space + Martel.Str("c"))
    + Martel.ToEol())

primary = Martel.Group(
    "primary",
    primary_line + Martel.Rep1(primary_ref_line))

# start on the feature table. Eeek -- This is the part I was afraid of
# most!

# the header, so that we know we are heading into some features
# FEATURES Location/Qualifiers
features_line = Martel.Group(
    "features_line",
    Martel.Str("FEATURES")
    + blank_space
    + Martel.Str("Location/Qualifiers")
    + Martel.AnyEol())

# feature key names are basically words, but sometimes have additional
# characters ("-10_signal", "3'UTR"...)
feature_key = Martel.Group("feature_key", Martel.Re(r"[\w'-]+"))

# Alternative ToEol-based definition of location (disabled):
#
# location = Martel.Group("location",
#                         Martel.ToEol("feature_location") + \
#                         Martel.Rep(qualifier_space + \
#                                    Martel.Re("(?!/)") + \
#                                    Martel.ToEol("feature_location")))

# A location is the remainder of the feature key line plus any following
# qualifier-indented lines that do not begin with "/".
location = Martel.Group(
    "location",
    Std.feature_location(Martel.UntilEol())
    + Martel.AnyEol()
    + Martel.Rep(
        qualifier_space
        + Martel.AssertNot(Martel.Str("/"))
        + Std.feature_location(Martel.UntilEol())
        + Martel.AnyEol()))

feature_key_line = Martel.Group(
    "feature_key_line",
    big_indent_space
    + Std.feature_name(feature_key)
    + location)

# qualifiers escape quotes using double quotes
quote = Martel.Str('"')
quoted_chars = Std.feature_qualifier_description(Martel.Re(r'([^"\R]|"")*'))

# a quoted value may continue over several qualifier-indented lines
quoted_string = (
    quote
    + quoted_chars
    + Martel.Rep(Martel.AnyEol() + qualifier_space + quoted_chars)
    + quote
    + Martel.AnyEol())

unquoted_string = (
    Martel.AssertNot(quote)
    + Std.feature_qualifier_description(Martel.UntilEol())
    + Martel.AnyEol())

qualifier = Std.feature_qualifier(
    qualifier_space
    + Martel.Str("/")
    + Std.feature_qualifier_name(Martel.Word("feature_qualifier_name"))
    + (
        Martel.AnyEol()  # '/pseudo'
        | (
            Martel.Str("=")
            + Martel.Group(
                "feature_qualifier_description",
                (
                    unquoted_string  # '/evidence=experimental'
                    | quoted_string  # '/translation="AAAAAAAA....
                                     #  AAAAAAAAAAAAAAAAAAAA'
                )
            )
        )
    )
)

feature = Std.feature(feature_key_line + Martel.Rep(qualifier))

feature_block = Std.feature_block(
    Martel.Rep1(feature),
    {"location-style": "genbank"})

# BASE COUNT 28300 a 15069 c 15360 g 27707 t
base_count = Martel.Group("base_count", Martel.Re(r"[\w\d ]+"))
base_count_line = Martel.Group(
    "base_count_line",
    Martel.Str("BASE COUNT")
    + blank_space
    + base_count
    + Martel.AnyEol())

# ORIGIN
# 1 ggacaaggcc aaggatgctg ctgctgcagc tggagcttcc gcgcaacaag taaacagata
origin_line = Martel.Group(
    "origin_line",
    Martel.Str("ORIGIN")
    + (Martel.ToEol("origin_name") | Martel.AnyEol()))

base_number = Martel.Group("base_number", Martel.Re(r"[\d]+"))
sequence = Std.sequence(Martel.Group("sequence", Martel.Re(r"[\w]+")))
sequence_plus_spaces = Martel.Group(
    "sequence_plus_spaces",
    Martel.Rep1(Martel.Str(" ") + Martel.Opt(sequence))
    + Martel.Opt(Martel.Str(" ")))
sequence_line = Martel.Group(
    "sequence_line",
    blank_space
    + Martel.Opt(base_number)
    + sequence_plus_spaces
    + Martel.AnyEol())

sequence_entry = Std.sequence_block(
    Martel.Group(
        "sequence_entry",
        origin_line + Martel.Rep1(sequence_line)))

# CONTIG
# this is the contig information for RefSeq records

# first line, then any INDENT-indented lines that do not start with "/"
contig_location = Martel.Group(
    "contig_location",
    Martel.ToEol("feature_location")
    + Martel.Rep(
        Martel.Str(" " * INDENT)
        + Martel.Re("(?!/)")
        + Martel.ToEol("feature_location")))

contig_block = Martel.Group(
    "contig_block",
    Martel.Str("CONTIG")
    + blank_space
    + contig_location)

# all done!
# //
record_end = Martel.Group(
    "record_end",
    Martel.Str("//") + Martel.Rep1(Martel.AnyEol()))

# A full record: the sections in the order they are chained here, ending
# with either (optional base count + sequence) or a contig block, and then
# the "//" terminator.
record = Std.record(
    Martel.Group(
        "genbank_record",
        locus_line
        + definition_block
        + accession_block
        + Martel.Opt(nid_line)
        + Martel.Opt(pid_line)
        + Martel.Opt(version_line)
        + Martel.Opt(db_source_block)
        + keywords_block
        + Martel.Opt(segment_line)
        + source_block
        + organism_block
        + Martel.Rep(reference)
        + Martel.Opt(primary)
        + Martel.Opt(comment_block)
        + features_line
        + feature_block
        + Martel.Alt(Martel.Opt(base_count_line) + sequence_entry,
                     contig_block)
        + record_end))

# if you download a big mess of GenBank files, it'll have a header
# in that case you should be using 'ncbi_format' instead of the standard
# 'format'
# NOTE(review): blank space and line breaks inside this pattern are
# significant; confirm the exact layout against a real NCBI release header.
header = Martel.Re("""\
(?P<filename>[^ ]+) +Genetic Sequence Data Bank
 *(?P<release_day>\d+) (?P<release_month>\w+) (?P<release_year>\d+)

 *(?P<data_bank_name>[^\R]+)

 *(?P<data_bank_name>[^\R]+)

 *(?P<num_loci>\d+) loci, *(?P<num_bases>\d+) bases, from *(?P<num_reports>\d+) reported sequences


""")

# header read with CountLines (10,), records read with EndsWith ("//",),
# no footer
ncbi_format = Martel.HeaderFooter(
    "genbank", {"format": "ncbi_genbank"},
    header, RecordReader.CountLines, (10,),
    record, RecordReader.EndsWith, ("//",),
    None, None, None)

# plain files: no header, each record is found by its leading LOCUS keyword
format = Martel.ParseRecords(
    "genbank", {"format": "genbank"},
    record, RecordReader.StartsWith, ("LOCUS ",))