Martel-based parser to read GenBank formatted files.
This is a huge regular expression for GenBank, built using the
'regular expressions on steroids' capabilities of Martel.
o GenBank/EMBL feature tables are described at:
http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
o There are also descriptions of different GenBank lines at:
http://www.ibc.wustl.edu/standards/gbrel.txt
|
INDENT = 12
|
|
FEATURE_KEY_INDENT = 5
|
|
FEATURE_QUALIFIER_INDENT = 21
|
|
blank_space = Martel.Spaces()
|
|
small_indent_space = Martel.Str(" "* 2)
|
|
big_indent_space = Martel.Str(" "* FEATURE_KEY_INDENT)
|
|
qualifier_space = Martel.Str(" "* FEATURE_QUALIFIER_INDENT) | ...
|
|
locus = Martel.Group("locus", Martel.Re(r"[\w\-]+"))
|
|
size = Martel.Group("size", Martel.Rep1(Martel.Integer()))
|
|
valid_residue_prefixes = ['ss-', 'ds-', 'ms-']
|
|
valid_residue_types = ['DNA', 'RNA', 'mRNA', 'tRNA', 'rRNA', '...
|
|
residue_prefixes = map(Martel.Str, valid_residue_prefixes)
|
|
residue_types = map(Martel.Str, valid_residue_types)
|
|
residue_type = Martel.Group("residue_type", Martel.Opt(Martel....
|
|
date = Martel.Group("date", Martel.Re("[-\w]+"))
|
|
valid_divisions = ['PRI', 'ROD', 'MAM', 'VRT', 'INV', 'PLN', '...
|
|
divisions = map(Martel.Str, valid_divisions)
|
|
data_file_division = Martel.Group("data_file_division", Martel...
|
|
locus_line = Martel.Group("locus_line", Martel.Str("LOCUS")+ b...
|
|
definition_block = define_block("DEFINITION", "definition_bloc...
|
|
accession = Martel.Group("accession", Martel.Re("[\w]+"))
|
|
region = Martel.Group("region", Martel.Re("[\d]+..[\d]+"))
|
|
accession_block = Martel.Group("accession_block", Martel.Str("...
|
|
nid = Martel.Group("nid", Martel.Re("[\w\d]+"))
|
|
nid_line = Martel.Group("nid_line", Martel.Str("NID")+ blank_s...
|
|
pid = Martel.Group("pid", Martel.Re("[\w\d]+"))
|
|
pid_line = Martel.Group("pid_line", Martel.Str("PID")+ blank_s...
|
|
version = Martel.Group("version", Std.dbid(Martel.Re("[\w\d\.]...
|
|
gi = Martel.Group("gi", Std.dbid(Martel.Re("[\d]+"), {"type": ...
|
|
version_line = Martel.Group("version_line", Martel.Str("VERSIO...
|
|
db_source_block = define_block("DBSOURCE", "db_source_block", ...
|
|
keywords_block = define_block("KEYWORDS", "keywords_block", "k...
|
|
segment = Martel.Group("segment", Martel.Integer("segment_num"...
|
|
segment_line = Martel.Group("segment_line", Martel.Str("SEGMEN...
|
|
source_block = define_block("SOURCE", "source_block", "source")
|
|
organism = Martel.Group("organism", Martel.ToEol())
|
|
taxonomy = Martel.Group("taxonomy", Martel.Rep1(blank_space+ M...
|
|
organism_block = Martel.Group("organism_block", Martel.Str(" ...
|
|
reference_num = Martel.Group("reference_num", Martel.Re("[\d]+"))
|
|
reference_bases = Martel.Group("reference_bases", Martel.Str("...
|
|
reference_line = Martel.Group("reference_line", Martel.Str("RE...
|
|
authors_block = define_block(" AUTHORS", "authors_block", "au...
|
|
consrtm_block = define_block(" CONSRTM", "consrtm_block", "co...
|
|
title_block = define_block(" TITLE", "title_block", "title")
|
|
journal_block = define_block(" JOURNAL", "journal_block", "jo...
|
|
medline_line = Martel.Group("medline_line", Martel.Str(" MEDL...
|
|
pubmed_line = Martel.Group("pubmed_line", Martel.Str(" PUBME...
|
|
remark_block = define_block(" REMARK", "remark_block", "remark")
|
|
reference = Martel.Group("reference", reference_line+ Martel.O...
|
|
comment_block = define_block("COMMENT", "comment_block", "comm...
|
|
primary_line = Martel.Group("primary_line", Martel.Str("PRIMAR...
|
|
primary_ref_line = Martel.Group("primary_ref_line", blank_spac...
|
|
primary = Martel.Group("primary", primary_line+ Martel.Rep1(pr...
|
|
features_line = Martel.Group("features_line", Martel.Str("FEAT...
|
|
feature_key = Martel.Group("feature_key", Martel.Re("[\w'-]+"))
location = Martel.Group("location",...
|
|
location = Martel.Group("location", Std.feature_location(Marte...
|
|
feature_key_line = Martel.Group("feature_key_line", big_indent...
|
|
quote = Martel.Str('"')
|
|
quoted_chars = Std.feature_qualifier_description(Martel.Re(r'(...
|
|
quoted_string = quote+ quoted_chars+ Martel.Rep(Martel.AnyEol(...
|
|
unquoted_string = Martel.AssertNot(quote)+ Std.feature_qualifi...
|
|
qualifier = Std.feature_qualifier(qualifier_space+ Martel.Str(...
|
|
feature = Std.feature(feature_key_line+ Martel.Rep(qualifier))
|
|
feature_block = Std.feature_block(Martel.Rep1(feature), {"loca...
|
|
base_count = Martel.Group("base_count", Martel.Re("[\w\d ]+"))
|
|
base_count_line = Martel.Group("base_count_line", Martel.Str("...
|
|
origin_line = Martel.Group("origin_line", Martel.Str("ORIGIN")...
|
|
base_number = Martel.Group("base_number", Martel.Re("[\d]+"))
|
|
sequence = Std.sequence(Martel.Group("sequence", Martel.Re("[\...
|
|
sequence_plus_spaces = Martel.Group("sequence_plus_spaces", Ma...
|
|
sequence_line = Martel.Group("sequence_line", blank_space+ Mar...
|
|
sequence_entry = Std.sequence_block(Martel.Group("sequence_ent...
|
|
contig_location = Martel.Group("contig_location", Martel.ToEol...
|
|
contig_block = Martel.Group("contig_block", Martel.Str("CONTIG...
|
|
record_end = Martel.Group("record_end", Martel.Str("//")+ Mart...
|
|
record = Std.record(Martel.Group("genbank_record", locus_line+...
|
|
header = Martel.Re("...
|
|
ncbi_format = Martel.HeaderFooter("genbank", {"format": "ncbi_...
|
|
format = Martel.ParseRecords("genbank", {"format": "genbank"},...
|
|
__warningregistry__ = { ( ' Bio.expressions was deprecated, as it ...
|