Package Martel
[hide private]
[frames | no frames]

Source Code for Package Martel

  1  # Copyright 2000-2001, Dalke Scientific Software, LLC 
  2  # Distributed under the Biopython License Agreement (see the LICENSE file). 
  3   
  4  """Martel is a 'regular expressions on steroids' parser generator (DEPRECATED). 
  5   
  6  A goal of the Biopython project is to reduce the amount of effort needed to 
  7  do computational biology. A large part of that work turns out to be parsing 
  8  file formats, which led to the development of Martel, a parser generator 
  9  which uses a regular expression as the format description to create a parser 
 10  that returns the parse tree using the SAX API common in XML processing. 
 11   
 12  While intended to be both fast and relatively easy to understand, Martel did 
 13  struggle with some very large records (e.g. GenBank files for whole genomes or 
 14  chromosomes), and in practice debugging the Martel format specifications for 
 15  evolving file formats like GenBank proved non-trivial. 
 16   
 17  Andrew Dalke is no longer maintaining Martel or Bio.Mindy, and these modules 
 18  are now deprecated.  They are no longer used in any of the current Biopython 
 19  parsers, and are likely to be removed in a future release of Biopython. 
 20  """ 
 21  __version__ = "1.50" 
 22   
 23  import warnings 
 24  warnings.warn("Martel and those parts of Biopython depending on it" \ 
 25                +" directly, such as Bio.Mindy, are now deprecated, and will" \ 
 26                +" be removed in a future release of Biopython.  If you want"\ 
 27                +" to continue to use this code, please get in contact with"\ 
 28                +" the Biopython developers via the mailing lists to avoid"\ 
 29                +" its permanent removal from Biopython", \ 
 30                DeprecationWarning) 
 31   
 32  import Expression 
 33  import convert_re 
 34  import string 
 35  from xml.sax import xmlreader 
 36   
 37  # This interface is based off of Greg Ewing's Plex parser. 
 38   
def Str1(s):
    """(s) -> match the literal string 's'"""
    literal = Expression.Str(s)
    return literal
42
def Str(*args):
    """(s1, s2, ...) -> match s1 or s2 or ...

    A single string gives a plain literal match; any other number of
    strings gives an alternation with one literal match per string.
    """
    if len(args) != 1:
        choices = [Str(text) for text in args]
        return Expression.Alt(tuple(choices))
    return Str1(args[0])
48
def Any(s):
    """(s) -> match any one of the characters in s

    A one-character 's' is turned into an Expression.Literal; anything
    else becomes a (non-inverted) Expression.Any.
    """
    if len(s) != 1:
        return Expression.Any(s, 0)
    return Expression.Literal(s)
54
def AnyBut(s):
    """(s) -> match any one character which is not in s"""
    # The second argument is 1 here and 0 in Any(), i.e. it inverts the set.
    inverted = 1
    return Expression.Any(s, inverted)
58 59 ## Untested! 60 ##def AnyChar(): 61 ## """match any character, including newline""" 62 ## return Expression.Re("(?:.|\n)") 63 64
def Seq(*args):
    """exp1, exp2, ... -> match exp1, then exp2, then ..."""
    # Plain strings get passed in here by mistake rather often, so
    # verify that every argument really is an Expression.
    for item in args:
        assert isinstance(item, Expression.Expression), \
               "expecting an Expression, not a %s" % (type(item),)

    sequence = Expression.Seq(args)
    return sequence
74
def Alt(*args):
    """exp1, exp2, ... -> match exp1, else (if that fails) exp2, else ..."""
    # Type check: only Expression instances may be alternatives.
    for item in args:
        assert isinstance(item, Expression.Expression), \
               "expecting an Expression, not a %s" % (type(item),)
    alternatives = Expression.Alt(args)
    return alternatives
82
def Opt(expr):
    """expr -> match 'expr' zero times or once"""
    assert isinstance(expr, Expression.Expression), \
           "expecting an Expression, not a %s" % (type(expr),)
    min_count, max_count = 0, 1
    return Expression.MaxRepeat(expr, min_count, max_count)
88
def Rep(expr):
    """expr -> match 'expr' as often as possible, possibly not at all"""
    assert isinstance(expr, Expression.Expression), \
           "expecting an Expression, not a %s" % (type(expr),)
    # Only the minimum is given; the upper bound is left at
    # Expression.MaxRepeat's own default.
    min_count = 0
    return Expression.MaxRepeat(expr, min_count)
94
def Rep1(expr):
    """expr -> match 'expr' as often as possible, and at least once"""
    assert isinstance(expr, Expression.Expression), \
           "expecting an Expression, not a %s" % (type(expr),)
    # Only the minimum is given; the upper bound is left at
    # Expression.MaxRepeat's own default.
    min_count = 1
    return Expression.MaxRepeat(expr, min_count)


# Further names taken from Plex.  NoCase comes straight from Expression;
# the functions which follow it are not (yet?) implemented.

NoCase = Expression.NoCase

def Case(expr):
    """expr -> not implemented; always raises NotImplementedError"""
    raise NotImplementedError
108
def Bol():
    """not implemented; always raises NotImplementedError"""
    raise NotImplementedError
111
def Eol():
    """not implemented; always raises NotImplementedError"""
    raise NotImplementedError
114
def Empty():
    """not implemented; always raises NotImplementedError"""
    raise NotImplementedError
117
def Eof():
    """not implemented; always raises NotImplementedError"""
    # This function used to have an unreachable "return Expression.AtEnd()"
    # after the raise.  The dead statement was removed; the observable
    # behaviour (always raising) is unchanged.
    raise NotImplementedError


# Extensions which Plex does not offer, but which are useful

AnyEol = Expression.AnyEol

def MaxRepeat(expr, min_count, max_count = Expression.MAXREPEAT):
    """expr, min_count, max_count = 65535 -> match between min- and max_count times

    A max_count of 65535 (that is, Expression.MAXREPEAT) means that
    the number of repeats has no upper limit.
    """
    assert isinstance(expr, Expression.Expression), \
           "expecting an Expression, not a %s" % (type(expr),)
    repeat = Expression.MaxRepeat(expr, min_count, max_count)
    return repeat
136
def RepN(expr, count):
    """expr, count -> match the expression exactly 'count' times

    Convenient for named group repeats, because the name only has to
    be written once instead of once for the min_count and once for the
    max_count field.
    """
    lower = upper = count
    return Expression.MaxRepeat(expr, lower, upper)
144 145
def Group(name, expr, attrs = None):
    """name, expr -> label a successful match of the expression with 'name'

    'attrs' is handed on to Expression.Group unchanged.
    """
    assert isinstance(expr, Expression.Expression), \
           "expecting an Expression, not a %s" % (type(expr),)
    named = Expression.Group(name, expr, attrs)
    return named
151 152
153 -def _fix_newlines(s):
154 # Replace the characters "\r\n", "\r" and "\n" with \R. 155 # Does not affect the substrings '\' + 'n' or '\' + 'n' 156 s = string.replace(s, "\r\n", "\n") 157 s = string.replace(s, "\r", "\n") 158 return string.replace(s, "\n", r"\R")
159 160
def Re(pattern, fix_newlines = 0):
    """pattern -> the expression tree for the regexp pattern string

    If 'fix_newlines' is true, the newline characters in the pattern
    are first rewritten by _fix_newlines().
    """
    source = pattern
    if fix_newlines:
        source = _fix_newlines(source)
    return convert_re.make_expression(source)

# Made available here directly from the Expression module
NullOp = Expression.NullOp
Debug = Expression.Debug

def Assert(expression):
    """expression -> an Expression.Assert built from 'expression'"""
    assertion = Expression.Assert(expression)
    return assertion
172
def AssertNot(expression):
    """expression -> an inverted Expression.Assert built from 'expression'"""
    negated = Expression.Assert(expression, invert = 1)
    return negated
175 176 # helper function
177 -def _group(name, exp, attrs):
178 if name is None: 179 assert not attrs, "Attributes (%s) require a group name" % (attrs,) 180 return exp 181 return Group(name, exp, attrs)
182
def Digits(name = None, attrs = None):
    r"""match one or more decimal digits

    This is the same as (?P<name?attrs>\d+).

    When 'name' is not None the matched text is placed in a group
    with that name, optionally carrying the given group attributes.
    """
    exp = Re(r"\d+")
    return _group(name, exp, attrs)
193
def Integer(name = None, attrs = None):
    """match an integer (digits with an optional leading + or - sign)

    When 'name' is not None the matched text is placed in a group
    with that name, optionally carrying the given group attributes.
    """
    return _group(name, Re(r"[+-]?\d+"), attrs)
203
def Float(name = None, attrs = None):
    """match floating point numbers like 6, 6., -.1, 2.3, +4E-5, ...

    When 'name' is not None the matched text is placed in a group
    with that name, optionally carrying the given group attributes.
    """
    # optional sign, then "digits[.digits]" or ".digits", then an
    # optional exponent
    number = Re(r"[+-]?((\d+(\.\d*)?)|\.\d+)([eE][+-]?[0-9]+)?")
    return _group(name, number, attrs)
213
def Word(name = None, attrs = None):
    r"""match a 'word'

    A 'word' is defined as '\w+', and \w is [a-zA-Z0-9_].

    When 'name' is not None the matched text is placed in a group
    with that name, optionally carrying the given group attributes.

    Put differently, this is shorthand for (?P<name>\w+).
    """
    return _group(name, Re(r"\w+"), attrs)
227
def Spaces(name = None, attrs = None):
    r"""match one or more whitespace characters (except line endings)

    "Spaces" is defined as [\t\v\f ]+, which is *not* the same as
    '\s+'.  It leaves out both '\n' and '\r', which is useful since
    you almost never mean for whitespace to go beyond the end of the
    line.

    If 'name' is not None, the matching text will be put inside of a
    group of the given name.  You can optionally include group
    attributes.
    """
    # Note: the character class deliberately lists neither \n nor \r.
    exp = Re(r"[\t\v\f ]+")
    return _group(name, exp, attrs)
241
def Unprintable(name = None, attrs = None):
    """match an unprintable character (one which is not in string.printable)

    When 'name' is not None the matched text is placed in a group
    with that name, optionally carrying the given group attributes.
    """
    exp = AnyBut(string.printable)
    return _group(name, exp, attrs)
250
def Punctuation(name = None, attrs = None):
    """match a punctuation character (one which is in string.punctuation)

    When 'name' is not None the matched text is placed in a group
    with that name, optionally carrying the given group attributes.
    """
    exp = Any(string.punctuation)
    return _group(name, exp, attrs)
259 260
def ToEol(name = None, attrs = None):
    """match everything up to and including the end of line

    When 'name' is not None the matched text, without the newline, is
    placed in a group with that name, optionally carrying the given
    group attributes.
    """
    if name is not None:
        # The group holds only the text; the line ending follows it.
        line = Group(name, Re(r"[^\R]*"), attrs)
        return line + AnyEol()

    assert not attrs, "Attributes (%s) require a group name" % (attrs,)
    return Re(r"[^\R]*\R")
273
def UntilEol(name = None, attrs = None):
    """match everything up to but not including the end of line

    When 'name' is not None the matched text is placed in a group
    with that name, optionally carrying the given group attributes.
    """
    if name is not None:
        return Group(name, Re(r"[^\R]*"), attrs)

    assert not attrs, "Attributes (%s) require a group name" % (attrs,)
    return Re(r"[^\R]*")
286
def SkipLinesUntil(expr):
    """read and ignore lines up to, but excluding, the line matching expr"""
    # One skipped line: "not expr here", then the rest of the line.
    skipped_line = AssertNot(expr) + ToEol()
    return Rep(skipped_line)
290
def SkipLinesTo(expr):
    """read and ignore lines up to and including, the line matching expr"""
    # First the lines which do not match ...
    skipped_line = AssertNot(expr) + ToEol()
    leading = Rep(skipped_line)
    # ... then the matching line itself, through to its end of line.
    return leading + expr + ToEol()
294 295
def ToSep(name = None, sep = None, attrs = None):
    r"""match all characters up to the given separator(s)

    This is useful for parsing space, tab, colon, or other character
    delimited fields.  There is no default separator character.

    The field is made of any characters which are not in 'sep' and
    are not "\r" or "\n".  It must be followed by the literal string
    'sep', which is also consumed.

    If 'name' is not None, the matching text, except for the separator,
    will be put inside a group of the given name.  You can optionally
    include group attributes.

    Neither "\r" nor "\n" may be used as a separator: doing so fails
    an assertion.  Leaving out 'sep' raises TypeError.
    """
    if sep is None:
        # I found it was too easy to make a mistake with a default
        raise TypeError("Must specify a seperator (the 'sep' parameter)")

    # The message used to be built from an undefined name ('seperator'),
    # so a bad separator raised NameError instead of AssertionError.
    assert "\r" not in sep and "\n" not in sep, \
           "cannot use %s as a seperator" % (repr(sep),)

    exp = Rep(AnyBut(sep + "\r\n"))
    # NOTE(review): the field stops at any *one* character of 'sep', but
    # Str(sep) then needs the *whole* string; DelimitedFields uses
    # Any(sep) instead.  Left unchanged -- confirm intent for
    # multi-character separators.
    return _group(name, exp, attrs) + Str(sep)
318
def UntilSep(name = None, sep = None, attrs = None):
    r"""match all characters up to the given separator(s)

    Handy for space, tab, colon, or other character delimited fields.
    There is no default separator.

    When 'name' is not None the matched text, without the separator,
    is placed in a group with that name, optionally carrying the given
    group attributes.  The separator character is not consumed.

    Neither "\r" nor "\n" may be used as a separator.
    """
    if sep is None:
        # A default separator turned out to be too error prone
        raise TypeError("Must specify a seperator (the 'sep' parameter)")

    assert not ("\r" in sep or "\n" in sep), \
           "cannot use %s as a seperator" % (repr(sep),)

    stop_chars = sep + "\r\n"
    field = Rep(AnyBut(stop_chars))
    return _group(name, field, attrs)
341 342
def DelimitedFields(name = None, sep = None, attrs = None):
    r"""match 0 or more fields separated by the given separator(s)

    Handy for space, tab, colon, or other character delimited fields.
    There is no default separator.

    When 'name' is not None each piece of delimited text, without the
    separator, is placed in its own group with that name, optionally
    carrying the given group attributes.  The separator character is
    consumed, but is not available through a group.

    Neither "\r" nor "\n" may be used as a separator.
    The line as a whole is not included in a group.
    """
    if sep is None:
        # A default separator turned out to be too error prone
        raise TypeError("Must specify a sep (via the 'sep' parameter)")

    assert not ("\r" in sep or "\n" in sep), \
           "cannot use %s as a seperator" % (repr(sep),)

    # A line is one field, then any number of "separator + field"
    # pairs, then the end of line.
    field = _group(name, Rep(AnyBut(sep + "\r\n")), attrs)
    more_fields = Rep(Any(sep) + field)
    return field + more_fields + AnyEol()

# Allows some optimizations
FastFeature = Expression.FastFeature


# For building parsers which read one record at a time
ParseRecords = Expression.ParseRecords
HeaderFooter = Expression.HeaderFooter

# Use this to prune out group names you aren't
# interested in seeing, which reduces the number of method
# calls back to the parser.
def select_names(expression, names):
    """expression, names -> a pruned copy of 'expression'

    The given expression is not modified; all the work is done on a
    copy of it.
    """
    # Work on a copy, so that no subexpressions are shared with other
    # expressions.
    pruned = expression.copy()

    # Deliberate use of the internal method.
    pruned._select_names(names)

    # Finally drop the groups which are left without a name.
    import optimize
    return optimize.optimize_unnamed_groups(pruned)
390
def replace_groups(expr, replacements):
    """expr, [(tagname, new_expr), ...] -> copy of 'expr' with groups replaced

    In a copy of 'expr', every group found by _find_groups(tagname)
    gets 'new_expr' as its expression.  The replacements are applied
    in the order given.  'expr' itself is not modified.
    """
    result = expr.copy()
    for tag, new_expr in replacements:
        for group in result._find_groups(tag):
            group.expression = new_expr
    return result
398
def SimpleRecordFilter(expr, make_reader, reader_args = ()):
    """expr, make_reader, reader_args -> a ParseRecords for filtering records

    Each record is 'expr' followed by any number of further lines, in
    a group named "record".  The result is a ParseRecords named
    "dataset" with the attribute format="*filter*".
    """
    dataset_attrs = {"format": "*filter*"}
    record = Group("record", expr + Rep(ToEol()))
    return ParseRecords("dataset", dataset_attrs, record,
                        make_reader, reader_args)
403