1
2
3
4 """Martel is a 'regular expressions on steroids' parser generator (DEPRECATED).
5
6 A goal of the Biopython project is to reduce the amount of effort needed to
7 do computational biology. A large part of that work turns out to be parsing
8 file formats, which led to the development of Martel, a parser generator
9 which uses a regular expression as the format description to create a parser
10 that returns the parse tree using the SAX API common in XML processing.
11
12 While intended to be both fast and relatively easy to understand, Martel did
13 struggle with some very large records (e.g. GenBank files for whole genomes or
14 chromosomes), and in practice debugging the Martel format specifications for
15 evolving file formats like GenBank proved non-trivial.
16
17 Andrew Dalke is no longer maintaining Martel or Bio.Mindy, and these modules
18 are now deprecated. They are no longer used in any of the current Biopython
19 parsers, and are likely to be removed in a future release of Biopython.
20 """
21 __version__ = "1.50"
22
23 import warnings
# Issued at import time: importing this package is enough to trigger the
# deprecation notice.  The message is assembled from adjacent string
# literals, which the compiler joins into a single string.
warnings.warn(
    "Martel and those parts of Biopython depending on it"
    " directly, such as Bio.Mindy, are now deprecated, and will"
    " be removed in a future release of Biopython. If you want"
    " to continue to use this code, please get in contact with"
    " the Biopython developers via the mailing lists to avoid"
    " its permanent removal from Biopython",
    DeprecationWarning)
31
32 import Expression
33 import convert_re
34 import string
35 from xml.sax import xmlreader
36
37
38
42
44 """(s1, s2, ...) -> match s1 or s2 or ..."""
45 if len(args) == 1:
46 return Str1(args[0])
47 return Expression.Alt(tuple(map(Str, args)))
48
54
56 """s -> match any character not in s"""
57 return Expression.Any(s, 1)
58
59
60
61
62
63
64
66 """exp1, exp2, ... -> match exp1 followed by exp2 followed by ..."""
67
68
69 for arg in args:
70 assert isinstance(arg, Expression.Expression), \
71 "expecting an Expression, not a %s" % type(arg)
72
73 return Expression.Seq(args)
74
76 """exp1, exp2, ... -> match exp1 or (if that fails) match exp2 or ..."""
77
78 for arg in args:
79 assert isinstance(arg, Expression.Expression), \
80 "expecting an Expression, not a %s" % type(arg)
81 return Expression.Alt(args)
82
88
94
96 """expr -> match 'expr' as many times as possible, but at least once"""
97 assert isinstance(expr, Expression.Expression), \
98 "expecting an Expression, not a %s" % type(expr)
99 return Expression.MaxRepeat(expr, 1)
100
101
102
103
104 NoCase = Expression.NoCase
105
107 raise NotImplementedError
108
110 raise NotImplementedError
111
113 raise NotImplementedError
114
116 raise NotImplementedError
117
121
122
123
124
125 AnyEol = Expression.AnyEol
126
128 """expr, min_count, max_count = 65535 -> match between min- and max_count times
129
130 If max_count == 65535 (which is Expression.MAXREPEAT) then there
131 is no upper limit.
132 """
133 assert isinstance(expr, Expression.Expression), \
134 "expecting an Expression, not a %s" % type(expr)
135 return Expression.MaxRepeat(expr, min_count, max_count)
136
def RepN(expr, count):
    """expr, count -> match 'expr' exactly 'count' times

    A convenience for fixed-size repeats: the same 'count' is used as
    both the minimum and the maximum repeat count, so a named group
    repeat does not have to be spelled out twice.
    """
    lower = upper = count
    return Expression.MaxRepeat(expr, lower, upper)
144
145
146 -def Group(name, expr, attrs = None):
151
152
154
155
156 s = string.replace(s, "\r\n", "\n")
157 s = string.replace(s, "\r", "\n")
158 return string.replace(s, "\n", r"\R")
159
160
161 -def Re(pattern, fix_newlines = 0):
166
167 NullOp = Expression.NullOp
168 Debug = Expression.Debug
169
172
175
176
182
def Digits(name = None, attrs = None):
    r"""match a run of one or more decimal digits

    Equivalent to the pattern (?P<name?attrs>\d+).

    'name' and 'attrs' are handed to _group() together with the digit
    expression; when 'name' is not None the matched text is placed in a
    group of that name, optionally carrying the given attributes.
    """
    digits = Re(r"\d+")
    return _group(name, digits, attrs)
193
def Integer(name = None, attrs = None):
    """match an integer: decimal digits with an optional leading + or -

    'name' and 'attrs' are handed to _group() together with the integer
    expression; when 'name' is not None the matched text is placed in a
    group of that name, optionally carrying the given attributes.
    """
    return _group(name, Re(r"[+-]?\d+"), attrs)
203
def Float(name = None, attrs = None):
    """match a floating point number such as 6, 6., -.1, 2.3 or +4E-5

    The pattern accepts an optional sign, then either digits with an
    optional fractional part or a bare fractional part, then an optional
    exponent.

    'name' and 'attrs' are handed to _group() together with the float
    expression; when 'name' is not None the matched text is placed in a
    group of that name, optionally carrying the given attributes.
    """
    return _group(
        name,
        Re(r"[+-]?((\d+(\.\d*)?)|\.\d+)([eE][+-]?[0-9]+)?"),
        attrs)
213
def Word(name = None, attrs = None):
    r"""match a 'word', i.e. the pattern \w+

    Here \w stands for [a-zA-Z0-9_], so this is shorthand for writing
    (?P<name>\w+).

    'name' and 'attrs' are handed to _group() together with the word
    expression; when 'name' is not None the matched text is placed in a
    group of that name, optionally carrying the given attributes.
    """
    return _group(name, Re(r"\w+"), attrs)
227
def Spaces(name = None, attrs = None):
    r"""match one or more whitespace characters, never a line ending

    The pattern used is [\t\v\f ]+ (tab, vertical tab, form feed and
    space).  This is deliberately *not* the same as \s+: neither "\n"
    nor "\r" is in the set, so a run of spaces cannot extend past the
    end of a line.

    'name' and 'attrs' are handed to _group() together with the
    whitespace expression; when 'name' is not None the matched text is
    placed in a group of that name, optionally carrying the given
    attributes.
    """
    return _group(name, Re(r"[\t\v\f ]+"), attrs)
241
243 """match an unprintable character (characters not in string.printable)
244
245 If 'name' is not None, the matching text will be put inside of a
246 group of the given name. You can optionally include group
247 attributes.
248 """
249 return _group(name, AnyBut(string.printable), attrs)
250
252 """match a punctuation character (characters in string.punctuation)
253
254 If 'name' is not None, the matching text will be put inside of a
255 group of the given name. You can optionally include group
256 attributes.
257 """
258 return _group(name, Any(string.punctuation), attrs)
259
260
def ToEol(name = None, attrs = None):
    r"""match the rest of the line, end of line included

    With a 'name', the text before the end of line goes inside a group
    of that name (with the optional group attributes) and the end of
    line is matched after the group, outside of it.

    Without a 'name' a single ungrouped pattern covers the text and the
    end of line; passing 'attrs' in that case trips an assertion, since
    attributes need a group to attach to.
    """
    if name is not None:
        # The line ending is appended after the group so it stays out of it.
        return Group(name, Re(r"[^\R]*"), attrs) + AnyEol()

    assert not attrs, "Attributes (%s) require a group name" % (attrs,)
    return Re(r"[^\R]*\R")
273
def UntilEol(name = None, attrs = None):
    r"""match the rest of the line, stopping before the end of line

    With a 'name', the matched text goes inside a group of that name
    (with the optional group attributes).

    Without a 'name' the same pattern is returned ungrouped; passing
    'attrs' in that case trips an assertion, since attributes need a
    group to attach to.
    """
    if name is not None:
        return Group(name, Re(r"[^\R]*"), attrs)

    assert not attrs, "Attributes (%s) require a group name" % (attrs,)
    return Re(r"[^\R]*")
286
288 """read and ignore lines up to, but excluding, the line matching expr"""
289 return Rep(AssertNot(expr) + ToEol())
290
292 """read and ignore lines up to and including, the line matching expr"""
293 return Rep(AssertNot(expr) + ToEol()) + expr + ToEol()
294
295
def ToSep(name = None, sep = None, attrs = None):
    """match all characters up to the given separator(s)

    This is useful for parsing space, tab, colon, or other character
    delimited fields.  There is no default separator character.

    If 'name' is not None, the matching text, except for the separator,
    will be put inside a group of the given name.  You can optionally
    include group attributes.  The separator is also consumed.

    Neither "\\r" nor "\\n" may be used as a separator.

    Raises TypeError if 'sep' is None, and AssertionError if 'sep'
    contains "\\r" or "\\n".
    """
    if sep is None:
        # 'sep' only has a default so that 'name' can stay optional;
        # a separator must always be supplied.
        raise TypeError("Must specify a seperator (the 'sep' parameter)")

    # The message must be built from 'sep'; it previously referenced an
    # undefined name, which turned this check into a NameError.
    assert "\r" not in sep and "\n" not in sep, \
           "cannot use %s as a seperator" % (repr(sep),)

    # The field is made of characters that are neither in 'sep' nor an
    # end-of-line character.
    exp = Rep(AnyBut(sep + "\r\n"))
    # NOTE(review): the field stops at any single character of 'sep', but
    # Str(sep) is given the whole string; for a multi-character 'sep'
    # these may not agree -- confirm the intended behaviour.
    return _group(name, exp, attrs) + Str(sep)
318
def UntilSep(name = None, sep = None, attrs = None):
    """match all characters up to, but excluding, the given separator(s)

    Handy for space, tab, colon, or other character delimited fields.
    A separator must always be given; there is no default.

    'name' and 'attrs' are handed to _group() together with the field
    expression; when 'name' is not None the field text is placed in a
    group of that name, optionally carrying the given attributes.  The
    separator itself is left unconsumed.

    Neither "\\r" nor "\\n" may be used as a separator.
    """
    if sep is None:
        # 'sep' only has a default so that 'name' can stay optional.
        raise TypeError("Must specify a seperator (the 'sep' parameter)")

    assert not ("\r" in sep or "\n" in sep), \
           "cannot use %s as a seperator" % (repr(sep),)

    # The field is made of characters that are neither in 'sep' nor an
    # end-of-line character.
    stop_chars = sep + "\r\n"
    field = Rep(AnyBut(stop_chars))
    return _group(name, field, attrs)
341
342
344 """match 0 or more fields seperated by the given seperator(s)
345
346 This is useful for parsing space, tab, color, or other character
347 delimited fields. There is no default seperator.
348
349 If 'name' is not None, the delimited text, excluding the seperator,
350 will be put inside groups of the given name. You can optionally
351 include group attributes. The seperator character is consumed,
352 but not accessible using a group.
353
354 Neither "\\r" nor "\\n" may be used as a seperator.
355 The line as a whole is not included in a group.
356 """
357 if sep is None:
358
359 raise TypeError("Must specify a sep (via the 'sep' parameter)")
360
361 assert "\r" not in sep and "\n" not in sep, \
362 "cannot use %s as a seperator" % (repr(sep),)
363
364 term = _group(name, Rep(AnyBut(sep + "\r\n")), attrs)
365 rep = Rep(Any(sep) + term)
366 return term + rep + AnyEol()
367
368
369 FastFeature = Expression.FastFeature
370
371
372
373 ParseRecords = Expression.ParseRecords
374 HeaderFooter = Expression.HeaderFooter
375
376
377
378
390
392 expr = expr.copy()
393 for tagname, replacement_expr in replacements:
394 matches = expr._find_groups(tagname)
395 for match in matches:
396 match.expression = replacement_expr
397 return expr
398
400 return ParseRecords("dataset", {"format": "*filter*"},
401 Group("record", expr + Rep(ToEol())),
402 make_reader, reader_args)
403