1
2
3
4 """Classes for nodes in the Expression tree.
5
6 Expression
7 |--- Any - match (or don't match) a set of characters
8 |--- AnyEol - match any newline representation ("\n", "\r" or "\r\n")
9 |--- Assert - used for positive and negative lookahead assertions
10 |--- AtBeginning - match the beginning of a line
11 |--- AtEnd - match the end of a line
12 |--- Debug - print a debug message
13 |--- Dot - match any character except newline
14 |--- Group - give a group name to an expression
15 |--- GroupRef - match a previously identified expression
16 |--- Literal - match (or don't match) a single character
17 |--- MaxRepeat - greedy repeat of an expression, within min/max bounds
18 |--- NullOp - does nothing (useful as an initial seed)
19 |--- PassThrough - used when overriding 'make_parser'; match its subexp
20 | |--- FastFeature - keeps information about possibly optional tags
21 | |--- HeaderFooter - files with a header, records and a footer
22 | `--- ParseRecords - parse a record at a time
23 |--- Str - match a given string
24 `--- ExpressionList - expressions containing several subexpressions
25 |--- Alt - subexp1 or subexp2 or subexp3 or ...
26 `--- Seq - subexp1 followed by subexp2 followed by subexp3 ...
27 """
28 import re, string
29 from xml.sax import xmlreader
30 import msre_parse
31 import Parser
32
33 MAXREPEAT = msre_parse.MAXREPEAT
34
35 try:
36 import IterParser
37 except SyntaxError:
38 IterParser = None
39
41 """Base class for nodes in the Expression tree"""
43 """returns an Expression to match this Expression then the other one"""
44 assert isinstance(other, Expression), "RHS of '+' not an Expression"
45 return Seq( (self, other) )
46
48 """returns an Expression matching this Expression or (if that fails) the other one"""
49 assert isinstance(other, Expression), "RHS of '|' not an Expression"
50 return Alt( (self, other) )
51
53 """the list of group names used by this Expression and its children"""
54 return ()
55
57 """return a list of all groups matching the given tag"""
58 return []
59
61 """return a list of all features"""
62 return []
63
65 """internal function used by 'select_names'.
66
67 Don't call this function. Will likely be removed in future versions.
68 """
69
70 pass
71
73 """do a deep copy on this Expression tree"""
74 raise NotImplementedError
75
77 """the corresponding pattern string"""
78 raise NotImplementedError
79
81 """create a SAX compliant parser for this regexp"""
82 import Generate
83 tagtable, want_flg, attrlookup = Generate.generate(self, debug_level)
84 return Parser.Parser(tagtable, (want_flg, debug_level, attrlookup))
85
90
92 """internal function for manipulating the leaves of an expression
93
94 This really needs to be some sort of visit pattern, but I'm
95 not sure the best way to do it. THIS METHOD MAY CHANGE.
96 """
97 return func(self)
98
99
100 -class Any(Expression):
102 """(chars, invert = 0)
103
104 Match any of the characters appearing in the 'chars' string.
105 If 'invert' is true, match a character not in the string.
106 """
107 self.chars = chars
108 self.invert = invert
109
111 """do a deep copy on this Expression tree"""
112 return Any(self.chars, self.invert)
113
120
121
122
124 - def __init__(self, expression, invert = 0):
125 """(expression, invert = 0)
126
127 A non-consuming assertion using the given expression.
128 The default is a positive lookahead, which matches if the expression
129 matches at the current position, but does not affect the character
130 position.
131
132 If 'invert' is false, this is a negative lookahead assertion,
133 and matches if the expression does not match. Again, the character
134 position is not affected.
135 """
136 self.expression = expression
137 self.invert = invert
138
140 """do a deep copy on this Expression tree"""
141 return Assert(self.expression.copy(), self.invert)
142
144 """the corresponding pattern string"""
145 if self.invert:
146 return '(?!%s)' % str(self.expression)
147 else:
148 return '(?=%s)' % str(self.expression)
149
151 exp = self.expression._modify_leaves(func)
152 assert exp is not None
153 self.expression = exp
154 return self
155
156
157
158
159
160
162 """Match the beginning of a line"""
164 """do a deep copy on this Expression tree"""
165 return AtBeginning()
167 """the corresponding pattern string"""
168 return '^'
169
170
171
172
173
175 """Match the end of a line"""
177 """do a deep copy on this Expression tree"""
178 return AtEnd()
180 """the corresponding pattern string"""
181 return '$'
182
183
184
192 """do a deep copy on this Expression"""
193 return Debug(self.msg)
194
195
196 -class Dot(Expression):
197 """Match any character except newline"""
199 """do a deep copy on this Expression tree"""
200 return Dot()
202 """the corresponding pattern string"""
203 return '.'
204
205
207 """Match a newline ("\n", "\r" or "\r\n")"""
209 """do a deep copy on this Expression tree"""
210 return AnyEol()
212 """the corresponding pattern string"""
213 return r"(\n|\r\n?)"
214
215
216
217
218
219
220
222 assert s, "Group name can not be the empty string"
223 if not msre_parse.isname(s):
224 raise AssertionError, "Illegal character in group name %s" % repr(s)
225
226 _fast_quote_lookup = None
240
250
252 if name is None:
253 return '(%s)' % str(expression)
254
255 elif attrs:
256
257 terms = []
258 for k, v in attrs.items():
259 terms.append("%s=%s" % (_quote(k), _quote(v)))
260 attrname = name + "?" + string.join(terms, "&")
261 return '(?P<%s>%s)' % (attrname, str(expression))
262 else:
263 return '(?P<%s>%s)' % (name, str(expression))
264
266 - def __init__(self, name, expression, attrs = None):
267 """(name, expression)
268
269 Create a group named 'name' which matches the given expression
270 """
271 if name is not None:
272 _verify_name(name)
273 self.name = name
274 self.expression = expression
275 if attrs is None:
276 attrs = xmlreader.AttributesImpl({})
277 elif isinstance(attrs, type({})):
278 attrs = xmlreader.AttributesImpl(attrs)
279 self.attrs = attrs
280
282 """the list of group names used by this Expression and its children"""
283 subnames = self.expression.group_names()
284 if self.name is not None:
285 if self.name in subnames:
286 return subnames
287 else:
288 return (self.name, ) + subnames
289 return subnames
290
292 """return a list of all groups matching the given tag"""
293 x = []
294 if self.name == tag:
295 x.append(self)
296 return x + self.expression._find_groups(tag)
297
299 """return a list of all features"""
300 return self.expression.features()
301
303 """internal function: do not use"""
304 if self.name is not None and self.name not in names:
305 self.name = None
306 self.attrs = xmlreader.AttributesImpl({})
307 self.expression._select_names(names)
308
310 exp = self.expression._modify_leaves(func)
311 assert exp is not None
312 self.expression = exp
313 return self
314
316 """do a deep copy on this Expression tree"""
317 return Group(self.name, self.expression.copy(), self.attrs.copy())
318
322
323
324
327 """(name)
328
329 Match the same text previously found by the given named group
330 """
331 _verify_name(name)
332 self.name = name
334 """do a deep copy on this Expression tree"""
335 return GroupRef(self.name)
337 """the corresponding pattern string"""
338 return "(?P=%s)" % self.name
339
340
341
342
343
344
347 """(char, invert = 0)
348
349 Match the given character or, if 'invert' is true, match a character
350 which is not this character.
351 """
352 self.char = char
353 self.invert = invert
354
356 """do a deep copy on this Expression tree"""
357 return Literal(self.char, self.invert)
358
360 """the corresponding pattern string"""
361 c = escape(self.char)
362 if self.invert:
363 return '[^%s]' % c
364 return c
365
366
367
368
369
373 """(expression, min_count = 0, max_count = MAXREPEAT)
374
375 Match the expression at least 'min_count' times and no more
376 than 'max_count' times. If max_count == MAXREPEAT then
377 there is no fixed upper limit.
378
379 min_count and max_count can be strings, in which case they are
380 used as "named group repeats." That is, they are taken to be
381 group names and used to find the repeat counts during
382 evaluation time. The current implementation only understands
383 named group repeats when min_count == max_count.
384
385 The grouping is greedy.
386
387 WARNING: There is no check to ensure that a match of 0 size is
388 repeated indefinitely, as with "(a?)*" against the string "b".
389 This will loop forever.
390
391 WARNING: The current implementation does not support
392 backtracking in MaxRepeats, so ".*\n" will not match "\n".
393 Use a more explicit construct instead, like "[^\n]*\n".
394
395 """
396
397 self.expression = expression
398
399
400 if type(min_count) == type(0):
401 assert 0 <= min_count, \
402 "min_count must be non-negative, not %d" % min_count
403 if type(max_count) == type(0):
404 assert min_count <= max_count, \
405 "min_count (%d) must be <= max_count (%d)" % (min_count, max_count)
406 else:
407 if type(max_count) == type(0):
408 assert max_count >= 0, \
409 "max_count must be non-negative, not %d" % max_count
410
411 self.min_count = min_count
412 self.max_count = max_count
413
415 """the list of group names used by this Expression and its children"""
416
417
418 return self.expression.group_names()
421
423 """return a list of all features"""
424 return self.expression.features()
425
427 """do a deep copy on this Expression tree"""
428 return MaxRepeat(self.expression.copy(), self.min_count,
429 self.max_count)
430
434
436 exp = self.expression._modify_leaves(func)
437 assert exp is not None
438 self.expression = exp
439 return self
440
442 """the corresponding pattern string"""
443 min_count = self.min_count
444 max_count = self.max_count
445 subexp = self.expression
446
447
448
449
450 if isinstance(subexp, ExpressionList):
451 need_group = 1
452 elif isinstance(subexp, Str):
453
454
455 need_group = 1
456 else:
457 need_group = 0
458
459 if need_group:
460 s = "(%s)" % str(subexp)
461 else:
462 s = str(subexp)
463
464
465 if type(min_count) == type("") or type(max_count) == type(""):
466
467 if min_count == max_count:
468 ext = "{%s}" % min_count
469 else:
470 ext = "{%s,%s}" % (min_count, max_count)
471 else:
472
473 if min_count == 0 and max_count == MAXREPEAT:
474 ext = "*"
475 elif min_count == 1 and max_count == MAXREPEAT:
476 ext = "+"
477 elif min_count == 0 and max_count == 1:
478 ext = "?"
479 elif min_count == max_count == 1:
480 ext = ""
481 elif min_count == max_count:
482 ext = "{%d}" % max_count
483 elif min_count == 0:
484 ext = "{,%d}" % max_count
485 elif max_count == MAXREPEAT:
486 ext = "{%d,}" % min_count
487 else:
488 ext = "{%d,%d}" % (min_count, max_count)
489
490 return s + ext
491
492
495 """()
496
497 Doesn't match anything. This is a null operation. It's
498 useful if you want a valid initial object from which to build,
499 as in:
500
501 exp = NullOp()
502 for c in string.split(line):
503 exp = exp + Str(c)
504
505 (That's contrived -- see Time.py for a real use.)
506 """
516 raise TypeError("Cannot 'or' a NullOp with anything (only 'and')")
517
518
519
522 """(expression)
523
524 Match the given subexpression. This class should not be used
525 directly. It is meant for generating specialized parsers which
526 read a record at a time.
527 """
528 self.expression = expression
532 exp = self.expression._modify_leaves(func)
533 assert exp is not None
534 self.expression = exp
535 return self
537 """do a deep copy on this Expression tree"""
538 return PassThrough(self.expression.copy())
540 """the corresponding pattern string"""
541 return str(self.expression)
547 """return a list of all features"""
548 return self.expression.features()
549
551 - def __init__(self, expression, feature, remove_tags):
556 """do a deep copy on this Expression tree"""
557 return FastFeature(self.expression.copy(), self.feature,
558 self.remove_tags[:])
560 return [(self.feature, self.remove_tags)]
561
748
749
751 - def __init__(self, format_name, attrs, record_expression,
752 make_reader, reader_args = ()):
753 PassThrough.__init__(self, Group(format_name,
754 MaxRepeat(record_expression, 1),
755 attrs))
756
757
758
759
760
761 if isinstance(attrs, Expression):
762 raise TypeError("Looks like you need an attrs between the format_name and the record_expression")
763
764 self.format_name = format_name
765 if attrs is None:
766 attrs = xmlreader.AttributesImpl({})
767 elif isinstance(attrs, type({})):
768 attrs = xmlreader.AttributesImpl(attrs)
769 self.attrs = attrs
770 self.record_expression = record_expression
771 self.make_reader = make_reader
772 self.reader_args = reader_args
774 """do a deep copy on this Expression tree"""
775 return ParseRecords(self.format_name, self.attrs,
776 self.record_expression.copy(),
777 self.make_reader, self.reader_args)
778
780 import Generate
781 tagtable, want_flg, attrlookup = Generate.generate(
782 self.record_expression, debug_level)
783
784 return Parser.RecordParser(self.format_name, self.attrs,
785 tagtable, (want_flg, debug_level, attrlookup),
786 self.make_reader, self.reader_args)
787
805
806
808 return self.format_name + self.expression.group_names()
810 assert tag != self.format_name, "can't handle that case"
811 return self.expression._find_groups(tag)
812
814 """return a list of all features"""
815 return self.expression.features()
816
818 exp = self.expression.modify_leaves(func)
819 assert exp is not None
820 return self
821
822
823
824 -class Str(Expression):
826 """(s)
827
828 Match the given string exactly (not as a regexp pattern)
829 """
830 self.string = s
831
833 """do a deep copy on this Expression tree"""
834 return Str(self.string)
835
837 """the corresponding pattern string"""
838 return escape(self.string)
839
840
841
843 """shares implementation used by 'Expressions with subexpressions'"""
862
868 """do a deep copy on this Expression tree"""
869 return self.__class__(map(lambda x: x.copy(), self.expressions))
877
878
879 -class Alt(ExpressionList):
880 """An Expression tree with a list of alternate matches.
881
882 """
884 """(expressions)
885
886 Match one of a list of alternate expressions. The expressions are
887 tested in their input order.
888
889 For example, Alt( (exp1, exp2, exp3) ) means try to match exp1,
890 and if that fails try to match exp2, and if that fails, try to
891 match exp3. If *that* fails, the match failed.
892
893 """
894 if isinstance(expressions, type( [] )):
895 expressions = tuple(expressions)
896 elif isinstance(expressions, Expression):
897 raise TypeError("Must pass in a list of expressions, not just a single one (put it inside of ()s")
898 self.expressions = expressions
899
911
913 """the corresponding pattern string"""
914 return string.join(map(str, self.expressions), '|')
915
916
917
918 -class Seq(ExpressionList):
919 """An Expression matching a set of subexpressions, in sequential order"""
921 """(expressions)
922
923 Match the list of sequential expressions, in order. Each
924 expression starts matching at the point where the previous
925 match finished.
926 """
927 if isinstance(expressions, type( [] )):
928
929 expressions = tuple(expressions)
930 elif isinstance(expressions, Expression):
931 raise TypeError("Must pass in a list of expressions, not just a single one (put it inside of ()s")
932 self.expressions = expressions
933
939
951
952
953
954
955
957 "Escape all non-alphanumeric characters in pattern."
958 result = list(pattern)
959 alphanum=string.letters+'_'+string.digits+" ="
960 for i in range(len(pattern)):
961 char = pattern[i]
962 if char not in alphanum:
963 if char=='\000': result[i] = '\\000'
964 else: result[i] = '\\'+char
965 return string.join(result, '')
966
967
968 _minimize_escape_chars = {
969 "\a": r"\a",
970 "\b": r"\b",
971 "\n": r"\n",
972 "\r": r"\r",
973 "\t": r"\t",
974 "\f": r"\f",
975 "\v": r"\v",
976 "[": "\\[",
977 "]": "\\]",
978 "\\": "\\\\",
979 "^": "\^",
980 }
981
983 """(c) -> into an appropriately escaped pattern for the character"""
984 x = _minimize_escape_chars.get(c)
985 if x is None:
986 if ord(c) < 32:
987 return '\\' + c
988 return c
989 return x
990
1007
1008
1010 """s -> a string useable inside [] which matches all the characters in s
1011
1012 For example, passing in "0123456789" returns "\d".
1013
1014 This code isn't perfect.
1015 """
1016 if not(isinstance(s, type(""))):
1017 s = str(s)
1018 if not s:
1019 return s
1020
1021
1022
1023 has_hyphen = 0
1024 if '-' in s:
1025 has_hyphen = 1
1026 s = string.replace(s, "-", "")
1027
1028
1029 chars = list(s)
1030 chars.sort()
1031 unique = []
1032 prev_c = None
1033 for c in chars:
1034 if c != prev_c:
1035 unique.append(c)
1036 prev_c = c
1037
1038 s = string.join(unique, "")
1039 t = ""
1040 prev = None
1041 prev_pos = 0
1042 pos = 0
1043
1044
1045
1046 for c in unique:
1047 val = ord(c)
1048
1049 if val - 1 != prev:
1050
1051 if prev is None:
1052
1053 prev_pos = 0
1054 else:
1055
1056
1057 if prev_pos == pos - 1:
1058
1059 t = t + _minimize_escape_char(s[prev_pos])
1060 else:
1061
1062 t = t + _minimize_escape_range(s[prev_pos], s[pos-1])
1063 prev_pos = pos
1064 else:
1065
1066 pass
1067
1068 prev = val
1069 pos = pos + 1
1070
1071
1072 if s:
1073 if prev_pos == pos - 1:
1074 t = t + _minimize_escape_char(s[prev_pos])
1075 else:
1076 t = t + _minimize_escape_range(s[prev_pos], s[pos-1])
1077 else:
1078
1079 pass
1080
1081
1082 if has_hyphen:
1083 t = t + '-'
1084
1085
1086 conversions = {
1087 "\\dA-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\xff": r"\w",
1088 }
1089 t = conversions.get(t, t)
1090
1091 return t
1092
1094 """modify an expression in place to remove case dependencies
1095
1096 may return a new top-level node
1097 """
1098 if isinstance(node, Str):
1099 x = NullOp()
1100 s = ""
1101 for c in node.string:
1102 up_c = string.upper(c)
1103 low_c = string.lower(c)
1104 assert c in (up_c, low_c), "how can this be?"
1105 if up_c == low_c:
1106 s = s + c
1107 else:
1108 if s:
1109 x = x + Str(s)
1110 s = ""
1111 x = x + Any(up_c + low_c)
1112 if s:
1113 x = x + Str(s)
1114 return x
1115
1116 if isinstance(node, Any):
1117 s = node.chars
1118 chars = {}
1119 for c in s:
1120 chars[c] = 1
1121 for c in string.upper(s) + string.lower(s):
1122 if not chars.has_key(c):
1123 chars[c] = 1
1124 s = s + c
1125 return Any(s, node.invert)
1126
1127 if isinstance(node, Literal):
1128 c = node.char
1129 up_c = string.upper(c)
1130 low_c = string.lower(c)
1131 if up_c == low_c:
1132 return node
1133 return Any(up_c + low_c, node.invert)
1134
1135 return node
1136
1141