Package Martel :: Module Generate
[hide private]
[frames] | no frames]

Source Code for Module Martel.Generate

  1  # Copyright 2000-2001, Dalke Scientific Software, LLC 
  2  # Distributed under the Biopython License Agreement (see the LICENSE file). 
  3   
  4  # Create the tables needed for mxTextTools 
  5   
  6  import string 
  7   
  8  try: 
  9      from mx import TextTools as TT 
 10  except ImportError: 
 11      import TextTools as TT 
 12   
 13  import msre_parse  # Modified version of Secret Labs' sre_parse 
 14  import Expression, convert_re 
 15   
 16  import Parser 
 17   
 18  # This was added with mxTextTools 1.2 
 19  supports_lookahead = hasattr(TT, "LookAhead") 
 20   
 21  # NOTE: 
 22  #  I use some notation in the comments below, which I'll define here 
 23   
 24  #   "+1/-1 trick" 
 25  #  mxTextTools doesn't like "Call"s which don't consume any 
 26  #  characters.  To get around that, I have successful matches return 
 27  #  the end position + 1, so that it seems to consume a character. 
 28  #  Then I Skip -1 characters to get back to the correct loction. 
 29  #    mxTextTools 1.2 and later has a 'LookAhead' command so this 
 30  #  trick is not needed for that version 
 31   
 32  #    ">ignore" 
 33  #  Sometimes the tagtables descends into another tagtable. 
 34  #  mxTextTools requires there be a tag name for all such descents, for 
 35  #  otherwise it cannot create the taglist for the descent.  If there 
 36  #  isn't a name, the fake name ">ignore" is used, which is an illegal 
 37  #  XML name.  The Parser knows to ignore this tag when generating 
 38  #  callbacks. 
 39   
 40   
 41  # With the current implementations, each of the generate_* functions 
 42  # return a Parser with a tagtable that starts parsing with the first 
 43  # element in the table and succeeds by jumping one past the last 
 44  # element. 
 45   
 46  # This is single threaded! 
 47  _generate_count = 0 
 48   
49 -class GeneratorState:
50 - def __init__(self, groupref_names, debug_level):
51 self.groupref_names = groupref_names 52 self.debug_level = debug_level 53 self.lookup = {}
54
55 - def add_group(self, group):
56 name = group.name 57 attrs = group.attrs 58 tag = self.new_group_tag() 59 self.lookup[tag] = (name, attrs) 60 return tag
61
62 - def new_group_tag(self):
63 global _generate_count 64 i = _generate_count 65 tag = ">G%d" % i 66 i = i + 1 67 _generate_count = i 68 return tag
69 70 71 # ============== 72 73 # a|b|c|d|... 74 # Implemented by creating N subtables. If table i fails, try i+i. If 75 # it succeeds, jump to 1 past the end. 76 # 1. try table1: on fail +1, on success +N+1 77 # 2. try table2: on fail +1, on success +N 78 # 3. try table3: on fail +1, on success +N-1 79 # ... 80 # N. try tableN: on fail +1, on success +2 81 # N+1. Fail 82 # N+2. <past the end of table> 83 # 84 # XXX Don't need to create a Table. Could use a single tagtable by 85 # merging into one table and retargeting "succeeded" transitions 86 # (which jumped to the end+1 of the given table) to transition to the 87 # end of the merged table. 88 #
89 -def generate_alt(expression, genstate):
90 # Get the tagtables for each of the subexpressions 91 tables = [] 92 for expr in expression.expressions: 93 tables.append(_generate(expr, genstate)) 94 95 # Make a set of branches testing each one 96 i = 0 97 n = len(tables) 98 result = [] 99 for table in tables: 100 result.append( \ 101 (">ignore", TT.Table, tuple(table), +1, n-i+1) 102 ) 103 i = i + 1 104 # Jumps here 105 result.append( (None, TT.Fail, TT.Here) ) 106 return result
107 108 # sequence of successive regexps: abcdef... 109 # 110 # Simply catenate the tagtables together, in order. Works because falling 111 # off the end of one table means success for that table, so try the next.
112 -def generate_seq(expression, genstate):
113 result = [] 114 if genstate.debug_level == 0: 115 for exp in expression.expressions: 116 table = _generate(exp, genstate) 117 result.extend(table) 118 elif genstate.debug_level == 1: 119 for exp in expression.expressions: 120 table = _generate(exp, genstate) 121 result.extend(table) 122 result.append( (None, TT.Call, track_position, +1, +1) ) 123 elif genstate.debug_level == 2: 124 for exp in expression.expressions: 125 table = _generate(exp, genstate) 126 result.extend(table) 127 result.append( (None, TT.Call, track_position, +1, +1) ) 128 table.append( (None, TT.Call, print_info(exp), +1, +1) ) 129 130 return result
131 132 # A literal character, or a character which isn't the given character
133 -def generate_literal(expression, genstate):
134 if expression.invert: 135 # Can't use "InNot" since it can match EOF 136 return [(None, TT.IsIn, convert_re.invert(expression.char))] 137 138 else: 139 return [(None, TT.Is, expression.char)]
140 141 # A string
142 -def generate_str(expression, genstate):
143 return [(None, TT.Word, expression.string)]
144 145 # Any character in the given list of characters
146 -def generate_any(expression, genstate):
147 if expression.invert: 148 # I don't use "IsNotIn" since that allows EOF 149 return [(None, TT.IsIn, convert_re.invert(expression.chars))] 150 151 else: 152 return [(None, TT.IsIn, expression.chars)]
153 154 # Support class for group matches. Stores the contents of a match so 155 # it can be used by a future back reference or named group repeat.
156 -class SetGroupValue:
157 - def __init__(self, name):
158 self.name = name
159 - def __call__(self, taglist, text, l, r, subtags):
160 taglist.append( (self.name, l, r, subtags) ) 161 Parser._match_group[self.name] = text[l:r]
162 163 # A 'group', either named or unnamed
164 -def generate_group(expression, genstate):
165 tagtable = _generate(expression.expression, genstate) 166 167 name = expression.name 168 if name is None: 169 assert not expression.attrs, "unnamed group can't have attrs!" 170 # Don't really need to do anything, do I? 171 return tagtable 172 173 if genstate.groupref_names.get(name) != 1: 174 if expression.attrs: 175 name = genstate.add_group(expression) 176 return [(name, TT.Table, tuple(tagtable)) ] 177 else: 178 # generate the code to save the match information 179 if expression.attrs: 180 name = genstate.add_group(expression) 181 return [(SetGroupValue(name), TT.Table+TT.CallTag, 182 tuple(tagtable)), ]
183 184 # Uglyness to handle named group repeats. 185 186 # There are two entry points for this class. One is 'call', which is 187 # called by the TextTools 'Call' tag command to test for a match. The 188 # other is 'calltag', which is called by the TextTools 'CallTag' tag 189 # command after a match is found. 190 191 # Uses the +1/-1 trick since the match can be of size 0. 192 193 # Store the taglist on success as an instance variable so the 194 # "CallTag" can append the matches. Again, this is SINGLE-THREADED 195 # CODE, but the best I can do with the current version of mxTextTools. 196
197 -class HandleRepeatCount:
198 - def __init__(self, tagtable, min_count, max_count):
199 self.tagtable = tagtable # The regexp to match 200 self.min_count = min_count # For now, min_count and max_count 201 self.max_count = max_count # must be the same string 202 203 self.taglist = None
204
205 - def _get_ranges(self):
206 # Look up the min/max counts and convert to integers 207 min_count = self.min_count 208 if type(min_count) == type(""): 209 min_count = Parser._match_group[min_count] # group must already exist! 210 min_count = string.atoi(min_count) # requires an integer! 211 212 max_count = self.max_count 213 if type(max_count) == type(""): 214 max_count = Parser._match_group[max_count] # group must already exist! 215 max_count = string.atoi(max_count) # requires an integer! 216 217 return min_count, max_count
218
219 - def call(self, text, x, end):
220 # Called by 'TextTools.Call' to detect a match. 221 # I do the full match here and store the results for later use. 222 # If successful, I return x+1, else return x+0 (the +1/-1 trick) 223 min_count, max_count = self._get_ranges() 224 assert min_count == max_count, \ 225 "cannot have different sizes: %s %s" % (min_count, max_count) 226 227 tagtable = self.tagtable * min_count 228 result, taglist, pos = TT.tag(text, tagtable, x, end) 229 if result == 1: 230 # Store the taglist for later use 231 self.taglist = taglist 232 return pos + 1 # +1 because {0} is allowed; Skip -1 later 233 else: 234 self.taglist = None 235 return x
236
237 - def calltag(self, taglist, text, l, r, subtags):
238 # Called by 'CallTag' to say I matched the above 'call' 239 assert not subtags, repr(subtags) 240 # Append the taglist which we saved from an earlier good match. 241 # The '-1' is because of the +1/-1 hack. 242 taglist.append( (">ignore", l, r-1, self.taglist) )
243 244 245 # These objects forward calls to HandleRepeatCount methods. 246 # Would like to start the methods directly but methods cannot be pickled. 247
248 -class _call_calltag:
249 - def __init__(self, obj):
250 self.obj = obj
251 - def __call__(self, taglist, text, l, r, subtags):
252 return self.obj.calltag(taglist, text, l, r, subtags)
253
254 -class _call_call:
255 - def __init__(self, obj):
256 self.obj = obj
257 - def __call__(self, text, x, end):
258 return self.obj.call(text, x, end)
259
260 -def generate_named_max_repeat(expression, genstate):
261 if type(expression.min_count) != type("") or \ 262 type(expression.max_count) != type(""): 263 raise NotImplementedError("Cannot mix numeric and named repeat counts") 264 if expression.min_count != expression.max_count: 265 raise NotImplementedError("Only a single named repeat count allowed") 266 267 tagtable = _generate(expression.expression, genstate) 268 counter = HandleRepeatCount(tuple(tagtable), 269 expression.min_count, 270 expression.max_count) 271 272 # You see the +1/-1 trick implemented here 273 # Don't use the 'supports_lookahead' here since that complicates matters 274 return \ 275 [(_call_calltag(counter), TT.Call + TT.CallTag, 276 _call_call(counter), TT.MatchFail), 277 (None, TT.Skip, -1, TT.MatchFail),]
278 279 # It isn't as bad as it looks :) 280 # Basically, call some other code for named group repeats. 281 # Everything else is of the form {i,j}. 282 # Get the subexpression table: 283 # generate i copies which must work 284 # generate j-i copies which may work, but fail okay. 285 # special case when j == 65535, which standard for "unbounded"
286 -def generate_max_repeat(expression, genstate):
287 expr = expression.expression 288 min_count = expression.min_count 289 max_count = expression.max_count 290 291 # Call some other code for named group repeats 292 if type(min_count) == type("") or type(max_count) == "": 293 return generate_named_max_repeat(expression, genstate) 294 295 assert 0 <= min_count <= max_count, "bad ranges (%sd, %d)" % \ 296 (min_count, max_count) 297 298 tagtable = _generate(expr, genstate) 299 result = [] 300 301 # Must repeat at least "i" times. 302 for i in range(min_count): 303 result.append( (None, TT.SubTable, tuple(tagtable)) ) 304 305 # Special case for when the max count means "unbounded" 306 if max_count == msre_parse.MAXREPEAT: 307 result.append( (None, TT.SubTable, tuple(tagtable), 308 +1, 0)) 309 elif min_count == max_count: 310 # Special case when i == j 311 pass 312 else: 313 # Generate j-i more tagtables, but allow failures 314 offset = max_count - min_count 315 for i in range(offset): 316 result.append( (">ignore", TT.Table, 317 tuple(tagtable), 318 +offset, +1) ) 319 offset = offset - 1 320 return result
321 322 # Doesn't do anything
323 -def generate_null_op(expression, genstate):
324 return []
325 333
334 -def generate_debug(expression, genstate):
335 return [(None, TT.Call, print_debug(expression.msg), +1, +1)]
336 337 # XXX Is this correct? This is the multiline behaviour which allows 338 # "^" to match the beginning of a line.
339 -def check_at_beginning(text, x, end):
340 if x == 0: 341 return x 342 if text[x-1] == "\n": 343 return x 344 return x + 1
345 346 # XXX Consider this code broken! 347 # 348 # Uses the +1/-1 trick.
349 -def generate_at_beginning(expression, genstate):
350 if supports_lookahead: 351 return [(None, TT.Call + TT.LookAhead, check_at_beginning, +1, 352 TT.MatchFail),] 353 else: 354 return [(None, TT.Call, check_at_beginning, +2, +1), 355 (None, TT.Skip, -1, TT.MatchFail, TT.MatchFail), 356 ]
357 358 359 # XXX Consider this code broken! 360 # 361 # XXX If 'check_at_beginning' is correct, then this is wrong since it 362 # doesn't implement the multiline behaviour.
363 -def generate_at_end(expression, genstate):
364 return [(None, TT.EOF, TT.Here)]
365 366 367 # Match any character except newline (by which I mean just "\012")
368 -def generate_dot(expression, genstate):
369 return [(None, TT.IsInSet, TT.invset('\n')), ]
370 371 # Match any of the three standard newline conventions
372 -def generate_eol(expression, genstate):
373 return [(None, TT.Is, '\n', +1, +3), 374 (None, TT.Is, '\r', TT.MatchFail, +1), 375 (None, TT.Is, '\n', +1, +1), 376 ]
377 378 # Used during a negative lookhead test. 379 # Note the +1/-1 trick.
380 -def check_assert_not(text, x, end, tagtable):
381 result, taglist, pos = TT.tag(text, tagtable, x, end) 382 if result: 383 # This failed 384 return x 385 # On success, move forward 1, to be removed later 386 return x + 1
387 388 # Called by the 'Call' tag command when doing negative lookaheads
389 -class CheckAssertNot:
390 - def __init__(self, tag_words):
391 self.tag_words = tag_words
392 - def __call__(self, text, x, end):
393 pos = check_assert_not(text, x, end, self.tag_words) 394 return pos
395 396 397 # Used during a positive lookhead test. 398 # Note the +1/-1 trick.
399 -def check_assert(text, x, end, tag_words):
400 result, taglist, pos = TT.tag(text, tag_words, x, end) 401 if result: 402 # This succeeded, move forward 1, to be removed later 403 return x+1 404 # failed 405 return x
406 407 # Called by the 'Call' tag command when doing negative lookaheads
408 -class CheckAssert:
409 - def __init__(self, tag_words):
410 self.tag_words = tag_words
411 - def __call__(self, text, x, end):
412 pos = check_assert(text, x, end, self.tag_words) 413 return pos
414 415 # Create the tagtable for doing a lookahead assertion. 416 # Uses the +1/-1 trick.
417 -def generate_assert(expression, genstate):
418 tagtable = _generate(expression.expression, genstate) 419 if expression.invert: 420 func = CheckAssertNot 421 else: 422 func = CheckAssert 423 if supports_lookahead: 424 return [ 425 (None, TT.Call + TT.LookAhead, func(tuple(tagtable)), 426 TT.MatchFail), 427 ] 428 else: 429 return [ 430 (None, TT.Call, func(tuple(tagtable)), 431 TT.MatchFail), 432 (None, TT.Skip, -1, TT.MatchFail), 433 ]
434 435 # Create an object which can be called by a 'Call' tag command to 436 # match the text found by the named group. 437 # Uses the +1/-1 trick.
438 -class CheckGroupRef:
439 - def __init__(self, name):
440 self.name = name
441 - def __call__(self, text, x, end):
442 match_text = Parser._match_group[self.name] # group name not yet found 443 if text[x:x+len(match_text)] != match_text: 444 return x 445 return x+len(match_text)+1
446 447 # Make the tagtable needed for a named group backreference. 448 # Uses the +1/-1 trick.
449 -def generate_groupref(expression, genstate):
450 # Look up the string from the match group 451 # It can be of length 0 or more, so use the +1/-1 trick. 452 return [ 453 (None, TT.Call, CheckGroupRef(expression.name), TT.MatchFail), 454 (None, TT.Skip, -1, TT.MatchFail), 455 ]
456 457 # Used to define parsers which read a record at time. They contain no 458 # parse information themselves, but only in their children
459 -def generate_pass_through(expression, genstate):
460 return _generate(expression.expression, genstate)
461 462 # Mapping from Expression type to function used to generate the tagtable 463 generate_table = { 464 Expression.Alt: generate_alt, 465 Expression.Any: generate_any, 466 Expression.Assert: generate_assert, 467 Expression.AtBeginning: generate_at_beginning, 468 Expression.AtEnd: generate_at_end, 469 Expression.Debug: generate_debug, 470 Expression.Dot: generate_dot, 471 Expression.AnyEol: generate_eol, 472 Expression.Group: generate_group, 473 Expression.GroupRef: generate_groupref, 474 Expression.Literal: generate_literal, 475 Expression.MaxRepeat: generate_max_repeat, 476 Expression.NullOp: generate_null_op, 477 Expression.Seq: generate_seq, 478 Expression.Str: generate_str, 479 } 480 481 _position = -1
482 -def track_position(text, x, end):
483 """store the start position of the farthest successful match 484 485 This value is more useful than mxTextTools' default, which only 486 points out the last text region successfully tagged at the top 487 level. This value is the last region successfully tagged 488 anywhere. 489 490 Uses a global variable so this is SINGLE THREADED! 491 492 """ 493 494 global _position 495 _position = max(x, _position) 496 return x
497 509 510 # The internal recursive call
511 -def _generate(expression, genstate):
512 try: 513 func = generate_table[expression.__class__] 514 except KeyError: 515 if isinstance(expression, Expression.PassThrough): 516 func = generate_pass_through 517 else: 518 raise AssertionError, \ 519 "Unknown Expression object: %s" % repr(expression) 520 table = func(expression, genstate) 521 522 if genstate.debug_level == 0 or not table: 523 pass 524 elif genstate.debug_level == 1: 525 table.append( (None, TT.Call, track_position, +1, +1) ) 526 elif genstate.debug_level == 2: 527 table.append( (None, TT.Call, track_position, +1, +1) ) 528 table.append( (None, TT.Call, print_info(expression), +1, +1) ) 529 else: 530 raise AssertionError, "Unknown debug level: %s" % genstate.debug_level 531 532 return table
533 534 # Get the tagtable and the want_groupref_names 535 # Main entry point for record oriented readers
536 -def generate(expression, debug_level = 0):
537 """expression -> Parser for the Expression tree""" 538 groupref_names = _find_wanted_groupref_names(expression) 539 genstate = GeneratorState(groupref_names, debug_level) 540 tagtable = _generate(expression, genstate) 541 if groupref_names: 542 want_groupref_names = 1 543 else: 544 want_groupref_names = 0 545 return tuple(tagtable), want_groupref_names, genstate.lookup
546 547 # Get the parser. Main entry point for everything except record 548 # oriented readers
549 -def generate_parser(expression, debug_level = 0):
550 tagtable, want_groupref_names, attrlookup = generate(expression) 551 return Parser.Parser(tagtable, (want_groupref_names, debug_level, 552 attrlookup))
553 554 555 # This code is ugly. There are couple possible fixes: 556 # 1) write a tree iterator which visits each node of the Expression tree. 557 # Using it would remove the extra code for doing recursion. 558 # 2) add a new method to each Expression type to get the names. 559 # Using it would remove the many 'if' statements. 560 # 561 # There will probably be code in the future to optimize an Expression 562 # tree. That code will have tools to manipulate a tree. I'll wait 563 # until then to work decide which option to take, or if there's a 564 # third. 565
566 -def _find_wanted_groupref_names(expression):
567 """expression -> dict of group names wanted by elements of the tree 568 569 The dict is used to during tagtable generation to specify which 570 groups need to save their match text. There's match-time overhead 571 for doing that, and the code isn't thread safe, so the intent is 572 to save only those groups that are needed. 573 574 The dict value is 1 if the group name is needed, else there is 575 no entry in the dict. 576 577 XXX need to make this a method! 578 """ 579 want_names = {} 580 if isinstance(expression, Expression.Alt) or \ 581 isinstance(expression, Expression.Seq): 582 for x in expression.expressions: 583 want_names.update(_find_wanted_groupref_names(x)) 584 585 elif isinstance(expression, Expression.Group) or \ 586 isinstance(expression, Expression.Assert) or \ 587 isinstance(expression, Expression.PassThrough): 588 want_names.update(_find_wanted_groupref_names(expression.expression)) 589 590 elif isinstance(expression, Expression.MaxRepeat): 591 if type(expression.min_count) == type(""): 592 want_names[expression.min_count] = 1 593 if type(expression.max_count) == type(""): 594 want_names[expression.max_count] = 1 595 want_names.update(_find_wanted_groupref_names(expression.expression)) 596 597 elif isinstance(expression, Expression.GroupRef): 598 want_names[expression.name] = 1 599 600 elif isinstance(expression, Expression.Literal) or \ 601 isinstance(expression, Expression.Str) or \ 602 isinstance(expression, Expression.Any) or \ 603 isinstance(expression, Expression.AtBeginning) or \ 604 isinstance(expression, Expression.AtEnd) or \ 605 isinstance(expression, Expression.Dot) or \ 606 isinstance(expression, Expression.AnyEol) or \ 607 isinstance(expression, Expression.Debug) or \ 608 isinstance(expression, Expression.NullOp): 609 pass 610 611 else: 612 raise NotImplementedError, "What is a %s?" % repr(expression) 613 614 return want_names
615