Package Martel :: Module convert_re
[hide private]
[frames | no frames]

Source Code for Module Martel.convert_re

  1  # Copyright 2000-2001, Dalke Scientific Software, LLC 
  2  # Distributed under the Biopython License Agreement (see the LICENSE file). 
  3   
  4  """Converts a regular expression pattern string into an Expression tree. 
  5   
  6  This is not meant to be an externally usable module. 
  7   
  8  This works by using msre_parse.py to parse the pattern.  The result is 
  9  a tree data structure, where the nodes in the tree are tuples.  The 
 10  first element of the tuple is the name of the node type.  The format 
 11  of the other elements depends on the type. 
 12   
 13  The conversion routine is pretty simple - convert each msre_parse tuple 
 14  node into a Martel Expression node.  It's a recursive implementation.
 15   
 16  'msre_parse.py' is a modified version of Secret Labs' 'sre_parse.py' 
 17   
 18  """ 
 19   
 20  import string 
 21  import msre_parse, Expression 
 22   
 23  # The msre_parse parser uses a "master pattern object" which keeps 
 24  # track of the mapping from group id to group name.  This is okay for 
 25  # msre_parse because they only track the last group with a given name. 
 26  # I need to get all groups with the same name, so I need a new object 
 27  # which stores them in a list that I can use later. 
 28   
class GroupNames:
    """Pattern-state object passed to msre_parse.parse by this module.

    Unlike a mapping that keeps only the last group id for a name, this
    records every id opened under a name, in the order they were opened.

    Attributes set by __init__:
      flags     -- starts at 0
      open      -- ids of groups that were opened and not yet closed
      groups    -- the id the next opened group will receive (starts at 1)
      groupdict -- group name -> list of every id opened with that name
    """

    def __init__(self):
        self.groupdict = {}
        self.groups = 1
        self.open = []
        self.flags = 0

    def opengroup(self, name=None):
        """Allocate the next group id, mark it open, and return it.

        When a non-empty name is given, the new id is also added to the
        list of ids recorded for that name.
        """
        gid = self.groups
        self.groups = gid + 1
        if name:
            # A new list is built on every call; a list fetched from
            # groupdict earlier is left unchanged.
            earlier_ids = self.groupdict.get(name, [])
            self.groupdict[name] = earlier_ids + [gid]
        self.open.append(gid)
        return gid

    def closegroup(self, gid):
        """Mark the group as no longer open."""
        self.open.remove(gid)

    def checkgroup(self, gid):
        """True when gid has been allocated and is not currently open."""
        if not gid < self.groups:
            return False
        return gid not in self.open

    def reverse_name(self, id):
        """group number -> group name, or None if there is no name"""
        for group_name in self.groupdict:
            if id in self.groupdict[group_name]:
                return group_name
        # Ignore non-named groups
        return None


# Convert a 'literal' tuple into a Literal object
def convert_literal(group_names, name, val):
    """('literal', val) -> Expression.Literal for the character chr(val).

    group_names and name are accepted only so every converter can be
    called the same way; they are not used.  The second Literal argument
    is 0 here, whereas convert_not_literal passes 1.
    """
    char = chr(val)
    return Expression.Literal(char, 0)
# Convert a 'not_literal' tuple into a Literal object
def convert_not_literal(group_names, name, val):
    """('not_literal', val) -> Expression.Literal for chr(val), flag 1.

    Same as convert_literal except that the second Literal argument is 1.
    group_names and name are not used.
    """
    char = chr(val)
    return Expression.Literal(char, 1)
# Convert an 'at_beginning' tuple into an AtBeginning object
# Convert an 'at_end' tuple into an AtEnd object
def convert_at(group_names, name, where):
    """('at', where) -> Expression.AtBeginning() or Expression.AtEnd().

    where must be "at_beginning" or "at_end"; anything else raises
    AssertionError.  group_names and name are not used.
    """
    # Reject unknown positions up front, then pick between the two.
    if where not in ("at_beginning", "at_end"):
        raise AssertionError("Unknown at name: %s" % repr(where))
    if where == "at_beginning":
        return Expression.AtBeginning()
    return Expression.AtEnd()
# Convert an 'any' tuple into a Dot object
def convert_any(group_names, name, ignore):
    """('any', None) -> Expression.Dot().

    The tuple's second field is expected to be None; any other value
    fails the assertion.  group_names and name are not used.
    """
    # Wrap the value in a 1-tuple: a bare tuple on the right of % would be
    # treated as the argument list and raise TypeError, hiding the
    # intended AssertionError.
    assert ignore is None, \
           "what does it mean when the field is '%s'?" % (ignore,)
    return Expression.Dot()
# Convert an 'assert' tuple into a Assert object, as a positive assertion
def convert_assert(group_names, name, args):
    """('assert', (direction, terms)) -> Expression.Assert(..., 0).

    args is the (direction, terms) pair from the parse tuple.  It is
    unpacked here rather than in the signature, because tuple parameters
    are not valid syntax in Python 3.  Only direction == 1 is accepted;
    the assertion message says lookbehind is not supported.  terms is
    converted with convert_list.  name is not used.
    """
    direction, terms = args
    assert direction == 1, "does not support lookbehind"
    return Expression.Assert(convert_list(group_names, terms), 0)
# Convert an 'assert_not' tuple into a Assert object, as a negative assertion
def convert_assert_not(group_names, name, args):
    """('assert_not', (direction, terms)) -> Expression.Assert(..., 1).

    args is the (direction, terms) pair from the parse tuple.  It is
    unpacked here rather than in the signature, because tuple parameters
    are not valid syntax in Python 3.  Only direction == 1 is accepted;
    the assertion message says lookbehind is not supported.  terms is
    converted with convert_list.  name is not used.
    """
    direction, terms = args
    assert direction == 1, "does not support lookbehind"
    return Expression.Assert(convert_list(group_names, terms), 1)
# Convert a 'branch' tuple into an Alt object
def convert_branch(group_names, name, args):
    """('branch', (None, branches)) -> Expression.Alt of the branches.

    args is the (ignore, branches) pair from the parse tuple; it is
    unpacked here because tuple parameters are not valid in Python 3.
    The first field must be None.  Each branch is a list of terms and is
    converted with convert_list.  A single branch is returned as is,
    without an Alt wrapper.  name is not used.
    """
    ignore, branches = args
    assert ignore is None, "what is %s?" % repr(ignore)
    results = [convert_list(group_names, branch) for branch in branches]
    if len(results) == 1:
        return results[0]
    return Expression.Alt(tuple(results))
# I know, it's only good for ASCII...
def invert(s):
    """s -> a string containing all the characters not present in s

    Only the 256 characters chr(0) .. chr(255) are considered, and the
    result lists them in increasing code order.  If s is not a string it
    is converted with str() first.
    """
    if not isinstance(s, str):
        s = str(s)
    # str.join replaces string.join, which no longer exists in Python 3.
    return "".join([c for c in map(chr, range(256)) if c not in s])
# Map from the msre_parse category names into actual characters.
# I can do this here since I don't worry about non-ASCII character sets.
# string.ascii_letters is used instead of string.letters: the latter
# varies with the locale, which contradicts the ASCII-only intent, and it
# does not exist in Python 3.
categories = {
    "category_word": string.ascii_letters + "0123456789_",
    "category_digit": string.digits,
    "category_space": "\t\n\v\f\r ",
    "category_newline": "\n\r",

    "category_not_word": invert(string.ascii_letters + "0123456789_"),
    "category_not_digit": invert(string.digits),
    "category_not_space": invert("\t\n\v\f\r "),
    }

# Convert an 'in' tuple into an Any object
# Pass in the negate flag if given.
def convert_in(group_names, name, terms):
    """('in', terms) -> Expression.Any over the characters terms lists.

    terms is a sequence of tuples.  A leading ('negate', ...) entry sets
    the negate flag and is skipped.  Each remaining entry is one of:
      ('literal', code)        -- the single character chr(code)
      ('range', (low, high))   -- every character from low to high inclusive
      ('category', key)        -- the characters stored under key in the
                                  module-level categories table
    Any other entry type raises AssertionError.  group_names and name
    are not used.
    """
    negate = (terms[0][0] == 'negate')
    pieces = []
    # negate doubles as the slice start: it skips the 'negate' entry
    # when one is present.
    for term in terms[negate:]:
        kind = term[0]
        if kind == 'literal':
            pieces.append(chr(term[1]))
        elif kind == 'range':
            low, high = term[1][0], term[1][1]
            pieces.extend(map(chr, range(low, high + 1)))
        elif kind == 'category':
            pieces.append(categories[term[1]])
        else:
            raise AssertionError("unknown option for 'in': %s" % kind)
    return Expression.Any("".join(pieces), negate)
# Convert a 'subpattern' tuple into a Group object
def convert_subpattern(group_names, name, args):
    """('subpattern', (id, terms)) -> Expression.Group.

    args is the (group id, terms) pair from the parse tuple; it is
    unpacked here because tuple parameters are not valid in Python 3.
    The group's name is looked up with group_names.reverse_name and is
    None for an unnamed group.  A name of the form "tag?key=value&..."
    is split into the tag and a dictionary of attributes.  terms is
    converted with convert_list.  name is not used.

    Raises AssertionError if an attribute appears more than once.
    """
    group_id, terms = args
    pattern_name = group_names.reverse_name(group_id)

    attrs = {}
    if pattern_name is not None:
        pattern_name, attrs = _split_group_name(pattern_name)

    return Expression.Group(pattern_name, convert_list(group_names, terms),
                            attrs)


def _split_group_name(full_name):
    """'tag?k=v&...' -> (tag, {k: v, ...}); without a '?' -> (full_name, {})"""
    # The name in the ?P<group> may contain attr information
    # serialized in a URL-encoded form; if present, deconvolute.
    pos = full_name.find("?")
    if pos == -1:
        return full_name, {}

    tag = full_name[:pos]
    query = full_name[pos+1:]
    if not query:
        # parse_qs doesn't like parsing the empty string
        return tag, {}

    # cgi.parse_qs is unavailable on current Python; prefer the
    # urllib.parse version and fall back to cgi on Python 2.
    try:
        from urllib.parse import parse_qs
    except ImportError:
        from cgi import parse_qs
    parsed = parse_qs(query, keep_blank_values = 1, strict_parsing = 1)

    # parse_qs maps each key to a list of values; exactly one is allowed.
    attrs = {}
    for k, v in parsed.items():
        if len(v) != 1:
            raise AssertionError(
        "The attribute name %s was found more than once (%d times) in the tag %s" %
                (repr(k), len(v), repr(tag)))
        attrs[k] = v[0]
    return tag, attrs
# Convert a 'newline' tuple into an AnyEol object
def convert_newline(group_names, name, ignore):
    """('newline', None) -> Expression.AnyEol().

    The tuple's second field is expected to be None; any other value
    fails the assertion.  group_names and name are not used.
    """
    # repr() replaces the backtick form, which is not valid in Python 3
    # and was the only use of that syntax in this module.
    assert ignore is None, \
           "what does it mean when field is %s?" % repr(ignore)
    return Expression.AnyEol()
# Convert a 'max_repeat' tuple into a MaxRepeat object
def convert_max_repeat(group_names, name, args):
    """('max_repeat', (min_count, max_count, terms)) -> Expression.MaxRepeat.

    args is the (min_count, max_count, terms) triple from the parse
    tuple; it is unpacked here because tuple parameters are not valid in
    Python 3.  terms is converted with convert_list and the two counts
    are passed through unchanged.  name is not used.
    """
    min_count, max_count, terms = args
    return Expression.MaxRepeat(convert_list(group_names, terms),
                                min_count, max_count)
# Convert a 'groupref' tuple into a GroupRef object
def convert_groupref(group_names, name, id):
    """('groupref', id) -> Expression.GroupRef for a named group.

    A plain integer id (a numbered back reference) fails the assertion.
    Otherwise the first element of id is mapped back to a group name
    with group_names.reverse_name.  name is not used.
    """
    assert type(id) is not int, \
           "Martel cannot use numbered group reference: %d" % id
    # For a named reference msre_parse hands back the list kept by
    # GroupNames (presumably the ids recorded under that name -- confirm
    # against msre_parse); its first id is used for the lookup.
    first_id = id[0]
    return Expression.GroupRef(group_names.reverse_name(first_id))
# Dispatch table used by convert_list: the first element of a parse
# tuple (its type name) selects the function that converts that tuple
# into an Expression.

converter_table = {
    # single characters and character sets
    "literal": convert_literal,
    "not_literal": convert_not_literal,
    "any": convert_any,
    "in": convert_in,
    "newline": convert_newline,

    # positions and lookahead assertions
    "at": convert_at,
    "assert": convert_assert,
    "assert_not": convert_assert_not,

    # compound terms
    "branch": convert_branch,
    "max_repeat": convert_max_repeat,
    "subpattern": convert_subpattern,
    "groupref": convert_groupref,
    }

# Convert a list of msre_parse tuples into a Seq
def convert_list(group_names, terms):
    """terms -> the Expression for a sequence of parse tuples.

    Each term is a tuple whose first element names its type.  The
    matching function from converter_table is called with group_names
    followed by the term's own elements.  A single converted term is
    returned as is; otherwise the results are wrapped in Expression.Seq.

    Raises AssertionError for a type name missing from converter_table.
    """
    # This is always a sequence of terms
    results = []
    for term in terms:
        name = term[0]
        try:
            func = converter_table[name]
        except KeyError:
            # Call form of raise: the "raise X, msg" statement is not
            # valid in Python 3.
            raise AssertionError("Do not understand sre expression %s" %
                                 repr(name))

        # Same call as func(*((group_names,) + term)), written so the
        # argument order is obvious.
        results.append(func(group_names, *term))
    if len(results) == 1:
        return results[0]
    return Expression.Seq(tuple(results))
# Primary entry point
def make_expression(pattern):
    """pattern -> the Expression tree for the given pattern string"""

    # "str" and "pattern" in the call below, and the ".pattern" attribute
    # read afterwards, are msre_parse's own names.  They have nothing to
    # do with the input pattern string.

    # Parse the string into the msre_parse tuple tree, recording group
    # names in a fresh GroupNames ...
    group_names = GroupNames()
    parsed = msre_parse.parse(str = pattern, pattern = group_names)

    # ... and convert that tree into an Expression
    return convert_list(parsed.pattern, parsed)
242