0001"""Pull-style interface for ElementTree."""
0002
0003__revision__ = "$Rev$"
0004__date__ = "$Date: 2005-02-16 15:43:38 -0500 (Wed, 16 Feb 2005) $"
0005__author__ = "Ryan Tomayko (rtomayko@gmail.com)"
0006__copyright__ = "Copyright 2004-2005, Ryan Tomayko"
0007__license__ = "MIT <http://www.opensource.org/licenses/mit-license.php>"
0008
0009from __future__ import generators
0010
0011from kid.et import *
0012from kid.util import open_resource, QuickTextReader
0013
0014from xml.parsers import expat
0015
0016
0017import htmlentitydefs
0018default_entity_map = {}
0019default_external_dtd = []
0020for k, v in htmlentitydefs.name2codepoint.items():
0021 default_entity_map[k] = unichr(v)
0022 default_external_dtd.append('<!ENTITY %s "&#%d;">' % (k, v))
0023default_external_dtd = '\n'.join(default_external_dtd)
0024
0025
class InvalidStreamState(Exception):
    """Error raised when an event stream is found in an inconsistent state.

    The message defaults to "Invalid stream state." when none is given.
    """

    def __init__(self, msg="Invalid stream state."):
        Exception.__init__(self, msg)
0029
def XML(text, fragment=1, encoding=None):
    """Parse a string of XML into a stream of events.

    text     - the XML source.  Unicode text is encoded as UTF-16 before
               parsing, and 'encoding' is replaced accordingly.
    fragment - when true, the text is wrapped in a synthetic <xml> element
               for parsing and that wrapper is stripped from the result.
               Text starting with an XML declaration or a DOCTYPE is always
               treated as a complete document.
    encoding - encoding of the source string.

    Returns the generator produced by ElementStream.strip() for fragments,
    and an ElementStream for complete documents.
    """
    # A prolog can only appear at the start of a full document.
    if text.startswith('<?xml ') or text.startswith('<!DOCTYPE '):
        fragment = 0
    if fragment:
        text = '<xml>%s</xml>' % text
    if isinstance(text, unicode):
        encoding = 'utf-16'
        text = text.encode(encoding)
    parser = Parser(QuickTextReader(text), encoding)
    # Kept on the parser so parse errors can quote the offending source.
    parser._sourcetext = text
    events = ElementStream(_coalesce(parser, encoding=encoding))
    if fragment:
        return events.strip()
    return events
0049
def document(file, encoding=None, filename=None):
    """Parse an XML document into an ElementStream.

    file     - an object with a read() method, or a resource name that is
               opened with open_resource() in binary mode.
    encoding - encoding of the source strings.
    filename - name recorded on the parser; defaults to the resource name,
               or '<string>' when a file-like object is given.
    """
    if hasattr(file, 'read'):
        if filename is None:
            filename = '<string>'
    else:
        if filename is None:
            filename = file
        file = open_resource(file, 'rb')
    parser = Parser(file, encoding)
    parser._filename = filename
    return ElementStream(_coalesce(parser, encoding=encoding))
0061
class ElementStream(object):

    """Provides a pull/streaming interface to ElementTree.

    Instances of this class are iterable. Most methods of the class act on
    the Element that is currently being visited.

    Iterating always returns the same underlying iterator, so the stream
    can be consumed only once and every method that reads events advances
    that shared position.

    """

    def __init__(self, stream, current=None):
        """Create an ElementStream.

        stream  - an iterator that returns ElementStream events, or an
                  object with 'tag' and 'attrib' attributes, which is
                  converted into events first.
        current - If an Element is provided in this parameter then
                  it is yielded as the first element in the stream.

        """
        if hasattr(stream, 'tag') and hasattr(stream, 'attrib'):
            stream = self._pull(stream, tail=1)
        self.current = None
        self._iter = self._track(iter(stream), current)

    def __iter__(self):
        """Return the single shared event iterator."""
        return self._iter

    def expand(self):
        """Expand the current item in the stream as an Element.

        Events are consumed and assembled into a tree: each started
        element is appended to the element that encloses it, and TEXT
        events become the text of the open element or the tail of the
        element closed last.  Consumption stops when the element that was
        current at call time is closed, or when the stream is exhausted.

        Returns the element that was closed last, or None when no END
        event was seen.
        """
        current = self.current
        if current is None:
            # No element is open; top-level elements are collected in a list.
            current = []
        stack = [current]
        last = None
        for ev, item in self._iter:
            if ev == START:
                current = item
                if len(stack) > 0:
                    stack[-1].append(current)
                last = None
                stack.append(current)
            elif ev == END:
                last = stack.pop()
                assert last is item
                if len(stack) == 0:
                    break
            elif ev == TEXT:
                # Text after a closed element is its tail; otherwise it is
                # the text of the element that is still open.
                if last is not None:
                    last.tail = item
                else:
                    current.text = item
        if isinstance(last, list):
            return last[0]
        else:
            return last

    def strip(self, levels=1):
        """Yield the stream's events with the outer elements removed.

        START and END events nested 'levels' deep or less are dropped;
        everything nested deeper is yielded, as are non-START/END events
        found exactly at depth 'levels'.  Iteration stops when the depth
        returns to zero.

        Raises InvalidStreamState when an END event takes the depth below
        zero.
        """
        depth = self.current is not None and 1 or 0
        for (ev, item) in self._iter:
            if ev == START:
                depth += 1
            if depth > levels or (depth == levels and ev not in (START, END)):
                yield (ev, item)
            if ev == END:
                depth -= 1
                if depth == 0:
                    break
                elif depth < 0:
                    raise InvalidStreamState()

    def eat(self):
        """Eat the current element and all descendant items.

        Events are discarded until the nesting depth returns to zero.
        Returns the stream itself.
        """
        depth = self.current is not None and 1 or 0
        for (ev, item) in self._iter:
            if ev == START:
                depth += 1
            elif ev == END:
                depth -= 1
                if depth == 0:
                    break
        return self

    def _pull(self, elem, tail=0):
        """Generate events for an element tree.

        Each element is yielded as a shallow copy (same tag, copied
        attributes) between START and END events, with TEXT events for its
        text and, when 'tail' is true, its tail.  Children are always
        pulled with their tails.  The source tree is left unmodified.
        """
        orig = elem
        elem = Element(orig.tag, dict(orig.attrib))
        text = orig.text
        if elem.tag in (Comment, ProcessingInstruction):
            # Comments and PIs carry their content on the element itself,
            # so no separate TEXT event is emitted for it.  A local is
            # cleared instead of orig.text so the source tree keeps its
            # content and can be pulled again.
            elem.text = text
            text = None
        yield (START, elem)
        if text:
            yield (TEXT, text)
        for child in orig.getchildren():
            for event in self._pull(child, tail=1):
                yield event
        yield (END, elem)
        if tail and orig.tail:
            yield (TEXT, orig.tail)

    def _track(self, stream, current=None):
        """Pass events through while keeping self.current up to date.

        self.current is set to the element of each START event and reset
        to None on each END event.  When 'current' is given, a START event
        for it is yielded before anything from 'stream'.
        """
        if current is not None:
            self.current = current
            yield (START, current)
        for p in stream:
            ev, item = p
            if ev == START:
                self.current = item
            elif ev == END:
                self.current = None
            yield (ev, item)

    def ensure(cls, stream, current=None):
        """Return 'stream' unchanged if it is already an instance of this
        class; otherwise wrap it in a new instance."""
        if isinstance(stream, cls):
            return stream
        else:
            return cls(stream, current)
    ensure = classmethod(ensure)
0178
0179
0180def to_unicode(value, encoding):
0181 if isinstance(value, unicode):
0182 return value
0183
0184 if hasattr(value, '__unicode__'):
0185 return unicode(value)
0186
0187 if not isinstance(value, str):
0188 value = str(value)
0189
0190 return unicode(value, encoding)
0191
0192
def _coalesce(stream, encoding, extended=1):
    """Coalesces TEXT events and namespace events.

    Fold multiple sequential TEXT events into a single event; empty text
    is dropped.  START_NS events are not yielded: the declarations they
    carry are written as 'xmlns' / 'xmlns:prefix' attributes on the next
    START element.  END_NS events are discarded.  Every other event is
    passed through unchanged.

    The 'encoding' attribute is for the source strings.
    The 'extended' argument is accepted but not used.
    """
    textbuf = []
    namespaces = []
    for ev, item in stream:
        if ev == TEXT:
            textbuf.append(item)
            continue
        if textbuf:
            # Any other event ends the current run of text.
            text = u"".join([to_unicode(value, encoding)
                             for value in textbuf])
            textbuf = []
            if text:
                yield (TEXT, text)
        if ev == START:
            # Attach the namespace declarations seen since the last START.
            attrib = item.attrib
            for prefix, uri in namespaces:
                if prefix:
                    attrib['xmlns:%s' % prefix] = uri
                else:
                    attrib['xmlns'] = uri
            namespaces = []
        elif ev == START_NS:
            prefix, uri = item
            namespaces.append((prefix, uri))
            continue
        elif ev == END_NS:
            continue
        yield ev, item
    if textbuf:
        # Text left over at the end of the stream.
        text = u"".join([to_unicode(value, encoding) for value in textbuf])
        if text:
            yield (TEXT, text)
0245
0246
# Event codes used in (event, item) tuples.
START, END, TEXT, DOCTYPE, XML_DECL = 1, 2, 3, 4, 5

# Namespace, processing-instruction and comment event codes.
START_NS, END_NS, PI, COMMENT = 10, 11, 12, 13
0258
def Parser(source, encoding=None):
    """Return the parser used to read events from 'source'.

    NOTE(review): 'encoding' is accepted but is not passed on to
    ExpatParser -- confirm whether that is intentional.
    """
    parser = ExpatParser(source)
    return parser
0261
0262
0263
0264
0265
0266
0267
0268
0269
0270
0271class ExpatParser(object):
0272
0273 def __init__(self, source, encoding=None):
0274 if not hasattr(source, 'read'):
0275 filename = source
0276 source = open(source, 'rb')
0277 else:
0278 filename = '<string>'
0279 self._filename = filename
0280 self._source = source
0281 self._parser = parser = expat.ParserCreate(encoding, "}")
0282 self._queue = []
0283
0284
0285 parser.DefaultHandler = self._default
0286 parser.StartElementHandler = self._start
0287 parser.EndElementHandler = self._end
0288 parser.CharacterDataHandler = self._data
0289 parser.ProcessingInstructionHandler = self._pi
0290 parser.CommentHandler = self._comment
0291 parser.StartNamespaceDeclHandler = self._start_ns
0292 parser.EndNamespaceDeclHandler = self._end_ns
0293 parser.XmlDeclHandler = self._xmldecl_handler
0294 parser.StartDoctypeDeclHandler = self._doctype_handler
0295
0296
0297 try:
0298 self._parser.buffer_text = 1
0299 except AttributeError:
0300 pass
0301
0302 try:
0303 self._parser.ordered_attributes = 1
0304 self._parser.specified_attributes = 1
0305 parser.StartElementHandler = self._start_list
0306 except AttributeError:
0307 pass
0308 self._doctype = None
0309
0310 self.entity = default_entity_map
0311 self.external_dtd = default_external_dtd
0312
0313 self._parser.SetParamEntityParsing(
0314 expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
0315 self._parser.ExternalEntityRefHandler = self._buildForeign
0316 self._parser.UseForeignDTD()
0317
0318 def _buildForeign(self, context, base, systemId, publicId):
0319 import StringIO
0320 parseableFile = StringIO.StringIO(default_external_dtd)
0321 original_parser = self._parser
0322 self._parser = self._parser.ExternalEntityParserCreate(context)
0323 self._parser.ParseFile(parseableFile)
0324 self._parser = original_parser
0325 return 1
0326
0327 def push(self, ev, stuff):
0328 self._queue.append( (ev, stuff) )
0329
0330 def _expat_stream(self):
0331 bufsize = 4 * 1024
0332 feed = self.feed
0333 read = self._source.read
0334 done = 0
0335 while 1:
0336 while not done and len(self._queue) == 0:
0337 data = read(bufsize)
0338 if data == '':
0339 self.close()
0340 done = 1
0341 else:
0342 feed(data)
0343 for i in self._queue:
0344 yield i
0345 self._queue = []
0346 if done:
0347 break
0348
0349 def __iter__(self):
0350 names = {}
0351
0352
0353 old_ns = 'http://naeblis.cx/ns/kid#'
0354 new_ns = 'http://purl.org/kid/ns#'
0355 def fixname(key):
0356 if key.startswith(old_ns):
0357 key = ''.join([new_ns, key[len(old_ns):]])
0358 try:
0359 name = names[key]
0360 except KeyError:
0361 name = key
0362 if "}" in name:
0363 name = "{" + name
0364 names[key] = name
0365 return name
0366
0367 stack = []
0368 parent = None
0369 current = None
0370 for (ev, stuff) in self._expat_stream():
0371 if ev == TEXT:
0372 yield (TEXT, stuff)
0373 elif ev == START:
0374 tag, attrib_in = stuff
0375 tag = fixname(tag)
0376 attrib = {}
0377 if attrib_in:
0378 for key, value in attrib_in.items():
0379 attrib[fixname(key)] = value
0380 parent = current
0381 current = Element(tag, attrib)
0382 stack.append(current)
0383 yield (START, current)
0384 elif ev == END:
0385 current = stack.pop()
0386 assert fixname(stuff) == current.tag
0387 parent = len(stack) and stack[-1] or None
0388 yield (END, current)
0389 elif ev == COMMENT:
0390 current = Comment(stuff)
0391 yield (START, current)
0392 yield (END, current)
0393 elif ev == PI:
0394 current = ProcessingInstruction(stuff[0], stuff[1])
0395 yield (START, current)
0396 yield (END, current)
0397 else:
0398 yield (ev, stuff)
0399
0400 def feed(self, data):
0401 try:
0402 self._parser.Parse(data, 0)
0403 except expat.ExpatError, e:
0404 e.filename = self._filename
0405 if hasattr(self, '_sourcetext'):
0406 line = e.lineno
0407 e.source = self._sourcetext.split('\n', line)[-1]
0408 else:
0409 e.source = '???'
0410 raise e
0411
0412 def close(self):
0413 if hasattr(self, '_parser'):
0414 self._parser.Parse('', 1)
0415 del self._parser
0416
0417 def _start(self, tag, attrib_in):
0418 self._queue.append((START, (tag, attrib_in)))
0419
0420 def _start_list(self, tag, attrib_in):
0421 attrib = None
0422 if attrib_in:
0423 attrib = {}
0424 for i in range(0, len(attrib_in), 2):
0425 attrib[attrib_in[i]] = attrib_in[i+1]
0426 self._queue.append((START, (tag, attrib)))
0427
0428 def _data(self, text):
0429 self._queue.append((TEXT, text))
0430
0431 def _end(self, tag):
0432 self._queue.append((END, tag))
0433
0434 def _default(self, text):
0435 prefix = text[:1]
0436 if prefix == "&":
0437
0438 try:
0439 self._queue.append((TEXT, self.entity[text[1:-1]]))
0440 except KeyError:
0441 from xml.parsers import expat
0442 raise expat.error(
0443 "undefined entity %s: line %d, column %d" %
0444 (text, self._parser.ErrorLineNumber,
0445 self._parser.ErrorColumnNumber)
0446 )
0447 else:
0448
0449
0450 pass
0451
0452 def _pi(self, target, data):
0453 self._queue.append((PI, (target, data)))
0454
0455 def _comment(self, text):
0456 self._queue.append((COMMENT, text))
0457
0458 def _start_ns(self, prefix, uri):
0459
0460
0461 if uri == 'http://naeblis.cx/ns/kid#':
0462 newuri = 'http://purl.org/kid/ns#'
0463 from warnings import warn
0464 warn('Document uses old kid namespace [%s] this should be changed'
0465 ' to [%s].' % (uri, newuri))
0466 uri = newuri
0467 self._queue.append((START_NS, (prefix or '', uri)))
0468
0469 def _end_ns(self, prefix):
0470 self._queue.append((END_NS, prefix or ''))
0471
0472 def _xmldecl_handler(self, version, encoding, standalone):
0473 self._queue.append((XML_DECL, (version, encoding, standalone)))
0474
0475 def _doctype_handler(self, name, sysid, pubid, has_internal_subset):
0476 self._queue.append((DOCTYPE, (name, pubid, sysid)))
0477
0478
0479
0480
0481
# Names made available by a star-import of this module.
__all__ = [
    'Element', 'SubElement', 'Comment', 'ProcessingInstruction',
    'ElementStream', 'XML', 'document', 'Parser', 'ExpatParser',
    'START', 'END', 'TEXT', 'COMMENT', 'PI', 'XML_DECL', 'DOCTYPE',
]