0001"""Infoset serialization formats (XML, XHTML, HTML, etc)"""
0002
0003from __future__ import generators
0004
0005__revision__ = "$Rev$"
0006__date__ = "$Date: 2005-02-16 15:43:38 -0500 (Wed, 16 Feb 2005) $"
0007__author__ = "Ryan Tomayko (rtomayko@gmail.com)"
0008__copyright__ = "Copyright 2004-2005, Ryan Tomayko"
0009__license__ = "MIT <http://www.opensource.org/licenses/mit-license.php>"
0010
0011import re
0012
0013from kid.et import *
0014from kid.pull import *
0015from kid.pull import _coalesce
0016
0017
0018import kid.namespace as namespace
0019
0020__all__ = ['doctypes', 'Serializer', 'XMLSerializer', 'HTMLSerializer']
0021
0022
0023
# Well-known document types, keyed by a short name.  Each value is a
# 3-tuple in the order expected by the
# '<!DOCTYPE %s PUBLIC "%s" "%s">' template used by serialize_doctype():
# (root element name, public identifier, system identifier).
doctypes = {
    'html-strict': (
        'HTML',
        '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd',
    ),
    'html': (
        'HTML',
        '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd',
    ),
    'xhtml-strict': (
        'html',
        '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd',
    ),
    'xhtml': (
        'html',
        '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
    ),
}
0033
0034
class Serializer(object):
    """Base class for event-stream serializers.

    Subclasses override generate() to turn a stream of (event, item)
    pairs into output strings.  This class supplies the stream filters
    shared by all serializers and the serialize()/write() conveniences
    that are built on top of generate().
    """

    namespaces = namespace.namespaces
    encoding = 'utf-8'
    # When true (and strip_whitespace is false), apply_filters() runs the
    # stream through balancing_filter().
    balanced_blocks = 1
    # When true, apply_filters() runs the stream through
    # whitespace_filter() instead.
    strip_whitespace = 0

    def __init__(self, encoding=None, src_encoding="utf-8"):
        """Remember the source encoding and, if given, the output encoding.

        An encoding of None leaves the class-level default in place.
        """
        self.src_encoding = src_encoding
        if encoding is not None:
            self.encoding = encoding

    def has_only_pcdata(self, tagname):
        """Tell whether the named element may only hold character data.

        The base implementation answers False for every tag.
        """
        return False

    def serialize(self, stream, encoding=None, fragment=0):
        """Return everything generate() yields, joined into one string."""
        chunks = list(self.generate(stream, encoding, fragment))
        return ''.join(chunks)

    def write(self, stream, file, encoding=None, fragment=0):
        """Write everything generate() yields to file.

        file is either an object with a write attribute or something
        open() accepts; in the latter case it is opened in 'wb' mode and
        closed again before returning, even if generation fails.
        """
        opened_here = not hasattr(file, 'write')
        if opened_here:
            file = open(file, 'wb')
        try:
            emit = file.write
            for chunk in self.generate(stream, encoding, fragment):
                emit(chunk)
        finally:
            # Only close a file that was opened by this method.
            if opened_here:
                file.close()

    def generate(self, stream, encoding=None, fragment=0):
        """Placeholder for subclasses; the base version returns None."""
        return None

    def apply_filters(self, stream):
        """Return stream wrapped in the filters this serializer enables.

        The stream always goes through _coalesce() with the source
        encoding first.  Whitespace stripping takes precedence over block
        balancing; at most one of the two is applied.
        """
        filtered = _coalesce(stream, self.src_encoding)
        if self.strip_whitespace:
            return self.whitespace_filter(filtered)
        if self.balanced_blocks:
            return self.balancing_filter(filtered)
        return filtered

    def balancing_filter(self, stream):
        """Filter the text that appears between consecutive tags.

        TEXT events are held back and only released in front of the next
        START/END event (Fragment elements excepted):

        * whitespace-only text is released with every run of two or more
          newlines collapsed to a single newline, and may be released
          again in front of later tags until new text arrives or it is
          cleared;
        * any other text is released once, in front of the first tag
          that follows it.

        After the START of a Comment, or of an element for which
        has_only_pcdata() is true, the held text is cleared.
        """
        collapse = re.compile('\n{2,}').sub
        pending = ''
        tags_since_text = 0
        for ev, item in stream:
            if ev == TEXT:
                pending = item
                tags_since_text = 0
                continue
            if ev not in (START, END) or item.tag == Fragment:
                yield (ev, item)
                continue
            if pending:
                if pending.strip() == '':
                    yield (TEXT, collapse('\n', pending))
                elif tags_since_text == 0:
                    yield (TEXT, pending)
            yield (ev, item)
            tags_since_text += 1
            # The pre-tag indentation has been used; do not repeat it
            # inside elements whose content is significant.
            if ev == START and (self.has_only_pcdata(item.tag)
                                or item.tag == Comment):
                pending = ''

    def whitespace_filter(self, stream):
        """Strip leading and trailing whitespace from every TEXT event."""
        for ev, item in stream:
            if ev != TEXT:
                yield (ev, item)
                continue
            yield (TEXT, item.strip())
0113
class XMLSerializer(Serializer):
    """Serialize an event stream as XML.

    Class-level defaults, each overridable through __init__():

    decl -- when true, generate() starts with an XML declaration.
    doctype -- None, or a 3-tuple accepted by serialize_doctype().
    """

    decl = 1
    doctype = None
    cdata_elements = []

    def __init__(self, encoding=None, decl=None, doctype=None,
                 namespaces=None):
        """Configure the serializer.

        Arguments left at None keep the class-level default.  doctype may
        be a tuple or a string; a string is looked up in the module-level
        doctypes table (KeyError if it is not a known name).
        """
        Serializer.__init__(self, encoding)
        if decl is not None:
            self.decl = decl
        if doctype is not None:
            self.doctype = doctype
        if isinstance(self.doctype, basestring):
            # Allow doctypes to be given by their short name.
            self.doctype = doctypes[self.doctype]
        if namespaces:
            self.namespaces = namespaces

    def can_be_empty_element(self, ns_stack, item_name):
        """Tell whether an element without content may be written as
        an empty-element tag.  Plain XML allows it for every element.
        """
        return True

    def generate(self, stream, encoding=None, fragment=0):
        """Serializes an event stream to bytes of the specified encoding.

        This function yields an encoded string over and over until the
        stream is exhausted.

        Unless fragment is true, the output starts with the XML
        declaration (if self.decl) and the DOCTYPE (if self.doctype is
        not None).
        """

        encoding = encoding or self.encoding or 'utf-8'
        escape_cdata = XMLSerializer.escape_cdata
        escape_attrib = XMLSerializer.escape_attrib

        lastev = None
        stream = iter(stream)
        names = NamespaceStack(self.namespaces)
        if not fragment:
            if self.decl:
                yield '<?xml version="1.0" encoding="%s"?>\n' % encoding
            if self.doctype is not None:
                yield serialize_doctype(self.doctype) + '\n'
        # Text is buffered until the next non-text event so that a start
        # tag followed directly by its end tag can be collapsed.
        text = None
        for ev, item in self.apply_filters(stream):
            if ev in (START, END) and item.tag == Fragment:
                continue
            elif ev == TEXT:
                if text is not None:
                    text = u''.join([text, item])
                else:
                    text = item
                continue
            if lastev == START:
                # The previous start tag is still open: either collapse
                # it into ' />' or close it with '>'.
                if (ev == END and (not text or not text.strip())
                        and self.can_be_empty_element(names, item.tag)):
                    yield ' />'
                    lastev = END
                    text = None
                    names.pop()
                    continue
                yield ">"
            if text:
                yield escape_cdata(text, encoding)
                text = None
            if ev == START:
                if item.tag == Comment:
                    yield "<!--%s-->" % item.text.encode(encoding)
                    lastev = COMMENT
                    continue
                elif item.tag == ProcessingInstruction:
                    yield "<?%s?>" % item.text.encode(encoding)
                    lastev = PI
                    continue
                else:
                    tag = item.tag
                    names.push(namespaces(item, remove=1))
                    qname = names.qname(tag, default=1)
                    yield "<" + qname.encode(encoding)
                    attrs = item.attrib.items()
                    if attrs:
                        for k, v in attrs:
                            qname = names.qname(k, default=0)
                            yield ' %s="%s"' % (qname.encode(encoding),
                                                escape_attrib(v, encoding))
                    # Declare every namespace registered in the scope of
                    # this element (including ones qname() just added).
                    for prefix, uri in names.current.items():
                        if prefix == '':
                            yield ' xmlns="%s"' % escape_attrib(uri, encoding)
                        else:
                            yield ' xmlns:%s="%s"' % (
                                prefix.encode(encoding),
                                escape_attrib(uri, encoding))
            elif ev == END and item.tag not in (Comment,
                                                ProcessingInstruction):
                qname = names.qname(item.tag, default=1)
                yield "</%s>" % qname.encode(encoding)
                names.pop()
            lastev = ev
        return

    def escape_cdata(text, encoding=None):
        """Escape character data.

        The text is encoded to encoding (when one is given) and '&' and
        '<' are replaced by '&amp;' and '&lt;'.  If the text cannot be
        encoded, the result of _encode_entity() is returned instead.
        Input without the needed string methods is reported through
        _raise_serialization_error().
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return _encode_entity(text)
            # '&' goes first so the ampersand of '&lt;' is not escaped
            # a second time.
            text = text.replace("&", "&amp;")
            text = text.replace("<", "&lt;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)
    escape_cdata = staticmethod(escape_cdata)

    def escape_attrib(text, encoding=None):
        """Escape attribute value.

        Works like escape_cdata() and additionally replaces the double
        quote with '&quot;', because generate() always delimits
        attribute values with double quotes.
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return _encode_entity(text)
            text = text.replace("&", "&amp;")
            text = text.replace("<", "&lt;")
            text = text.replace("\"", "&quot;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)
    escape_attrib = staticmethod(escape_attrib)
0240
0241
0242
# Make sure a callable named ``set`` exists on every interpreter.
try:
    set
except NameError:
    # No builtin set type: try the ``sets`` module next.
    try:
        from sets import Set as set
    except ImportError:
        # Last resort: hand the sequence back unchanged.  It still
        # supports the ``in`` tests and iteration this module needs.
        def set(seq):
            return seq
0251
0252import kid.namespace as namespace
# URI of the XHTML namespace; HTMLSerializer.generate() compares element
# and attribute namespace URIs against it.
xhtml = namespace.xhtml.uri
0254import string
0255
class HTMLSerializer(Serializer):
    """Serialize an event stream as HTML.

    Class-level configuration:

    doctype -- 3-tuple for serialize_doctype(); defaults to the
        'html' entry of the doctypes table.
    transpose -- callable applied to HTML element and attribute names
        before output (upper-casing by default), or None for no change.
    inject_type -- when true, a Content-Type <meta> element is written
        right after every <head> start tag.
    empty_elements -- lower-case names of elements written without an
        end tag.
    elements_with_pcdata -- names for which has_only_pcdata() is true.
    noescape_elements -- lower-case names of elements whose text is
        written without escaping.
    boolean_attributes -- lower-case names of attributes written in
        minimized form (name only).
    """

    doctype = doctypes['html']
    transpose = string.upper
    transpose = staticmethod(transpose)
    inject_type = 1
    empty_elements = set(['area', 'base', 'basefont', 'br', 'col', 'frame',
                          'hr', 'img', 'input', 'isindex', 'link', 'meta',
                          'param'])

    elements_with_pcdata = set(['option', 'textarea', 'fieldset', 'title'])
    noescape_elements = set(['script', 'style'])
    boolean_attributes = set(['selected', 'checked', 'compact', 'declare',
                              'defer', 'disabled', 'ismap', 'multiple',
                              'nohref', 'noresize', 'noshade', 'nowrap'])

    def __init__(self, encoding='utf-8', doctype=None, transpose=None):
        """Configure the serializer.

        A false doctype or transpose keeps the class-level default.
        doctype may be a tuple or a string; a string is looked up in the
        module-level doctypes table (KeyError if it is not a known name).
        """
        Serializer.__init__(self, encoding)
        if doctype:
            self.doctype = doctype
        if isinstance(self.doctype, basestring):
            # Allow doctypes to be given by their short name.
            self.doctype = doctypes[self.doctype]
        if transpose:
            self.transpose = transpose

    def has_only_pcdata(self, tagname):
        """Tell whether tagname is listed in elements_with_pcdata.

        A '{uri}local' string is reduced to its local part first.
        """
        if isinstance(tagname, basestring) and tagname[0] == '{':
            tagname = tagname.split('}')[1]
        return tagname in self.elements_with_pcdata

    def generate(self, stream, encoding=None, fragment=0):
        """Serializes an event stream to bytes of the specified encoding.

        This function yields an encoded string over and over until the
        stream is exhausted.

        Unless fragment is true, the output starts with the DOCTYPE (if
        self.doctype is not None).
        """

        encoding = encoding or self.encoding or 'utf-8'

        escape_cdata = HTMLSerializer.escape_cdata
        escape_attrib = HTMLSerializer.escape_attrib
        noescape_elements = self.noescape_elements
        boolean_attributes = self.boolean_attributes
        empty_elements = self.empty_elements

        names = NamespaceStack(self.namespaces)

        def grok_name(tag):
            """Split tag into (namespace uri, local name, output name).

            Names outside the XHTML namespace keep a namespace prefix;
            all other names are written bare and passed through
            self.transpose when it is set.
            """
            if tag[0] == '{':
                uri, localname = tag[1:].split('}', 1)
            else:
                uri, localname = None, tag
            if uri and uri != xhtml:
                qname = names.qname(tag, default=0)
            else:
                qname = localname
                if self.transpose is not None:
                    qname = self.transpose(qname)
            return (uri, localname, qname)

        # Attribute names of the injected Content-Type <meta> element.
        # The untransposed fallback keeps them defined when transpose has
        # been disabled, which grok_name() explicitly allows.
        if self.transpose:
            attr_http_equiv = self.transpose('http-equiv')
            attr_content = self.transpose('content')
        else:
            attr_http_equiv = 'http-equiv'
            attr_content = 'content'

        # `current` is the lower-cased output name of the innermost open
        # element; `stack` holds the names of all open elements, with a
        # None sentinel at the bottom.
        current = None
        stack = [current]
        stream = iter(stream)
        if not fragment and self.doctype is not None:
            yield serialize_doctype(self.doctype) + '\n'
        for ev, item in self.apply_filters(stream):
            if ev == TEXT and item:
                escape = current not in noescape_elements
                yield escape_cdata(item, encoding, escape)
            elif ev == START:
                if item.tag == Comment:
                    yield "<!--%s-->" % item.text.encode(encoding)
                    continue
                elif item.tag == ProcessingInstruction:
                    yield "<?%s>" % item.text.encode(encoding)
                    continue
                elif item.tag == Fragment:
                    continue
                else:
                    names.push(namespaces(item, remove=1))
                    tag = item.tag
                    (uri, localname, qname) = grok_name(tag)

                    # Push this name on the stack so we know where we are.
                    current = qname.lower()
                    stack.append(current)

                    yield "<" + qname.encode(encoding)
                    attrs = item.attrib.items()
                    if attrs:
                        for k, v in attrs:
                            (u, l, q) = grok_name(k)
                            lq = q.lower()
                            if lq == 'xml:lang':
                                continue
                            if lq in boolean_attributes:
                                # Minimized form: the value is ignored,
                                # whatever it is.
                                yield ' %s' % q.encode(encoding)
                            else:
                                yield ' %s="%s"' % (q.encode(encoding),
                                                    escape_attrib(v, encoding))
                    yield ">"
                    if self.inject_type:
                        if current == 'head':
                            (uri, localname, qname) = grok_name("meta")
                            yield ('<%s %s="text/html; charset=%s"'
                                   ' %s="Content-Type">'
                                   % (qname.encode(encoding),
                                      attr_content,
                                      encoding,
                                      attr_http_equiv))

            elif ev == END and item.tag not in (Comment,
                                                ProcessingInstruction,
                                                Fragment):
                current = stack.pop()
                if current not in empty_elements:
                    tag = item.tag
                    (uri, localname, qname) = grok_name(tag)
                    yield "</%s>" % qname.encode(encoding)
                current = stack[-1]
                names.pop()
        return

    def escape_cdata(text, encoding=None, escape=1):
        """Escape character data.

        The text is encoded to encoding (when one is given).  Unless
        escape is false, '&' and '<' are then replaced by '&amp;' and
        '&lt;'.  If the text cannot be encoded, the result of
        _encode_entity() is returned instead.  Input without the needed
        string methods is reported through _raise_serialization_error().
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return _encode_entity(text)
            if escape:
                # '&' goes first so the ampersand of '&lt;' is not
                # escaped a second time.
                text = text.replace("&", "&amp;")
                text = text.replace("<", "&lt;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)
    escape_cdata = staticmethod(escape_cdata)

    def escape_attrib(text, encoding=None):
        """Escape attribute value.

        The text is encoded to encoding (when one is given) and '&' and
        the double quote are replaced by '&amp;' and '&quot;'; generate()
        always delimits attribute values with double quotes.  '<' is left
        alone here.
        """
        try:
            if encoding:
                try:
                    text = text.encode(encoding)
                except UnicodeError:
                    return _encode_entity(text)
            text = text.replace("&", "&amp;")
            text = text.replace("\"", "&quot;")
            return text
        except (TypeError, AttributeError):
            _raise_serialization_error(text)
    escape_attrib = staticmethod(escape_attrib)
0418
class XHTMLSerializer(XMLSerializer):
    """XML serializer that applies HTMLSerializer's element lists to
    names in the XHTML namespace.

    Both lists are built by passing each HTMLSerializer name through
    namespace.xhtml.clarkname().
    """

    empty_elements = [
        namespace.xhtml.clarkname(name)
        for name in HTMLSerializer.empty_elements
    ]
    elements_with_pcdata = [
        namespace.xhtml.clarkname(name)
        for name in HTMLSerializer.elements_with_pcdata
    ]

    def can_be_empty_element(self, ns_stack, tagname):
        """Only elements listed in empty_elements may be collapsed."""
        collapsible = self.empty_elements
        return tagname in collapsible

    def has_only_pcdata(self, tagname):
        """Tell whether tagname is listed in elements_with_pcdata."""
        pcdata_only = self.elements_with_pcdata
        return tagname in pcdata_only
0428
class PlainSerializer(Serializer):
    """Serializer that outputs only the text of a stream."""

    def generate(self, stream, encoding=None, fragment=0):
        """Yield the item of every TEXT event and drop all other events.

        Text is yielded exactly as it leaves apply_filters(); it is not
        encoded or escaped here.
        """
        # Resolved the same way as in the other serializers, but not
        # used by the loop below.
        encoding = encoding or self.encoding or 'utf-8'
        for ev, item in self.apply_filters(stream):
            if ev != TEXT:
                continue
            yield item
0437
0438
class NamespaceStack:

    """Tracks the namespace prefix to URI mappings that are in scope.

    self.stack holds one mapping per scope with the innermost scope at
    index 0; self.current is that innermost mapping.
    """

    def __init__(self, default_map=namespace.namespaces):
        """Start with a single empty scope.

        default_map is consulted by qname() (via its get method, keyed
        by URI) when a URI has no prefix in scope.
        """
        self.default_map = default_map
        self.stack = []
        self.push()
        # Counter behind the generated 'ns1', 'ns2', ... prefixes.
        self.ns_count = 0

    def push(self, names=None):
        """Enter a new scope, using names (or a fresh dict) as its map."""
        scope = names
        if scope is None:
            scope = {}
        self.stack.insert(0, scope)
        self.current = scope

    def pop(self):
        """Leave the innermost scope.

        self.current is only updated while at least one scope remains.
        """
        self.stack.pop(0)
        if self.stack:
            self.current = self.stack[0]

    def resolve_prefix(self, uri, default=1):
        """Figure out prefix given a URI.

        Returns 'xml' for the XML namespace, '' when default is true and
        the default namespace found first is uri, a declared non-empty
        prefix if one was found, and None otherwise.
        """

        if uri == 'http://www.w3.org/XML/1998/namespace':
            return 'xml'

        # -1 until a default-namespace declaration has been seen, then
        # the result of comparing that declaration with uri.
        is_default = -1
        found = None
        for scope in self.stack:
            for declared_prefix, declared_uri in scope.items():
                if default and is_default == -1 and declared_prefix == '':
                    # This is the current default namespace.
                    is_default = (declared_uri == uri)
                    if (default and is_default) or found:
                        break
                if declared_uri == uri and declared_prefix != '':
                    found = declared_prefix
                    if is_default > -1:
                        break
        if default and is_default == 1:
            return ''
        if found:
            return found
        return None

    def resolve_uri(self, prefix):
        """Figure out URI given a prefix.

        Scopes are searched innermost first; None is returned when no
        scope maps the prefix to a non-empty URI.
        """

        if prefix == 'xml':
            return 'http://www.w3.org/XML/1998/namespace'
        for scope in self.stack:
            candidate = scope.get(prefix)
            if candidate:
                return candidate
        return None

    def qname(self, cname, default=0):
        """Turn a '{uri}local' name into 'prefix:local' (or 'local').

        Names that do not start with '{' are returned unchanged.  When
        the URI has no prefix in scope, one is registered in the current
        scope: the prefix from default_map if it has one, else the
        default namespace if default is true and the current scope has
        none yet, else a generated 'nsN' prefix.
        """
        if isinstance(cname, QName):
            cname = cname.text
        if cname[0] != '{':
            return cname
        uri, localname = cname[1:].split('}', 1)
        prefix = self.resolve_prefix(uri, default)
        if prefix is None:
            # See if the default map knows a prefix for this URI.
            prefix = self.default_map.get(uri)
            if prefix is None:
                if default and not self.current.has_key(''):
                    prefix = ''
                else:
                    # Generated prefixes are not checked for collisions.
                    self.ns_count += 1
                    prefix = 'ns%d' % self.ns_count
            self.current[prefix] = uri
        if prefix == '':
            return localname
        return '%s:%s' % (prefix, localname)

    def set(self, prefix, uri):
        """Map prefix to uri in the current scope; None means default."""
        key = prefix
        if key is None:
            key = ''
        self.current[key] = uri
0528
0529
0530
0531
0532from kid.et import ET
# Module-level shortcuts to ElementTree's private helpers, used by the
# serializers' escape_cdata()/escape_attrib() functions.
_encode_entity, _raise_serialization_error = (
    ET._encode_entity,
    ET._raise_serialization_error,
)
0535
def serialize_doctype(doctype):
    """Return the DOCTYPE declaration for a doctype tuple.

    doctype is a 3-tuple: root element name, public identifier and
    system identifier, in that order.
    """
    template = '<!DOCTYPE %s PUBLIC "%s" "%s">'
    return template % doctype