Commit e96dcb61 authored by Sam Ruby's avatar Sam Ruby

Resync with html5lib (r491)

parent be5c093b
This diff is collapsed.
......@@ -11,30 +11,25 @@ References:
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Build a Treebuilder that produces Python DOM objects:
http://docs.python.org/lib/module-xml.dom.html
* Produce SAX events based on the produced DOM. This is intended not to
support streaming, but rather to support application level compatibility.
* Optional namespace support
* Special case the output of XHTML <script> elements so that the empty
element syntax is never used, even when the src attribute is provided.
Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
indicates CDATA processing to ensure dual HTML/XHTML compatibility.
* Map illegal XML characters to U+FFFD, possibly with additional markup in
the case of XHTML
* Selectively lowercase only XHTML, but not foreign markup
"""
import html5parser
from constants import voidElements
import gettext
_ = gettext.gettext
class XHTMLParser(html5parser.HTMLParser):
""" liberal XMTHML parser """
class XMLParser(html5parser.HTMLParser):
""" liberal XML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
self.phases["initial"] = XmlRootPhase(self, self.tree)
def normalizeToken(self, token):
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
......@@ -51,6 +46,35 @@ class XHTMLParser(html5parser.HTMLParser):
token["data"] = {}
token["type"] = "EndTag"
elif token["type"] == "EndTag":
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
return token
class XHTMLParser(XMLParser):
    """Liberal XHTML parser.

    Builds on XMLParser: installs an XHTML-specific "rootElement" phase and
    pads empty non-void XHTML elements with an empty text node so that they
    are written with separate open and close tags.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): HTMLParser.__init__ is called directly, so anything
        # XMLParser.__init__ sets up is skipped -- confirm this is intended.
        html5parser.HTMLParser.__init__(self, *args, **kwargs)
        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

    def normalizeToken(self, token):
        """Normalize *token* via XMLParser, then pad empty XHTML elements.

        Returns the token produced by XMLParser.normalizeToken.
        """
        token = XMLParser.normalizeToken(self, token)

        # ensure that non-void XHTML elements have content so that separate
        # open and close tags are emitted
        if token["type"] == "EndTag" and \
           token["name"] not in voidElements and \
           token["name"] == self.tree.openElements[-1].name and \
           not self.tree.openElements[-1].hasContent():
            # for/else (rather than any()) keeps this usable on old Pythons.
            for e in self.tree.openElements:
                if 'xmlns' in e.attributes.keys():
                    if e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml':
                        # an open element declares a non-XHTML namespace:
                        # leave the element empty
                        break
            else:
                # no open element declared a foreign namespace
                self.tree.insertText('')

        return token
class XhmlRootPhase(html5parser.RootElementPhase):
......@@ -60,13 +84,6 @@ class XhmlRootPhase(html5parser.RootElementPhase):
self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"]
class XMLParser(XHTMLParser):
""" liberal XML parser """
def __init__(self, *args, **kwargs):
XHTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlRootPhase(self, self.tree)
class XmlRootPhase(html5parser.Phase):
""" Prime the Xml parser """
def __getattr__(self, name):
......
This diff is collapsed.
......@@ -33,4 +33,10 @@ the various methods.
import os.path
__path__.append(os.path.dirname(__path__[0]))
import dom, etree, simpletree
import dom
import simpletree
try:
    import etree
except ImportError:
    # The etree tree builder is optional; carry on without it when the
    # module (or something it imports) is not installed.
    pass
from constants import scopingElements, tableInsertModeElements
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
......
......@@ -14,6 +14,10 @@ class AttrList:
self.element.setAttribute(name, value)
def items(self):
return self.element.attributes.items()
def keys(self):
return self.element.attributes.keys()
def __getitem__(self, name):
return self.element.getAttribute(name)
class NodeBuilder(_base.Node):
def __init__(self, element):
......
import _base
from constants import voidElements
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
......@@ -13,6 +14,9 @@ class Node(_base.Node):
def __unicode__(self):
return self.name
def toxml(self):
raise NotImplementedError
def __repr__(self):
return "<%s %s>" % (self.__class__, self.name)
......@@ -71,18 +75,24 @@ class Document(Node):
def __unicode__(self):
return "#document"
def toxml(self, encoding="utf-8"):
    """Serialize the document's children to XML and encode the result.

    Concatenates ``child.toxml()`` for every node in ``self.childNodes`` and
    returns that text encoded with *encoding* (UTF-8 by default).
    """
    # A list comprehension (not a generator) keeps this usable on old Pythons.
    return "".join([child.toxml() for child in self.childNodes]).encode(encoding)
def hilite(self, encoding="utf-8"):
result = "<pre>"
for child in self.childNodes:
result += child.hilite()
return result.encode(encoding) + "</pre>"
def printTree(self):
tree = unicode(self)
for child in self.childNodes:
tree += child.printTree(2)
return tree
def toxml(self, encoding="utf-8"):
    """Serialize the document's children to XML and encode the result.

    Concatenates ``child.toxml()`` for every node in ``self.childNodes`` and
    returns that text encoded with *encoding* (UTF-8 by default).
    """
    # A list comprehension (not a generator) keeps this usable on old Pythons.
    return ''.join([child.toxml() for child in self.childNodes]).encode(encoding)
class DocumentType(Node):
def __init__(self, name):
Node.__init__(self, name)
......@@ -90,6 +100,11 @@ class DocumentType(Node):
def __unicode__(self):
return "<!DOCTYPE %s>" % self.name
toxml = __unicode__
def hilite(self):
    """Return the doctype as syntax-highlighted HTML markup.

    The name is formatted with ``%s`` and then XML-escaped, so markup
    characters in it cannot leak into the generated HTML.
    """
    doctype_name = escape("%s" % self.name)
    return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % doctype_name
class TextNode(Node):
def __init__(self, value):
Node.__init__(self, None)
......@@ -100,6 +115,8 @@ class TextNode(Node):
def toxml(self):
return escape(self.value)
hilite = toxml
class Element(Node):
def __init__(self, name):
......@@ -109,16 +126,6 @@ class Element(Node):
def __unicode__(self):
return "<%s>" % self.name
def printTree(self, indent):
tree = '\n|%s%s' % (' '*indent, unicode(self))
indent += 2
if self.attributes:
for name, value in self.attributes.iteritems():
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)
return tree
def toxml(self):
result = '<' + self.name
if self.attributes:
......@@ -132,6 +139,29 @@ class Element(Node):
else:
result += '/>'
return result
def hilite(self):
    """Return this element and its children as syntax-highlighted HTML.

    Emits the start tag with its attributes (attribute values are
    XML-escaped, including double quotes), then each child's ``hilite()``
    output, then the end tag. A childless element whose name is in
    ``voidElements`` gets a start tag only.
    """
    result = '&lt;<code class="markup element-name">%s</code>' % self.name
    if self.attributes:
        for name, value in self.attributes.items():
            result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'&quot;'}))
    # Always close the start tag; an empty non-void element used to be
    # rendered without its ">".
    result += ">"
    if not self.childNodes and self.name in voidElements:
        return result
    for child in self.childNodes:
        result += child.hilite()
    return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name
def printTree(self, indent):
tree = '\n|%s%s' % (' '*indent, unicode(self))
indent += 2
if self.attributes:
for name, value in self.attributes.iteritems():
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)
return tree
class CommentNode(Node):
def __init__(self, data):
......@@ -140,8 +170,12 @@ class CommentNode(Node):
def __unicode__(self):
return "<!-- %s -->" % self.data
def toxml(self):
return "<!--%s-->" % self.data
toxml = __unicode__
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment