Source code for html5lib.html5parser

from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass, viewkeys

import types

from . import _inputstream
from . import _tokenizer

from . import treebuilders
from .treebuilders.base import Marker

from . import _utils
from .constants import (
    spaceCharacters, asciiUpper2Lower,
    specialElements, headingElements, cdataElements, rcdataElements,
    tokenTypes, tagTokenTypes,
    namespaces,
    htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
    adjustForeignAttributes as adjustForeignAttributesMap,
    adjustMathMLAttributes, adjustSVGAttributes,
    E,
    _ReparseException
)


def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML document as a string or file-like object into a tree

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parse(doc, **kwargs)


def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5libparser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, **kwargs)


def method_decorator_metaclass(function):
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            for attributeName, attribute in classDict.items():
                if isinstance(attribute, types.FunctionType):
                    attribute = function(attribute)

                classDict[attributeName] = attribute
            return type.__new__(meta, classname, bases, classDict)
    return Decorated


class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned. Built in treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType)

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()                     # generates parser with etree builder
        >>> parser = HTMLParser('lxml', strict=True)  # generates parser with lxml builder which is strict

        """

        # Raise an exception on the first error encountered
        self.strict = strict

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        self.phases = {name: cls(self, self.tree) for name, cls in
                       getPhases(debug).items()}

[docs] def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): self.innerHTMLMode = innerHTML self.container = container self.scripting = scripting self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) self.reset() try: self.mainLoop() except _ReparseException: self.reset() self.mainLoop()
[docs] def reset(self): self.tree.reset() self.firstStartTag = False self.errors = [] self.log = [] # only used with debug mode # "quirks" / "limited quirks" / "no quirks" self.compatMode = "no quirks" if self.innerHTMLMode: self.innerHTML = self.container.lower() if self.innerHTML in cdataElements: self.tokenizer.state = self.tokenizer.rcdataState elif self.innerHTML in rcdataElements: self.tokenizer.state = self.tokenizer.rawtextState elif self.innerHTML == 'plaintext': self.tokenizer.state = self.tokenizer.plaintextState else: # state already is data state # self.tokenizer.state = self.tokenizer.dataState pass self.phase = self.phases["beforeHtml"] self.phase.insertHtmlElement() self.resetInsertionMode() else: self.innerHTML = False # pylint:disable=redefined-variable-type self.phase = self.phases["initial"] self.lastPhase = None self.beforeRCDataPhase = None self.framesetOK = True
@property def documentEncoding(self): """Name of the character encoding that was used to decode the input stream, or :obj:`None` if that is not determined yet """ if not hasattr(self, 'tokenizer'): return None return self.tokenizer.stream.charEncoding[0].name
[docs] def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and element.namespace == namespaces["mathml"]): return ("encoding" in element.attributes and element.attributes["encoding"].translate( asciiUpper2Lower) in ("text/html", "application/xhtml+xml")) else: return (element.namespace, element.name) in htmlIntegrationPointElements
[docs] def isMathMLTextIntegrationPoint(self, element): return (element.namespace, element.name) in mathmlTextIntegrationPointElements
[docs] def mainLoop(self): CharactersToken = tokenTypes["Characters"] SpaceCharactersToken = tokenTypes["SpaceCharacters"] StartTagToken = tokenTypes["StartTag"] EndTagToken = tokenTypes["EndTag"] CommentToken = tokenTypes["Comment"] DoctypeToken = tokenTypes["Doctype"] ParseErrorToken = tokenTypes["ParseError"] for token in self.tokenizer: prev_token = None new_token = token while new_token is not None: prev_token = new_token currentNode = self.tree.openElements[-1] if self.tree.openElements else None currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeName = currentNode.name if currentNode else None type = new_token["type"] if type == ParseErrorToken: self.parseError(new_token["data"], new_token.get("datavars", {})) new_token = None else: if (len(self.tree.openElements) == 0 or currentNodeNamespace == self.tree.defaultNamespace or (self.isMathMLTextIntegrationPoint(currentNode) and ((type == StartTagToken and token["name"] not in frozenset(["mglyph", "malignmark"])) or type in (CharactersToken, SpaceCharactersToken))) or (currentNodeNamespace == namespaces["mathml"] and currentNodeName == "annotation-xml" and type == StartTagToken and token["name"] == "svg") or (self.isHTMLIntegrationPoint(currentNode) and type in (StartTagToken, CharactersToken, SpaceCharactersToken))): phase = self.phase else: phase = self.phases["inForeignContent"] if type == CharactersToken: new_token = phase.processCharacters(new_token) elif type == SpaceCharactersToken: new_token = phase.processSpaceCharacters(new_token) elif type == StartTagToken: new_token = phase.processStartTag(new_token) elif type == EndTagToken: new_token = phase.processEndTag(new_token) elif type == CommentToken: new_token = phase.processComment(new_token) elif type == DoctypeToken: new_token = phase.processDoctype(new_token) if (type == StartTagToken and prev_token["selfClosing"] and not prev_token["selfClosingAcknowledged"]): self.parseError("non-void-element-with-trailing-solidus", {"name": prev_token["name"]}) # When the loop finishes it's EOF reprocess = True phases = [] while reprocess: phases.append(self.phase) reprocess = self.phase.processEOF() if reprocess: assert self.phase not in phases
[docs] def parse(self, stream, *args, **kwargs): """Parse a HTML document into a well-formed tree :arg stream: a file-like object or string containing the HTML to be parsed The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element). :arg scripting: treat noscript elements as if JavaScript was turned on :returns: parsed tree Example: >>> from html5lib.html5parser import HTMLParser >>> parser = HTMLParser() >>> parser.parse('<html><body><p>This is a doc</p></body></html>') <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> """ self._parse(stream, False, None, *args, **kwargs) return self.tree.getDocument()
[docs] def parseFragment(self, stream, *args, **kwargs): """Parse a HTML fragment into a well-formed tree fragment :arg container: name of the element we're setting the innerHTML property if set to None, default to 'div' :arg stream: a file-like object or string containing the HTML to be parsed The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) :arg scripting: treat noscript elements as if JavaScript was turned on :returns: parsed tree Example: >>> from html5lib.html5libparser import HTMLParser >>> parser = HTMLParser() >>> parser.parseFragment('<b>this is a fragment</b>') <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> """ self._parse(stream, True, *args, **kwargs) return self.tree.getFragment()
[docs] def parseError(self, errorcode="XXX-undefined-error", datavars=None): # XXX The idea is to make errorcode mandatory. if datavars is None: datavars = {} self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) if self.strict: raise ParseError(E[errorcode] % datavars)
[docs] def adjustMathMLAttributes(self, token): adjust_attributes(token, adjustMathMLAttributes)
[docs] def adjustSVGAttributes(self, token): adjust_attributes(token, adjustSVGAttributes)
[docs] def adjustForeignAttributes(self, token): adjust_attributes(token, adjustForeignAttributesMap)
[docs] def reparseTokenNormal(self, token): # pylint:disable=unused-argument self.parser.phase()
[docs] def resetInsertionMode(self): # The name of this method is mostly historical. (It's also used in the # specification.) last = False newModes = { "select": "inSelect", "td": "inCell", "th": "inCell", "tr": "inRow", "tbody": "inTableBody", "thead": "inTableBody", "tfoot": "inTableBody", "caption": "inCaption", "colgroup": "inColumnGroup", "table": "inTable", "head": "inBody", "body": "inBody", "frameset": "inFrameset", "html": "beforeHead" } for node in self.tree.openElements[::-1]: nodeName = node.name new_phase = None if node == self.tree.openElements[0]: assert self.innerHTML last = True nodeName = self.innerHTML # Check for conditions that should only happen in the innerHTML # case if nodeName in ("select", "colgroup", "head", "html"): assert self.innerHTML if not last and node.namespace != self.tree.defaultNamespace: continue if nodeName in newModes: new_phase = self.phases[newModes[nodeName]] break elif last: new_phase = self.phases["inBody"] break self.phase = new_phase
[docs] def parseRCDataRawtext(self, token, contentType): # Generic RCDATA/RAWTEXT Parsing algorithm assert contentType in ("RAWTEXT", "RCDATA") self.tree.insertElement(token) if contentType == "RAWTEXT": self.tokenizer.state = self.tokenizer.rawtextState else: self.tokenizer.state = self.tokenizer.rcdataState self.originalPhase = self.phase self.phase = self.phases["text"]
@_utils.memoize def getPhases(debug): def log(function): """Logger that records which phase processes each token""" type_names = {value: key for key, value in tokenTypes.items()} def wrapped(self, *args, **kwargs): if function.__name__.startswith("process") and len(args) > 0: token = args[0] info = {"type": type_names[token['type']]} if token['type'] in tagTokenTypes: info["name"] = token['name'] self.parser.log.append((self.parser.tokenizer.state.__name__, self.parser.phase.__class__.__name__, self.__class__.__name__, function.__name__, info)) return function(self, *args, **kwargs) else: return function(self, *args, **kwargs) return wrapped def getMetaclass(use_metaclass, metaclass_func): if use_metaclass: return method_decorator_metaclass(metaclass_func) else: return type # pylint:disable=unused-argument class Phase(with_metaclass(getMetaclass(debug, log))): """Base class for helper object that implements each phase of processing """ __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") def __init__(self, parser, tree): self.parser = parser self.tree = tree self.__startTagCache = {} self.__endTagCache = {} def processEOF(self): raise NotImplementedError def processComment(self, token): # For most phases the following is correct. Where it's not it will be # overridden. self.tree.insertComment(token, self.tree.openElements[-1]) def processDoctype(self, token): self.parser.parseError("unexpected-doctype") def processCharacters(self, token): self.tree.insertText(token["data"]) def processSpaceCharacters(self, token): self.tree.insertText(token["data"]) def processStartTag(self, token): # Note the caching is done here rather than BoundMethodDispatcher as doing it there # requires a circular reference to the Phase, and this ends up with a significant # (CPython 2.7, 3.8) GC cost when parsing many short inputs name = token["name"] # In Py2, using `in` is quicker in general than try/except KeyError # In Py3, `in` is quicker when there are few cache hits (typically short inputs) if name in self.__startTagCache: func = self.__startTagCache[name] else: func = self.__startTagCache[name] = self.startTagHandler[name] # bound the cache size in case we get loads of unknown tags while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 self.__startTagCache.pop(next(iter(self.__startTagCache))) return func(token) def startTagHtml(self, token): if not self.parser.firstStartTag and token["name"] == "html": self.parser.parseError("non-html-root") # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parseError(). for attr, value in token["data"].items(): if attr not in self.tree.openElements[0].attributes: self.tree.openElements[0].attributes[attr] = value self.parser.firstStartTag = False def processEndTag(self, token): # Note the caching is done here rather than BoundMethodDispatcher as doing it there # requires a circular reference to the Phase, and this ends up with a significant # (CPython 2.7, 3.8) GC cost when parsing many short inputs name = token["name"] # In Py2, using `in` is quicker in general than try/except KeyError # In Py3, `in` is quicker when there are few cache hits (typically short inputs) if name in self.__endTagCache: func = self.__endTagCache[name] else: func = self.__endTagCache[name] = self.endTagHandler[name] # bound the cache size in case we get loads of unknown tags while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 self.__endTagCache.pop(next(iter(self.__endTagCache))) return func(token) class InitialPhase(Phase): __slots__ = tuple() def processSpaceCharacters(self, token): pass def processComment(self, token): self.tree.insertComment(token, self.tree.document) def processDoctype(self, token): name = token["name"] publicId = token["publicId"] systemId = token["systemId"] correct = token["correct"] if (name != "html" or publicId is not None or systemId is not None and systemId != "about:legacy-compat"): self.parser.parseError("unknown-doctype") if publicId is None: publicId = "" self.tree.insertDoctype(token) if publicId != "": publicId = publicId.translate(asciiUpper2Lower) if (not correct or token["name"] != "html" or publicId.startswith( ("+//silmaril//dtd html pro v0r11 19970101//", "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", "-//as//dtd html 3.0 aswedit + extensions//", "-//ietf//dtd html 2.0 level 1//", "-//ietf//dtd html 2.0 level 2//", "-//ietf//dtd html 2.0 strict level 1//", "-//ietf//dtd html 2.0 strict level 2//", "-//ietf//dtd html 2.0 strict//", "-//ietf//dtd html 2.0//", "-//ietf//dtd html 2.1e//", "-//ietf//dtd html 3.0//", "-//ietf//dtd html 3.2 final//", "-//ietf//dtd html 3.2//", "-//ietf//dtd html 3//", "-//ietf//dtd html level 0//", "-//ietf//dtd html level 1//", "-//ietf//dtd html level 2//", "-//ietf//dtd html level 3//", "-//ietf//dtd html strict level 0//", "-//ietf//dtd html strict level 1//", "-//ietf//dtd html strict level 2//", "-//ietf//dtd html strict level 3//", "-//ietf//dtd html strict//", "-//ietf//dtd html//", "-//metrius//dtd metrius presentational//", "-//microsoft//dtd internet explorer 2.0 html strict//", "-//microsoft//dtd internet explorer 2.0 html//", "-//microsoft//dtd internet explorer 2.0 tables//", "-//microsoft//dtd internet explorer 3.0 html strict//", "-//microsoft//dtd internet explorer 3.0 html//", "-//microsoft//dtd internet explorer 3.0 tables//", "-//netscape comm. corp.//dtd html//", "-//netscape comm. corp.//dtd strict html//", "-//o'reilly and associates//dtd html 2.0//", "-//o'reilly and associates//dtd html extended 1.0//", "-//o'reilly and associates//dtd html extended relaxed 1.0//", "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", "-//spyglass//dtd html 2.0 extended//", "-//sq//dtd html 2.0 hotmetal + extensions//", "-//sun microsystems corp.//dtd hotjava html//", "-//sun microsystems corp.//dtd hotjava strict html//", "-//w3c//dtd html 3 1995-03-24//", "-//w3c//dtd html 3.2 draft//", "-//w3c//dtd html 3.2 final//", "-//w3c//dtd html 3.2//", "-//w3c//dtd html 3.2s draft//", "-//w3c//dtd html 4.0 frameset//", "-//w3c//dtd html 4.0 transitional//", "-//w3c//dtd html experimental 19960712//", "-//w3c//dtd html experimental 970421//", "-//w3c//dtd w3 html//", "-//w3o//dtd w3 html 3.0//", "-//webtechs//dtd mozilla html 2.0//", "-//webtechs//dtd mozilla html//")) or publicId in ("-//w3o//dtd w3 html strict 3.0//en//", "-/w3c/dtd html 4.0 transitional/en", "html") or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and systemId is None or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatMode = "quirks" elif (publicId.startswith( ("-//w3c//dtd xhtml 1.0 frameset//", "-//w3c//dtd xhtml 1.0 transitional//")) or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and systemId is not None): self.parser.compatMode = "limited quirks" self.parser.phase = self.parser.phases["beforeHtml"] def anythingElse(self): self.parser.compatMode = "quirks" self.parser.phase = self.parser.phases["beforeHtml"] def processCharacters(self, token): self.parser.parseError("expected-doctype-but-got-chars") self.anythingElse() return token def processStartTag(self, token): self.parser.parseError("expected-doctype-but-got-start-tag", {"name": token["name"]}) self.anythingElse() return token def processEndTag(self, token): self.parser.parseError("expected-doctype-but-got-end-tag", {"name": token["name"]}) self.anythingElse() return token def processEOF(self): self.parser.parseError("expected-doctype-but-got-eof") self.anythingElse() return True class BeforeHtmlPhase(Phase): __slots__ = tuple() # helper methods def insertHtmlElement(self): self.tree.insertRoot(impliedTagToken("html", "StartTag")) self.parser.phase = self.parser.phases["beforeHead"] # other def processEOF(self): self.insertHtmlElement() return True def processComment(self, token): self.tree.insertComment(token, self.tree.document) def processSpaceCharacters(self, token): pass def processCharacters(self, token): self.insertHtmlElement() return token def processStartTag(self, token): if token["name"] == "html": self.parser.firstStartTag = True self.insertHtmlElement() return token def processEndTag(self, token): if token["name"] not in ("head", "body", "html", "br"): self.parser.parseError("unexpected-end-tag-before-html", {"name": token["name"]}) else: self.insertHtmlElement() return token class BeforeHeadPhase(Phase): __slots__ = tuple() def processEOF(self): self.startTagHead(impliedTagToken("head", "StartTag")) return True def processSpaceCharacters(self, token): pass def processCharacters(self, token): self.startTagHead(impliedTagToken("head", "StartTag")) return token def startTagHtml(self, token): return self.parser.phases["inBody"].processStartTag(token) def startTagHead(self, token): self.tree.insertElement(token) self.tree.headPointer = self.tree.openElements[-1] self.parser.phase = self.parser.phases["inHead"] def startTagOther(self, token): self.startTagHead(impliedTagToken("head", "StartTag")) return token def endTagImplyHead(self, token): self.startTagHead(impliedTagToken("head", "StartTag")) return token def endTagOther(self, token): self.parser.parseError("end-tag-after-implied-root", {"name": token["name"]}) startTagHandler = _utils.MethodDispatcher([ ("html", startTagHtml), ("head", startTagHead) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ (("head", "body", "html", "br"), endTagImplyHead) ]) endTagHandler.default = endTagOther class InHeadPhase(Phase): __slots__ = tuple() # the real thing def processEOF(self): self.anythingElse() return True def processCharacters(self, token): self.anythingElse() return token def startTagHtml(self, token): return self.parser.phases["inBody"].processStartTag(token) def startTagHead(self, token): self.parser.parseError("two-heads-are-not-better-than-one") def startTagBaseLinkCommand(self, token): self.tree.insertElement(token) self.tree.openElements.pop() token["selfClosingAcknowledged"] = True def startTagMeta(self, token): self.tree.insertElement(token) self.tree.openElements.pop() token["selfClosingAcknowledged"] = True attributes = token["data"] if self.parser.tokenizer.stream.charEncoding[1] == "tentative": if "charset" in attributes: self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) elif ("content" in attributes and "http-equiv" in attributes and attributes["http-equiv"].lower() == "content-type"): # Encoding it as UTF-8 here is a hack, as really we should pass # the abstract Unicode string, and just use the # ContentAttrParser on that, but using UTF-8 allows all chars # to be encoded and as a ASCII-superset works. data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) parser = _inputstream.ContentAttrParser(data) codec = parser.parse() self.parser.tokenizer.stream.changeEncoding(codec) def startTagTitle(self, token): self.parser.parseRCDataRawtext(token, "RCDATA") def startTagNoFramesStyle(self, token): # Need to decide whether to implement the scripting-disabled case self.parser.parseRCDataRawtext(token, "RAWTEXT") def startTagNoscript(self, token): if self.parser.scripting: self.parser.parseRCDataRawtext(token, "RAWTEXT") else: self.tree.insertElement(token) self.parser.phase = self.parser.phases["inHeadNoscript"] def startTagScript(self, token): self.tree.insertElement(token) self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState self.parser.originalPhase = self.parser.phase self.parser.phase = self.parser.phases["text"] def startTagOther(self, token): self.anythingElse() return token def endTagHead(self, token): node = self.parser.tree.openElements.pop() assert node.name == "head", "Expected head got %s" % node.name self.parser.phase = self.parser.phases["afterHead"] def endTagHtmlBodyBr(self, token): self.anythingElse() return token def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): self.endTagHead(impliedTagToken("head")) startTagHandler = _utils.MethodDispatcher([ ("html", startTagHtml), ("title", startTagTitle), (("noframes", "style"), startTagNoFramesStyle), ("noscript", startTagNoscript), ("script", startTagScript), (("base", "basefont", "bgsound", "command", "link"), startTagBaseLinkCommand), ("meta", startTagMeta), ("head", startTagHead) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("head", endTagHead), (("br", "html", "body"), endTagHtmlBodyBr) ]) endTagHandler.default = endTagOther class InHeadNoscriptPhase(Phase): __slots__ = tuple() def processEOF(self): self.parser.parseError("eof-in-head-noscript") self.anythingElse() return True def processComment(self, token): return self.parser.phases["inHead"].processComment(token) def processCharacters(self, token): self.parser.parseError("char-in-head-noscript") self.anythingElse() return token def processSpaceCharacters(self, token): return self.parser.phases["inHead"].processSpaceCharacters(token) def startTagHtml(self, token): return self.parser.phases["inBody"].processStartTag(token) def startTagBaseLinkCommand(self, token): return self.parser.phases["inHead"].processStartTag(token) def startTagHeadNoscript(self, token): self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) def startTagOther(self, token): self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) self.anythingElse() return token def endTagNoscript(self, token): node = self.parser.tree.openElements.pop() assert node.name == "noscript", "Expected noscript got %s" % node.name self.parser.phase = self.parser.phases["inHead"] def endTagBr(self, token): self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) self.anythingElse() return token def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): # Caller must raise parse error first! self.endTagNoscript(impliedTagToken("noscript")) startTagHandler = _utils.MethodDispatcher([ ("html", startTagHtml), (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand), (("head", "noscript"), startTagHeadNoscript), ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("noscript", endTagNoscript), ("br", endTagBr), ]) endTagHandler.default = endTagOther class AfterHeadPhase(Phase): __slots__ = tuple() def processEOF(self): self.anythingElse() return True def processCharacters(self, token): self.anythingElse() return token def startTagHtml(self, token): return self.parser.phases["inBody"].processStartTag(token) def startTagBody(self, token): self.parser.framesetOK = False self.tree.insertElement(token) self.parser.phase = self.parser.phases["inBody"] def startTagFrameset(self, token): self.tree.insertElement(token) self.parser.phase = self.parser.phases["inFrameset"] def startTagFromHead(self, token): self.parser.parseError("unexpected-start-tag-out-of-my-head", {"name": token["name"]}) self.tree.openElements.append(self.tree.headPointer) self.parser.phases["inHead"].processStartTag(token) for node in self.tree.openElements[::-1]: if node.name == "head": self.tree.openElements.remove(node) break def startTagHead(self, token): self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) def startTagOther(self, token): self.anythingElse() return token def endTagHtmlBodyBr(self, token): self.anythingElse() return token def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): self.tree.insertElement(impliedTagToken("body", "StartTag")) self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True startTagHandler = _utils.MethodDispatcher([ ("html", startTagHtml), ("body", startTagBody), ("frameset", startTagFrameset), (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title"), startTagFromHead), ("head", startTagHead) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), endTagHtmlBodyBr)]) endTagHandler.default = endTagOther class InBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody # the really-really-really-very crazy mode __slots__ = ("processSpaceCharacters",) def __init__(self, *args, **kwargs): super(InBodyPhase, self).__init__(*args, **kwargs) # Set this to the default handler self.processSpaceCharacters = self.processSpaceCharactersNonPre def isMatchingFormattingElement(self, node1, node2): return (node1.name == node2.name and node1.namespace == node2.namespace and node1.attributes == node2.attributes) # helper def addFormattingElement(self, token): self.tree.insertElement(token) element = self.tree.openElements[-1] matchingElements = [] for node in self.tree.activeFormattingElements[::-1]: if node is Marker: break elif self.isMatchingFormattingElement(node, element): matchingElements.append(node) assert len(matchingElements) <= 3 if len(matchingElements) == 3: self.tree.activeFormattingElements.remove(matchingElements[-1]) self.tree.activeFormattingElements.append(element) # the real deal def processEOF(self): allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", "tfoot", "th", "thead", "tr", "body", "html")) for node in self.tree.openElements[::-1]: if node.name not in allowed_elements: self.parser.parseError("expected-closing-tag-but-got-eof") break # Stop parsing def processSpaceCharactersDropNewline(self, token): # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we # want to drop leading newlines data = token["data"] self.processSpaceCharacters = self.processSpaceCharactersNonPre if (data.startswith("\n") and self.tree.openElements[-1].name in ("pre", "listing", "textarea") and not self.tree.openElements[-1].hasContent()): data = data[1:] if data: self.tree.reconstructActiveFormattingElements() self.tree.insertText(data) def processCharacters(self, token): if token["data"] == "\u0000": # The tokenizer should always emit null on its own return self.tree.reconstructActiveFormattingElements() self.tree.insertText(token["data"]) # This must be bad for performance if (self.parser.framesetOK and any([char not in spaceCharacters for char in token["data"]])): self.parser.framesetOK = False def processSpaceCharactersNonPre(self, token): self.tree.reconstructActiveFormattingElements() self.tree.insertText(token["data"]) def startTagProcessInHead(self, token): return self.parser.phases["inHead"].processStartTag(token) def startTagBody(self, token): self.parser.parseError("unexpected-start-tag", {"name": "body"}) if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): assert self.parser.innerHTML else: self.parser.framesetOK = False for attr, value in token["data"].items(): if attr not in self.tree.openElements[1].attributes: self.tree.openElements[1].attributes[attr] = value def startTagFrameset(self, token): self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): assert self.parser.innerHTML elif not self.parser.framesetOK: pass else: if self.tree.openElements[1].parent: self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) while self.tree.openElements[-1].name != "html": self.tree.openElements.pop() self.tree.insertElement(token) self.parser.phase = self.parser.phases["inFrameset"] def startTagCloseP(self, token): if self.tree.elementInScope("p", variant="button"): self.endTagP(impliedTagToken("p")) self.tree.insertElement(token) def startTagPreListing(self, token): if self.tree.elementInScope("p", variant="button"): self.endTagP(impliedTagToken("p")) self.tree.insertElement(token) self.parser.framesetOK = False self.processSpaceCharacters = self.processSpaceCharactersDropNewline def startTagForm(self, token): if self.tree.formPointer: self.parser.parseError("unexpected-start-tag", {"name": "form"}) else: if self.tree.elementInScope("p", variant="button"): self.endTagP(impliedTagToken("p")) self.tree.insertElement(token) self.tree.formPointer = self.tree.openElements[-1] def startTagListItem(self, token): self.parser.framesetOK = False stopNamesMap = {"li": ["li"], "dt": ["dt", "dd"], "dd": ["dt", "dd"]} stopNames = stopNamesMap[token["name"]] for node in reversed(self.tree.openElements): if node.name in stopNames: self.parser.phase.processEndTag( impliedTagToken(node.name, "EndTag")) break if (node.nameTuple in specialElements and node.name not in ("address", "div", "p")): break if self.tree.elementInScope("p", variant="button"): self.parser.phase.processEndTag( impliedTagToken("p", "EndTag")) self.tree.insertElement(token) def startTagPlaintext(self, token): if self.tree.elementInScope("p", variant="button"): self.endTagP(impliedTagToken("p")) self.tree.insertElement(token) self.parser.tokenizer.state = self.parser.tokenizer.plaintextState def startTagHeading(self, token): if self.tree.elementInScope("p", variant="button"): self.endTagP(impliedTagToken("p")) if self.tree.openElements[-1].name in headingElements: self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) self.tree.openElements.pop() self.tree.insertElement(token) def startTagA(self, token): afeAElement = self.tree.elementInActiveFormattingElements("a") if afeAElement: self.parser.parseError("unexpected-start-tag-implies-end-tag", {"startName": "a", "endName": "a"}) self.endTagFormatting(impliedTagToken("a")) if afeAElement in self.tree.openElements: self.tree.openElements.remove(afeAElement) if afeAElement in self.tree.activeFormattingElements: self.tree.activeFormattingElements.remove(afeAElement) self.tree.reconstructActiveFormattingElements() self.addFormattingElement(token) def startTagFormatting(self, token): self.tree.reconstructActiveFormattingElements() self.addFormattingElement(token) def startTagNobr(self, token): self.tree.reconstructActiveFormattingElements() if self.tree.elementInScope("nobr"): self.parser.parseError("unexpected-start-tag-implies-end-tag", {"startName": "nobr", "endName": "nobr"}) self.processEndTag(impliedTagToken("nobr")) # XXX Need tests that trigger the following self.tree.reconstructActiveFormattingElements() self.addFormattingElement(token) def startTagButton(self, token): if self.tree.elementInScope("button"): self.parser.parseError("unexpected-start-tag-implies-end-tag", {"startName": "button", "endName": "button"}) self.processEndTag(impliedTagToken("button")) return token else: self.tree.reconstructActiveFormattingElements() self.tree.insertElement(token) self.parser.framesetOK = False def startTagAppletMarqueeObject(self, token): self.tree.reconstructActiveFormattingElements() self.tree.insertElement(token) self.tree.activeFormattingElements.append(Marker) self.parser.framesetOK = False def startTagXmp(self, token): if self.tree.elementInScope("p", variant="button"): self.endTagP(impliedTagToken("p")) self.tree.reconstructActiveFormattingElements() self.parser.framesetOK = False self.parser.parseRCDataRawtext(token, "RAWTEXT") def startTagTable(self, token): if self.parser.compatMode != "quirks": if self.tree.elementInScope("p", variant="button"): self.processEndTag(impliedTagToken("p")) self.tree.insertElement(token) self.parser.framesetOK = False self.parser.phase = self.parser.phases["inTable"] def startTagVoidFormatting(self, token): self.tree.reconstructActiveFormattingElements() self.tree.insertElement(token) self.tree.openElements.pop() token["selfClosingAcknowledged"] = True self.parser.framesetOK = False def startTagInput(self, token): framesetOK = self.parser.framesetOK self.startTagVoidFormatting(token) if ("type" in token["data"] and token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): # input type=hidden doesn't change framesetOK self.parser.framesetOK = framesetOK def startTagParamSource(self, token): self.tree.insertElement(token) self.tree.openElements.pop() token["selfClosingAcknowledged"] = True def startTagHr(self, token): if self.tree.elementInScope("p", variant="button"): self.endTagP(impliedTagToken("p")) self.tree.insertElement(token) self.tree.openElements.pop() token["selfClosingAcknowledged"] = True self.parser.framesetOK = False def startTagImage(self, token): # No really... self.parser.parseError("unexpected-start-tag-treated-as", {"originalName": "image", "newName": "img"}) self.processStartTag(impliedTagToken("img", "StartTag", attributes=token["data"], selfClosing=token["selfClosing"])) def startTagIsIndex(self, token): self.parser.parseError("deprecated-tag", {"name": "isindex"}) if self.tree.formPointer: return form_attrs = {} if "action" in token["data"]: form_attrs["action"] = token["data"]["action"] self.processStartTag(impliedTagToken("form", "StartTag", attributes=form_attrs)) self.processStartTag(impliedTagToken("hr", "StartTag")) self.processStartTag(impliedTagToken("label", "StartTag")) # XXX Localization ... if "prompt" in token["data"]: prompt = token["data"]["prompt"] else: prompt = "This is a searchable index. Enter search keywords: " self.processCharacters( {"type": tokenTypes["Characters"], "data": prompt}) attributes = token["data"].copy() if "action" in attributes: del attributes["action"] if "prompt" in attributes: del attributes["prompt"] attributes["name"] = "isindex" self.processStartTag(impliedTagToken("input", "StartTag", attributes=attributes, selfClosing=token["selfClosing"])) self.processEndTag(impliedTagToken("label")) self.processStartTag(impliedTagToken("hr", "StartTag")) self.processEndTag(impliedTagToken("form")) def startTagTextarea(self, token): self.tree.insertElement(token) self.parser.tokenizer.state = self.parser.tokenizer.rcdataState self.processSpaceCharacters = self.processSpaceCharactersDropNewline self.parser.framesetOK = False def startTagIFrame(self, token): self.parser.framesetOK = False self.startTagRawtext(token) def startTagNoscript(self, token): if self.parser.scripting: self.startTagRawtext(token) else: self.startTagOther(token) def startTagRawtext(self, token): """iframe, noembed noframes, noscript(if scripting enabled)""" self.parser.parseRCDataRawtext(token, "RAWTEXT") def startTagOpt(self, token): if self.tree.openElements[-1].name == "option": self.parser.phase.processEndTag(impliedTagToken("option")) self.tree.reconstructActiveFormattingElements() self.parser.tree.insertElement(token) def startTagSelect(self, token): self.tree.reconstructActiveFormattingElements() self.tree.insertElement(token) self.parser.framesetOK = False if self.parser.phase in (self.parser.phases["inTable"], self.parser.phases["inCaption"], self.parser.phases["inColumnGroup"], self.parser.phases["inTableBody"], self.parser.phases["inRow"], self.parser.phases["inCell"]): self.parser.phase = self.parser.phases["inSelectInTable"] else: self.parser.phase = self.parser.phases["inSelect"] def startTagRpRt(self, token): if self.tree.elementInScope("ruby"): self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "ruby": self.parser.parseError() self.tree.insertElement(token) def startTagMath(self, token): self.tree.reconstructActiveFormattingElements() self.parser.adjustMathMLAttributes(token) self.parser.adjustForeignAttributes(token) token["namespace"] = namespaces["mathml"] self.tree.insertElement(token) # Need to get the parse error right for the case where the token # has a namespace not equal to the xmlns attribute if token["selfClosing"]: self.tree.openElements.pop() token["selfClosingAcknowledged"] = True def startTagSvg(self, token): self.tree.reconstructActiveFormattingElements() self.parser.adjustSVGAttributes(token) self.parser.adjustForeignAttributes(token) token["namespace"] = namespaces["svg"] self.tree.insertElement(token) # Need to get the parse error right for the case where the token # has a namespace not equal to the xmlns attribute if token["selfClosing"]: self.tree.openElements.pop() token["selfClosingAcknowledged"] = True def startTagMisplaced(self, token): """ Elements that should be children of other elements that have a different insertion mode; here they are ignored "caption", "col", "colgroup", "frame", "frameset", "head", "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "noscript" """ self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) def startTagOther(self, token): self.tree.reconstructActiveFormattingElements() self.tree.insertElement(token) def endTagP(self, token): if not self.tree.elementInScope("p", variant="button"): self.startTagCloseP(impliedTagToken("p", "StartTag")) self.parser.parseError("unexpected-end-tag", {"name": "p"}) self.endTagP(impliedTagToken("p", "EndTag")) else: self.tree.generateImpliedEndTags("p") if self.tree.openElements[-1].name != "p": self.parser.parseError("unexpected-end-tag", {"name": "p"}) node = self.tree.openElements.pop() while node.name != "p": node = self.tree.openElements.pop() def endTagBody(self, token): if not self.tree.elementInScope("body"): self.parser.parseError() return elif self.tree.openElements[-1].name != "body": for node in self.tree.openElements[2:]: if node.name not in frozenset(("dd", "dt", "li", "optgroup", "option", "p", "rp", "rt", "tbody", "td", "tfoot", "th", "thead", "tr", "body", "html")): # Not sure this is the correct name for the parse error self.parser.parseError( "expected-one-end-tag-but-got-another", {"gotName": "body", "expectedName": node.name}) break self.parser.phase = self.parser.phases["afterBody"] def endTagHtml(self, token): # We repeat the test for the body end tag token being ignored here if self.tree.elementInScope("body"): self.endTagBody(impliedTagToken("body")) return token def endTagBlock(self, token): # Put us back in the right whitespace handling mode if token["name"] == "pre": self.processSpaceCharacters = self.processSpaceCharactersNonPre inScope = self.tree.elementInScope(token["name"]) if inScope: self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != token["name"]: self.parser.parseError("end-tag-too-early", {"name": token["name"]}) if inScope: node = self.tree.openElements.pop() while node.name != token["name"]: node = self.tree.openElements.pop() def endTagForm(self, token): node = self.tree.formPointer self.tree.formPointer = None if node is None or not self.tree.elementInScope(node): self.parser.parseError("unexpected-end-tag", {"name": "form"}) else: self.tree.generateImpliedEndTags() if self.tree.openElements[-1] != node: self.parser.parseError("end-tag-too-early-ignored", {"name": "form"}) self.tree.openElements.remove(node) def endTagListItem(self, token): if token["name"] == "li": variant = "list" else: variant = None if not self.tree.elementInScope(token["name"], variant=variant): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) else: self.tree.generateImpliedEndTags(exclude=token["name"]) if self.tree.openElements[-1].name != token["name"]: self.parser.parseError( "end-tag-too-early", {"name": token["name"]}) node = self.tree.openElements.pop() while node.name != token["name"]: node = self.tree.openElements.pop() def endTagHeading(self, token): for item in headingElements: if self.tree.elementInScope(item): self.tree.generateImpliedEndTags() break if self.tree.openElements[-1].name != token["name"]: self.parser.parseError("end-tag-too-early", {"name": token["name"]}) for item in headingElements: if self.tree.elementInScope(item): item = self.tree.openElements.pop() while item.name not in headingElements: item = self.tree.openElements.pop() break def endTagFormatting(self, token): """The much-feared adoption agency algorithm""" # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867 # XXX Better parseError messages appreciated. # Step 1 outerLoopCounter = 0 # Step 2 while outerLoopCounter < 8: # Step 3 outerLoopCounter += 1 # Step 4: # Let the formatting element be the last element in # the list of active formatting elements that: # - is between the end of the list and the last scope # marker in the list, if any, or the start of the list # otherwise, and # - has the same tag name as the token. formattingElement = self.tree.elementInActiveFormattingElements( token["name"]) if (not formattingElement or (formattingElement in self.tree.openElements and not self.tree.elementInScope(formattingElement.name))): # If there is no such node, then abort these steps # and instead act as described in the "any other # end tag" entry below. self.endTagOther(token) return # Otherwise, if there is such a node, but that node is # not in the stack of open elements, then this is a # parse error; remove the element from the list, and # abort these steps. elif formattingElement not in self.tree.openElements: self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) self.tree.activeFormattingElements.remove(formattingElement) return # Otherwise, if there is such a node, and that node is # also in the stack of open elements, but the element # is not in scope, then this is a parse error; ignore # the token, and abort these steps. elif not self.tree.elementInScope(formattingElement.name): self.parser.parseError("adoption-agency-4.4", {"name": token["name"]}) return # Otherwise, there is a formatting element and that # element is in the stack and is in scope. If the # element is not the current node, this is a parse # error. In any case, proceed with the algorithm as # written in the following steps. else: if formattingElement != self.tree.openElements[-1]: self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) # Step 5: # Let the furthest block be the topmost node in the # stack of open elements that is lower in the stack # than the formatting element, and is an element in # the special category. There might not be one. afeIndex = self.tree.openElements.index(formattingElement) furthestBlock = None for element in self.tree.openElements[afeIndex:]: if element.nameTuple in specialElements: furthestBlock = element break # Step 6: # If there is no furthest block, then the UA must # first pop all the nodes from the bottom of the stack # of open elements, from the current node up to and # including the formatting element, then remove the # formatting element from the list of active # formatting elements, and finally abort these steps. if furthestBlock is None: element = self.tree.openElements.pop() while element != formattingElement: element = self.tree.openElements.pop() self.tree.activeFormattingElements.remove(element) return # Step 7 commonAncestor = self.tree.openElements[afeIndex - 1] # Step 8: # The bookmark is supposed to help us identify where to reinsert # nodes in step 15. We have to ensure that we reinsert nodes after # the node before the active formatting element. Note the bookmark # can move in step 9.7 bookmark = self.tree.activeFormattingElements.index(formattingElement) # Step 9 lastNode = node = furthestBlock innerLoopCounter = 0 index = self.tree.openElements.index(node) while innerLoopCounter < 3: innerLoopCounter += 1 # Node is element before node in open elements index -= 1 node = self.tree.openElements[index] if node not in self.tree.activeFormattingElements: self.tree.openElements.remove(node) continue # Step 9.6 if node == formattingElement: break # Step 9.7 if lastNode == furthestBlock: bookmark = self.tree.activeFormattingElements.index(node) + 1 # Step 9.8 clone = node.cloneNode() # Replace node with clone self.tree.activeFormattingElements[ self.tree.activeFormattingElements.index(node)] = clone self.tree.openElements[ self.tree.openElements.index(node)] = clone node = clone # Step 9.9 # Remove lastNode from its parents, if any if lastNode.parent: lastNode.parent.removeChild(lastNode) node.appendChild(lastNode) # Step 9.10 lastNode = node # Step 10 # Foster parent lastNode if commonAncestor is a # table, tbody, tfoot, thead, or tr we need to foster # parent the lastNode if lastNode.parent: lastNode.parent.removeChild(lastNode) if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")): parent, insertBefore = self.tree.getTableMisnestedNodePosition() parent.insertBefore(lastNode, insertBefore) else: commonAncestor.appendChild(lastNode) # Step 11 clone = formattingElement.cloneNode() # Step 12 furthestBlock.reparentChildren(clone) # Step 13 furthestBlock.appendChild(clone) # Step 14 self.tree.activeFormattingElements.remove(formattingElement) self.tree.activeFormattingElements.insert(bookmark, clone) # Step 15 self.tree.openElements.remove(formattingElement) self.tree.openElements.insert( self.tree.openElements.index(furthestBlock) + 1, clone) def endTagAppletMarqueeObject(self, token): if self.tree.elementInScope(token["name"]): self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != token["name"]: self.parser.parseError("end-tag-too-early", {"name": token["name"]}) if self.tree.elementInScope(token["name"]): element = self.tree.openElements.pop() while element.name != token["name"]: element = self.tree.openElements.pop() self.tree.clearActiveFormattingElements() def endTagBr(self, token): self.parser.parseError("unexpected-end-tag-treated-as", {"originalName": "br", "newName": "br element"}) self.tree.reconstructActiveFormattingElements() self.tree.insertElement(impliedTagToken("br", "StartTag")) self.tree.openElements.pop() def endTagOther(self, token): for node in self.tree.openElements[::-1]: if node.name == token["name"]: self.tree.generateImpliedEndTags(exclude=token["name"]) if self.tree.openElements[-1].name != token["name"]: self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) while self.tree.openElements.pop() != node: pass break else: if node.nameTuple in specialElements: self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) break startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), (("base", "basefont", "bgsound", "command", "link", "meta", "script", "style", "title"), startTagProcessInHead), ("body", startTagBody), ("frameset", startTagFrameset), (("address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "section", "summary", "ul"), startTagCloseP), (headingElements, startTagHeading), (("pre", "listing"), startTagPreListing), ("form", startTagForm), (("li", "dd", "dt"), startTagListItem), ("plaintext", startTagPlaintext), ("a", startTagA), (("b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u"), startTagFormatting), ("nobr", startTagNobr), ("button", startTagButton), (("applet", "marquee", "object"), startTagAppletMarqueeObject), ("xmp", startTagXmp), ("table", startTagTable), (("area", "br", "embed", "img", "keygen", "wbr"), startTagVoidFormatting), (("param", "source", "track"), startTagParamSource), ("input", startTagInput), ("hr", startTagHr), ("image", startTagImage), ("isindex", startTagIsIndex), ("textarea", startTagTextarea), ("iframe", startTagIFrame), ("noscript", startTagNoscript), (("noembed", "noframes"), startTagRawtext), ("select", startTagSelect), (("rp", "rt"), startTagRpRt), (("option", "optgroup"), startTagOpt), (("math"), startTagMath), (("svg"), startTagSvg), (("caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr"), startTagMisplaced) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("body", endTagBody), ("html", endTagHtml), (("address", "article", "aside", "blockquote", "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", "section", "summary", "ul"), endTagBlock), ("form", endTagForm), ("p", endTagP), (("dd", "dt", "li"), endTagListItem), (headingElements, endTagHeading), (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"), endTagFormatting), (("applet", "marquee", "object"), endTagAppletMarqueeObject), ("br", endTagBr), ]) endTagHandler.default = endTagOther class TextPhase(Phase): __slots__ = tuple() def processCharacters(self, token): self.tree.insertText(token["data"]) def processEOF(self): self.parser.parseError("expected-named-closing-tag-but-got-eof", {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() self.parser.phase = self.parser.originalPhase return True def startTagOther(self, token): assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name'] def endTagScript(self, token): node = self.tree.openElements.pop() assert node.name == "script" self.parser.phase = self.parser.originalPhase # The rest of this method is all stuff that only happens if # document.write works def endTagOther(self, token): self.tree.openElements.pop() self.parser.phase = self.parser.originalPhase startTagHandler = _utils.MethodDispatcher([]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("script", endTagScript)]) endTagHandler.default = endTagOther class InTablePhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table __slots__ = tuple() # helper methods def clearStackToTableContext(self): # "clear the stack back to a table context" while self.tree.openElements[-1].name not in ("table", "html"): # self.parser.parseError("unexpected-implied-end-tag-in-table", # {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() # When the current node is <html> it's an innerHTML case # processing methods def processEOF(self): if self.tree.openElements[-1].name != "html": self.parser.parseError("eof-in-table") else: assert self.parser.innerHTML # Stop parsing def processSpaceCharacters(self, token): originalPhase = self.parser.phase self.parser.phase = self.parser.phases["inTableText"] self.parser.phase.originalPhase = originalPhase self.parser.phase.processSpaceCharacters(token) def processCharacters(self, token): originalPhase = self.parser.phase self.parser.phase = self.parser.phases["inTableText"] self.parser.phase.originalPhase = originalPhase self.parser.phase.processCharacters(token) def insertText(self, token): # If we get here there must be at least one non-whitespace character # Do the table magic! self.tree.insertFromTable = True self.parser.phases["inBody"].processCharacters(token) self.tree.insertFromTable = False def startTagCaption(self, token): self.clearStackToTableContext() self.tree.activeFormattingElements.append(Marker) self.tree.insertElement(token) self.parser.phase = self.parser.phases["inCaption"] def startTagColgroup(self, token): self.clearStackToTableContext() self.tree.insertElement(token) self.parser.phase = self.parser.phases["inColumnGroup"] def startTagCol(self, token): self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) return token def startTagRowGroup(self, token): self.clearStackToTableContext() self.tree.insertElement(token) self.parser.phase = self.parser.phases["inTableBody"] def startTagImplyTbody(self, token): self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) return token def startTagTable(self, token): self.parser.parseError("unexpected-start-tag-implies-end-tag", {"startName": "table", "endName": "table"}) self.parser.phase.processEndTag(impliedTagToken("table")) if not self.parser.innerHTML: return token def startTagStyleScript(self, token): return self.parser.phases["inHead"].processStartTag(token) def startTagInput(self, token): if ("type" in token["data"] and token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): self.parser.parseError("unexpected-hidden-input-in-table") self.tree.insertElement(token) # XXX associate with form self.tree.openElements.pop() else: self.startTagOther(token) def startTagForm(self, token): self.parser.parseError("unexpected-form-in-table") if self.tree.formPointer is None: self.tree.insertElement(token) self.tree.formPointer = self.tree.openElements[-1] self.tree.openElements.pop() def startTagOther(self, token): self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) # Do the table magic! self.tree.insertFromTable = True self.parser.phases["inBody"].processStartTag(token) self.tree.insertFromTable = False def endTagTable(self, token): if self.tree.elementInScope("table", variant="table"): self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "table": self.parser.parseError("end-tag-too-early-named", {"gotName": "table", "expectedName": self.tree.openElements[-1].name}) while self.tree.openElements[-1].name != "table": self.tree.openElements.pop() self.tree.openElements.pop() self.parser.resetInsertionMode() else: # innerHTML case assert self.parser.innerHTML self.parser.parseError() def endTagIgnore(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def endTagOther(self, token): self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) # Do the table magic! self.tree.insertFromTable = True self.parser.phases["inBody"].processEndTag(token) self.tree.insertFromTable = False startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), ("caption", startTagCaption), ("colgroup", startTagColgroup), ("col", startTagCol), (("tbody", "tfoot", "thead"), startTagRowGroup), (("td", "th", "tr"), startTagImplyTbody), ("table", startTagTable), (("style", "script"), startTagStyleScript), ("input", startTagInput), ("form", startTagForm) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("table", endTagTable), (("body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"), endTagIgnore) ]) endTagHandler.default = endTagOther class InTableTextPhase(Phase): __slots__ = ("originalPhase", "characterTokens") def __init__(self, *args, **kwargs): super(InTableTextPhase, self).__init__(*args, **kwargs) self.originalPhase = None self.characterTokens = [] def flushCharacters(self): data = "".join([item["data"] for item in self.characterTokens]) if any([item not in spaceCharacters for item in data]): token = {"type": tokenTypes["Characters"], "data": data} self.parser.phases["inTable"].insertText(token) elif data: self.tree.insertText(data) self.characterTokens = [] def processComment(self, token): self.flushCharacters() self.parser.phase = self.originalPhase return token def processEOF(self): self.flushCharacters() self.parser.phase = self.originalPhase return True def processCharacters(self, token): if token["data"] == "\u0000": return self.characterTokens.append(token) def processSpaceCharacters(self, token): # pretty sure we should never reach here self.characterTokens.append(token) # assert False def processStartTag(self, token): self.flushCharacters() self.parser.phase = self.originalPhase return token def processEndTag(self, token): self.flushCharacters() self.parser.phase = self.originalPhase return token class InCaptionPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-caption __slots__ = tuple() def ignoreEndTagCaption(self): return not self.tree.elementInScope("caption", variant="table") def processEOF(self): self.parser.phases["inBody"].processEOF() def processCharacters(self, token): return self.parser.phases["inBody"].processCharacters(token) def startTagTableElement(self, token): self.parser.parseError() # XXX Have to duplicate logic here to find out if the tag is ignored ignoreEndTag = self.ignoreEndTagCaption() self.parser.phase.processEndTag(impliedTagToken("caption")) if not ignoreEndTag: return token def startTagOther(self, token): return self.parser.phases["inBody"].processStartTag(token) def endTagCaption(self, token): if not self.ignoreEndTagCaption(): # AT this code is quite similar to endTagTable in "InTable" self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "caption": self.parser.parseError("expected-one-end-tag-but-got-another", {"gotName": "caption", "expectedName": self.tree.openElements[-1].name}) while self.tree.openElements[-1].name != "caption": self.tree.openElements.pop() self.tree.openElements.pop() self.tree.clearActiveFormattingElements() self.parser.phase = self.parser.phases["inTable"] else: # innerHTML case assert self.parser.innerHTML self.parser.parseError() def endTagTable(self, token): self.parser.parseError() ignoreEndTag = self.ignoreEndTagCaption() self.parser.phase.processEndTag(impliedTagToken("caption")) if not ignoreEndTag: return token def endTagIgnore(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def endTagOther(self, token): return self.parser.phases["inBody"].processEndTag(token) startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"), startTagTableElement) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("caption", endTagCaption), ("table", endTagTable), (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"), endTagIgnore) ]) endTagHandler.default = endTagOther class InColumnGroupPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-column __slots__ = tuple() def ignoreEndTagColgroup(self): return self.tree.openElements[-1].name == "html" def processEOF(self): if self.tree.openElements[-1].name == "html": assert self.parser.innerHTML return else: ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup(impliedTagToken("colgroup")) if not ignoreEndTag: return True def processCharacters(self, token): ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup(impliedTagToken("colgroup")) if not ignoreEndTag: return token def startTagCol(self, token): self.tree.insertElement(token) self.tree.openElements.pop() token["selfClosingAcknowledged"] = True def startTagOther(self, token): ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup(impliedTagToken("colgroup")) if not ignoreEndTag: return token def endTagColgroup(self, token): if self.ignoreEndTagColgroup(): # innerHTML case assert self.parser.innerHTML self.parser.parseError() else: self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] def endTagCol(self, token): self.parser.parseError("no-end-tag", {"name": "col"}) def endTagOther(self, token): ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup(impliedTagToken("colgroup")) if not ignoreEndTag: return token startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), ("col", startTagCol) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("colgroup", endTagColgroup), ("col", endTagCol) ]) endTagHandler.default = endTagOther class InTableBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 __slots__ = tuple() # helper methods def clearStackToTableBodyContext(self): while self.tree.openElements[-1].name not in ("tbody", "tfoot", "thead", "html"): # self.parser.parseError("unexpected-implied-end-tag-in-table", # {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() if self.tree.openElements[-1].name == "html": assert self.parser.innerHTML # the rest def processEOF(self): self.parser.phases["inTable"].processEOF() def processSpaceCharacters(self, token): return self.parser.phases["inTable"].processSpaceCharacters(token) def processCharacters(self, token): return self.parser.phases["inTable"].processCharacters(token) def startTagTr(self, token): self.clearStackToTableBodyContext() self.tree.insertElement(token) self.parser.phase = self.parser.phases["inRow"] def startTagTableCell(self, token): self.parser.parseError("unexpected-cell-in-table-body", {"name": token["name"]}) self.startTagTr(impliedTagToken("tr", "StartTag")) return token def startTagTableOther(self, token): # XXX AT Any ideas on how to share this with endTagTable? if (self.tree.elementInScope("tbody", variant="table") or self.tree.elementInScope("thead", variant="table") or self.tree.elementInScope("tfoot", variant="table")): self.clearStackToTableBodyContext() self.endTagTableRowGroup( impliedTagToken(self.tree.openElements[-1].name)) return token else: # innerHTML case assert self.parser.innerHTML self.parser.parseError() def startTagOther(self, token): return self.parser.phases["inTable"].processStartTag(token) def endTagTableRowGroup(self, token): if self.tree.elementInScope(token["name"], variant="table"): self.clearStackToTableBodyContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] else: self.parser.parseError("unexpected-end-tag-in-table-body", {"name": token["name"]}) def endTagTable(self, token): if (self.tree.elementInScope("tbody", variant="table") or self.tree.elementInScope("thead", variant="table") or self.tree.elementInScope("tfoot", variant="table")): self.clearStackToTableBodyContext() self.endTagTableRowGroup( impliedTagToken(self.tree.openElements[-1].name)) return token else: # innerHTML case assert self.parser.innerHTML self.parser.parseError() def endTagIgnore(self, token): self.parser.parseError("unexpected-end-tag-in-table-body", {"name": token["name"]}) def endTagOther(self, token): return self.parser.phases["inTable"].processEndTag(token) startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), ("tr", startTagTr), (("td", "th"), startTagTableCell), (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), startTagTableOther) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ (("tbody", "tfoot", "thead"), endTagTableRowGroup), ("table", endTagTable), (("body", "caption", "col", "colgroup", "html", "td", "th", "tr"), endTagIgnore) ]) endTagHandler.default = endTagOther class InRowPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-row __slots__ = tuple() # helper methods (XXX unify this with other table helper methods) def clearStackToTableRowContext(self): while self.tree.openElements[-1].name not in ("tr", "html"): self.parser.parseError("unexpected-implied-end-tag-in-table-row", {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() def ignoreEndTagTr(self): return not self.tree.elementInScope("tr", variant="table") # the rest def processEOF(self): self.parser.phases["inTable"].processEOF() def processSpaceCharacters(self, token): return self.parser.phases["inTable"].processSpaceCharacters(token) def processCharacters(self, token): return self.parser.phases["inTable"].processCharacters(token) def startTagTableCell(self, token): self.clearStackToTableRowContext() self.tree.insertElement(token) self.parser.phase = self.parser.phases["inCell"] self.tree.activeFormattingElements.append(Marker) def startTagTableOther(self, token): ignoreEndTag = self.ignoreEndTagTr() self.endTagTr(impliedTagToken("tr")) # XXX how are we sure it's always ignored in the innerHTML case? if not ignoreEndTag: return token def startTagOther(self, token): return self.parser.phases["inTable"].processStartTag(token) def endTagTr(self, token): if not self.ignoreEndTagTr(): self.clearStackToTableRowContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTableBody"] else: # innerHTML case assert self.parser.innerHTML self.parser.parseError() def endTagTable(self, token): ignoreEndTag = self.ignoreEndTagTr() self.endTagTr(impliedTagToken("tr")) # Reprocess the current tag if the tr end tag was not ignored # XXX how are we sure it's always ignored in the innerHTML case? if not ignoreEndTag: return token def endTagTableRowGroup(self, token): if self.tree.elementInScope(token["name"], variant="table"): self.endTagTr(impliedTagToken("tr")) return token else: self.parser.parseError() def endTagIgnore(self, token): self.parser.parseError("unexpected-end-tag-in-table-row", {"name": token["name"]}) def endTagOther(self, token): return self.parser.phases["inTable"].processEndTag(token) startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), (("td", "th"), startTagTableCell), (("caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr"), startTagTableOther) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("tr", endTagTr), ("table", endTagTable), (("tbody", "tfoot", "thead"), endTagTableRowGroup), (("body", "caption", "col", "colgroup", "html", "td", "th"), endTagIgnore) ]) endTagHandler.default = endTagOther class InCellPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-cell __slots__ = tuple() # helper def closeCell(self): if self.tree.elementInScope("td", variant="table"): self.endTagTableCell(impliedTagToken("td")) elif self.tree.elementInScope("th", variant="table"): self.endTagTableCell(impliedTagToken("th")) # the rest def processEOF(self): self.parser.phases["inBody"].processEOF() def processCharacters(self, token): return self.parser.phases["inBody"].processCharacters(token) def startTagTableOther(self, token): if (self.tree.elementInScope("td", variant="table") or self.tree.elementInScope("th", variant="table")): self.closeCell() return token else: # innerHTML case assert self.parser.innerHTML self.parser.parseError() def startTagOther(self, token): return self.parser.phases["inBody"].processStartTag(token) def endTagTableCell(self, token): if self.tree.elementInScope(token["name"], variant="table"): self.tree.generateImpliedEndTags(token["name"]) if self.tree.openElements[-1].name != token["name"]: self.parser.parseError("unexpected-cell-end-tag", {"name": token["name"]}) while True: node = self.tree.openElements.pop() if node.name == token["name"]: break else: self.tree.openElements.pop() self.tree.clearActiveFormattingElements() self.parser.phase = self.parser.phases["inRow"] else: self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def endTagIgnore(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def endTagImply(self, token): if self.tree.elementInScope(token["name"], variant="table"): self.closeCell() return token else: # sometimes innerHTML case self.parser.parseError() def endTagOther(self, token): return self.parser.phases["inBody"].processEndTag(token) startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"), startTagTableOther) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ (("td", "th"), endTagTableCell), (("body", "caption", "col", "colgroup", "html"), endTagIgnore), (("table", "tbody", "tfoot", "thead", "tr"), endTagImply) ]) endTagHandler.default = endTagOther class InSelectPhase(Phase): __slots__ = tuple() # http://www.whatwg.org/specs/web-apps/current-work/#in-select def processEOF(self): if self.tree.openElements[-1].name != "html": self.parser.parseError("eof-in-select") else: assert self.parser.innerHTML def processCharacters(self, token): if token["data"] == "\u0000": return self.tree.insertText(token["data"]) def startTagOption(self, token): # We need to imply </option> if <option> is the current node. if self.tree.openElements[-1].name == "option": self.tree.openElements.pop() self.tree.insertElement(token) def startTagOptgroup(self, token): if self.tree.openElements[-1].name == "option": self.tree.openElements.pop() if self.tree.openElements[-1].name == "optgroup": self.tree.openElements.pop() self.tree.insertElement(token) def startTagSelect(self, token): self.parser.parseError("unexpected-select-in-select") self.endTagSelect(impliedTagToken("select")) def startTagInput(self, token): self.parser.parseError("unexpected-input-in-select") if self.tree.elementInScope("select", variant="select"): self.endTagSelect(impliedTagToken("select")) return token else: assert self.parser.innerHTML def startTagScript(self, token): return self.parser.phases["inHead"].processStartTag(token) def startTagOther(self, token): self.parser.parseError("unexpected-start-tag-in-select", {"name": token["name"]}) def endTagOption(self, token): if self.tree.openElements[-1].name == "option": self.tree.openElements.pop() else: self.parser.parseError("unexpected-end-tag-in-select", {"name": "option"}) def endTagOptgroup(self, token): # </optgroup> implicitly closes <option> if (self.tree.openElements[-1].name == "option" and self.tree.openElements[-2].name == "optgroup"): self.tree.openElements.pop() # It also closes </optgroup> if self.tree.openElements[-1].name == "optgroup": self.tree.openElements.pop() # But nothing else else: self.parser.parseError("unexpected-end-tag-in-select", {"name": "optgroup"}) def endTagSelect(self, token): if self.tree.elementInScope("select", variant="select"): node = self.tree.openElements.pop() while node.name != "select": node = self.tree.openElements.pop() self.parser.resetInsertionMode() else: # innerHTML case assert self.parser.innerHTML self.parser.parseError() def endTagOther(self, token): self.parser.parseError("unexpected-end-tag-in-select", {"name": token["name"]}) startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), ("option", startTagOption), ("optgroup", startTagOptgroup), ("select", startTagSelect), (("input", "keygen", "textarea"), startTagInput), ("script", startTagScript) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("option", endTagOption), ("optgroup", endTagOptgroup), ("select", endTagSelect) ]) endTagHandler.default = endTagOther class InSelectInTablePhase(Phase): __slots__ = tuple() def processEOF(self): self.parser.phases["inSelect"].processEOF() def processCharacters(self, token): return self.parser.phases["inSelect"].processCharacters(token) def startTagTable(self, token): self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]}) self.endTagOther(impliedTagToken("select")) return token def startTagOther(self, token): return self.parser.phases["inSelect"].processStartTag(token) def endTagTable(self, token): self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]}) if self.tree.elementInScope(token["name"], variant="table"): self.endTagOther(impliedTagToken("select")) return token def endTagOther(self, token): return self.parser.phases["inSelect"].processEndTag(token) startTagHandler = _utils.MethodDispatcher([ (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), startTagTable) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), endTagTable) ]) endTagHandler.default = endTagOther class InForeignContentPhase(Phase): __slots__ = tuple() breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", "center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", "p", "pre", "ruby", "s", "small", "span", "strong", "strike", "sub", "sup", "table", "tt", "u", "ul", "var"]) def adjustSVGTagNames(self, token): replacements = {"altglyph": "altGlyph", "altglyphdef": "altGlyphDef", "altglyphitem": "altGlyphItem", "animatecolor": "animateColor", "animatemotion": "animateMotion", "animatetransform": "animateTransform", "clippath": "clipPath", "feblend": "feBlend", "fecolormatrix": "feColorMatrix", "fecomponenttransfer": "feComponentTransfer", "fecomposite": "feComposite", "feconvolvematrix": "feConvolveMatrix", "fediffuselighting": "feDiffuseLighting", "fedisplacementmap": "feDisplacementMap", "fedistantlight": "feDistantLight", "feflood": "feFlood", "fefunca": "feFuncA", "fefuncb": "feFuncB", "fefuncg": "feFuncG", "fefuncr": "feFuncR", "fegaussianblur": "feGaussianBlur", "feimage": "feImage", "femerge": "feMerge", "femergenode": "feMergeNode", "femorphology": "feMorphology", "feoffset": "feOffset", "fepointlight": "fePointLight", "fespecularlighting": "feSpecularLighting", "fespotlight": "feSpotLight", "fetile": "feTile", "feturbulence": "feTurbulence", "foreignobject": "foreignObject", "glyphref": "glyphRef", "lineargradient": "linearGradient", "radialgradient": "radialGradient", "textpath": "textPath"} if token["name"] in replacements: token["name"] = replacements[token["name"]] def processCharacters(self, token): if token["data"] == "\u0000": token["data"] = "\uFFFD" elif (self.parser.framesetOK and any(char not in spaceCharacters for char in token["data"])): self.parser.framesetOK = False Phase.processCharacters(self, token) def processStartTag(self, token): currentNode = self.tree.openElements[-1] if (token["name"] in self.breakoutElements or (token["name"] == "font" and set(token["data"].keys()) & {"color", "face", "size"})): self.parser.parseError("unexpected-html-element-in-foreign-content", {"name": token["name"]}) while (self.tree.openElements[-1].namespace != self.tree.defaultNamespace and not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])): self.tree.openElements.pop() return token else: if currentNode.namespace == namespaces["mathml"]: self.parser.adjustMathMLAttributes(token) elif currentNode.namespace == namespaces["svg"]: self.adjustSVGTagNames(token) self.parser.adjustSVGAttributes(token) self.parser.adjustForeignAttributes(token) token["namespace"] = currentNode.namespace self.tree.insertElement(token) if token["selfClosing"]: self.tree.openElements.pop() token["selfClosingAcknowledged"] = True def processEndTag(self, token): nodeIndex = len(self.tree.openElements) - 1 node = self.tree.openElements[-1] if node.name.translate(asciiUpper2Lower) != token["name"]: self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) while True: if node.name.translate(asciiUpper2Lower) == token["name"]: # XXX this isn't in the spec but it seems necessary if self.parser.phase == self.parser.phases["inTableText"]: self.parser.phase.flushCharacters() self.parser.phase = self.parser.phase.originalPhase while self.tree.openElements.pop() != node: assert self.tree.openElements new_token = None break nodeIndex -= 1 node = self.tree.openElements[nodeIndex] if node.namespace != self.tree.defaultNamespace: continue else: new_token = self.parser.phase.processEndTag(token) break return new_token class AfterBodyPhase(Phase): __slots__ = tuple() def processEOF(self): # Stop parsing pass def processComment(self, token): # This is needed because data is to be appended to the <html> element # here and not to whatever is currently open. self.tree.insertComment(token, self.tree.openElements[0]) def processCharacters(self, token): self.parser.parseError("unexpected-char-after-body") self.parser.phase = self.parser.phases["inBody"] return token def startTagHtml(self, token): return self.parser.phases["inBody"].processStartTag(token) def startTagOther(self, token): self.parser.parseError("unexpected-start-tag-after-body", {"name": token["name"]}) self.parser.phase = self.parser.phases["inBody"] return token def endTagHtml(self, name): if self.parser.innerHTML: self.parser.parseError("unexpected-end-tag-after-body-innerhtml") else: self.parser.phase = self.parser.phases["afterAfterBody"] def endTagOther(self, token): self.parser.parseError("unexpected-end-tag-after-body", {"name": token["name"]}) self.parser.phase = self.parser.phases["inBody"] return token startTagHandler = _utils.MethodDispatcher([ ("html", startTagHtml) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)]) endTagHandler.default = endTagOther class InFramesetPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset __slots__ = tuple() def processEOF(self): if self.tree.openElements[-1].name != "html": self.parser.parseError("eof-in-frameset") else: assert self.parser.innerHTML def processCharacters(self, token): self.parser.parseError("unexpected-char-in-frameset") def startTagFrameset(self, token): self.tree.insertElement(token) def startTagFrame(self, token): self.tree.insertElement(token) self.tree.openElements.pop() def startTagNoframes(self, token): return self.parser.phases["inBody"].processStartTag(token) def startTagOther(self, token): self.parser.parseError("unexpected-start-tag-in-frameset", {"name": token["name"]}) def endTagFrameset(self, token): if self.tree.openElements[-1].name == "html": # innerHTML case self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") else: self.tree.openElements.pop() if (not self.parser.innerHTML and self.tree.openElements[-1].name != "frameset"): # If we're not in innerHTML mode and the current node is not a # "frameset" element (anymore) then switch. self.parser.phase = self.parser.phases["afterFrameset"] def endTagOther(self, token): self.parser.parseError("unexpected-end-tag-in-frameset", {"name": token["name"]}) startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), ("frameset", startTagFrameset), ("frame", startTagFrame), ("noframes", startTagNoframes) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("frameset", endTagFrameset) ]) endTagHandler.default = endTagOther class AfterFramesetPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#after3 __slots__ = tuple() def processEOF(self): # Stop parsing pass def processCharacters(self, token): self.parser.parseError("unexpected-char-after-frameset") def startTagNoframes(self, token): return self.parser.phases["inHead"].processStartTag(token) def startTagOther(self, token): self.parser.parseError("unexpected-start-tag-after-frameset", {"name": token["name"]}) def endTagHtml(self, token): self.parser.phase = self.parser.phases["afterAfterFrameset"] def endTagOther(self, token): self.parser.parseError("unexpected-end-tag-after-frameset", {"name": token["name"]}) startTagHandler = _utils.MethodDispatcher([ ("html", Phase.startTagHtml), ("noframes", startTagNoframes) ]) startTagHandler.default = startTagOther endTagHandler = _utils.MethodDispatcher([ ("html", endTagHtml) ]) endTagHandler.default = endTagOther class AfterAfterBodyPhase(Phase): __slots__ = tuple() def processEOF(self): pass def processComment(self, token): self.tree.insertComment(token, self.tree.document) def processSpaceCharacters(self, token): return self.parser.phases["inBody"].processSpaceCharacters(token) def processCharacters(self, token): self.parser.parseError("expected-eof-but-got-char") self.parser.phase = self.parser.phases["inBody"] return token def startTagHtml(self, token): return self.parser.phases["inBody"].processStartTag(token) def startTagOther(self, token): self.parser.parseError("expected-eof-but-got-start-tag", {"name": token["name"]}) self.parser.phase = self.parser.phases["inBody"] return token def processEndTag(self, token): self.parser.parseError("expected-eof-but-got-end-tag", {"name": token["name"]}) self.parser.phase = self.parser.phases["inBody"] return token startTagHandler = _utils.MethodDispatcher([ ("html", startTagHtml) ]) startTagHandler.default = startTagOther class AfterAfterFramesetPhase(Phase): __slots__ = tuple() def processEOF(self): pass def processComment(self, token): self.tree.insertComment(token, self.tree.document) def processSpaceCharacters(self, token): return self.parser.phases["inBody"].processSpaceCharacters(token) def processCharacters(self, token): self.parser.parseError("expected-eof-but-got-char") def startTagHtml(self, token): return self.parser.phases["inBody"].processStartTag(token) def startTagNoFrames(self, token): return self.parser.phases["inHead"].processStartTag(token) def startTagOther(self, token): self.parser.parseError("expected-eof-but-got-start-tag", {"name": token["name"]}) def processEndTag(self, token): self.parser.parseError("expected-eof-but-got-end-tag", {"name": token["name"]}) startTagHandler = _utils.MethodDispatcher([ ("html", startTagHtml), ("noframes", startTagNoFrames) ]) startTagHandler.default = startTagOther # pylint:enable=unused-argument return { "initial": InitialPhase, "beforeHtml": BeforeHtmlPhase, "beforeHead": BeforeHeadPhase, "inHead": InHeadPhase, "inHeadNoscript": InHeadNoscriptPhase, "afterHead": AfterHeadPhase, "inBody": InBodyPhase, "text": TextPhase, "inTable": InTablePhase, "inTableText": InTableTextPhase, "inCaption": InCaptionPhase, "inColumnGroup": InColumnGroupPhase, "inTableBody": InTableBodyPhase, "inRow": InRowPhase, "inCell": InCellPhase, "inSelect": InSelectPhase, "inSelectInTable": InSelectInTablePhase, "inForeignContent": InForeignContentPhase, "afterBody": AfterBodyPhase, "inFrameset": InFramesetPhase, "afterFrameset": AfterFramesetPhase, "afterAfterBody": AfterAfterBodyPhase, "afterAfterFrameset": AfterAfterFramesetPhase, # XXX after after frameset } def adjust_attributes(token, replacements): needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) if needs_adjustment: token['data'] = type(token['data'])((replacements.get(k, k), v) for k, v in token['data'].items()) def impliedTagToken(name, type="EndTag", attributes=None, selfClosing=False): if attributes is None: attributes = {} return {"type": tokenTypes[type], "name": name, "data": attributes, "selfClosing": selfClosing} class ParseError(Exception): """Error in parsed document""" pass