1from __future__ import absolute_import, division, unicode_literals
2from six import with_metaclass
3
4import types
5
6from . import inputstream
7from . import tokenizer
8
9from . import treebuilders
10from .treebuilders._base import Marker
11
12from . import utils
13from . import constants
14from .constants import spaceCharacters, asciiUpper2Lower
15from .constants import specialElements
16from .constants import headingElements
17from .constants import cdataElements, rcdataElements
18from .constants import tokenTypes, ReparseException, namespaces
19from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
20from .constants import adjustForeignAttributes as adjustForeignAttributesMap
21from .constants import E
22
23
def parse(doc, treebuilder="etree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree

    doc - the markup to parse; handed unchanged to HTMLParser.parse

    treebuilder - name given to treebuilders.getTreeBuilder to pick the
    tree implementation

    encoding - forwarded to HTMLParser.parse

    namespaceHTMLElements - forwarded to the HTMLParser constructor

    Returns whatever HTMLParser.parse returns for the document.
    """
    builderClass = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builderClass,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, encoding=encoding)
30
31
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
                  namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree fragment

    doc - the markup to parse; handed unchanged to HTMLParser.parseFragment

    container - name of the element the fragment is parsed inside of

    treebuilder - name given to treebuilders.getTreeBuilder to pick the
    tree implementation

    encoding - forwarded to HTMLParser.parseFragment

    namespaceHTMLElements - forwarded to the HTMLParser constructor

    Returns whatever HTMLParser.parseFragment returns.
    """
    builderClass = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builderClass,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parseFragment(doc, container=container, encoding=encoding)
37
38
def method_decorator_metaclass(function):
    """Build a metaclass that applies `function` to every plain function
    defined in a class body.

    function - a decorator; it is called with each entry of the class
    dictionary that is a types.FunctionType and its return value replaces
    that entry. All other entries are left as they are.

    Returns the metaclass (a subclass of type).
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Work from a snapshot of the items so the values can be
            # rebound while looping; no key is added or removed.
            for memberName, member in list(classDict.items()):
                if isinstance(member, types.FunctionType):
                    classDict[memberName] = function(member)
            return type.__new__(meta, classname, bases, classDict)
    return Decorated
49
50
51class HTMLParser(object):
52    """HTML parser. Generates a tree structure from a stream of (possibly
53        malformed) HTML"""
54
55    def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
56                 strict=False, namespaceHTMLElements=True, debug=False):
57        """
58        strict - raise an exception when a parse error is encountered
59
60        tree - a treebuilder class controlling the type of tree that will be
61        returned. Built in treebuilders can be accessed through
62        html5lib.treebuilders.getTreeBuilder(treeType)
63
64        tokenizer - a class that provides a stream of tokens to the treebuilder.
65        This may be replaced for e.g. a sanitizer which converts some tags to
66        text
67        """
68
69        # Raise an exception on the first error encountered
70        self.strict = strict
71
72        if tree is None:
73            tree = treebuilders.getTreeBuilder("etree")
74        self.tree = tree(namespaceHTMLElements)
75        self.tokenizer_class = tokenizer
76        self.errors = []
77
78        self.phases = dict([(name, cls(self, self.tree)) for name, cls in
79                            getPhases(debug).items()])
80
81    def _parse(self, stream, innerHTML=False, container="div",
82               encoding=None, parseMeta=True, useChardet=True, **kwargs):
83
84        self.innerHTMLMode = innerHTML
85        self.container = container
86        self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
87                                              parseMeta=parseMeta,
88                                              useChardet=useChardet,
89                                              parser=self, **kwargs)
90        self.reset()
91
92        while True:
93            try:
94                self.mainLoop()
95                break
96            except ReparseException:
97                self.reset()
98
99    def reset(self):
100        self.tree.reset()
101        self.firstStartTag = False
102        self.errors = []
103        self.log = []  # only used with debug mode
104        # "quirks" / "limited quirks" / "no quirks"
105        self.compatMode = "no quirks"
106
107        if self.innerHTMLMode:
108            self.innerHTML = self.container.lower()
109
110            if self.innerHTML in cdataElements:
111                self.tokenizer.state = self.tokenizer.rcdataState
112            elif self.innerHTML in rcdataElements:
113                self.tokenizer.state = self.tokenizer.rawtextState
114            elif self.innerHTML == 'plaintext':
115                self.tokenizer.state = self.tokenizer.plaintextState
116            else:
117                # state already is data state
118                # self.tokenizer.state = self.tokenizer.dataState
119                pass
120            self.phase = self.phases["beforeHtml"]
121            self.phase.insertHtmlElement()
122            self.resetInsertionMode()
123        else:
124            self.innerHTML = False
125            self.phase = self.phases["initial"]
126
127        self.lastPhase = None
128
129        self.beforeRCDataPhase = None
130
131        self.framesetOK = True
132
133    @property
134    def documentEncoding(self):
135        """The name of the character encoding
136        that was used to decode the input stream,
137        or :obj:`None` if that is not determined yet.
138
139        """
140        if not hasattr(self, 'tokenizer'):
141            return None
142        return self.tokenizer.stream.charEncoding[0]
143
144    def isHTMLIntegrationPoint(self, element):
145        if (element.name == "annotation-xml" and
146                element.namespace == namespaces["mathml"]):
147            return ("encoding" in element.attributes and
148                    element.attributes["encoding"].translate(
149                        asciiUpper2Lower) in
150                    ("text/html", "application/xhtml+xml"))
151        else:
152            return (element.namespace, element.name) in htmlIntegrationPointElements
153
154    def isMathMLTextIntegrationPoint(self, element):
155        return (element.namespace, element.name) in mathmlTextIntegrationPointElements
156
157    def mainLoop(self):
158        CharactersToken = tokenTypes["Characters"]
159        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
160        StartTagToken = tokenTypes["StartTag"]
161        EndTagToken = tokenTypes["EndTag"]
162        CommentToken = tokenTypes["Comment"]
163        DoctypeToken = tokenTypes["Doctype"]
164        ParseErrorToken = tokenTypes["ParseError"]
165
166        for token in self.normalizedTokens():
167            new_token = token
168            while new_token is not None:
169                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
170                currentNodeNamespace = currentNode.namespace if currentNode else None
171                currentNodeName = currentNode.name if currentNode else None
172
173                type = new_token["type"]
174
175                if type == ParseErrorToken:
176                    self.parseError(new_token["data"], new_token.get("datavars", {}))
177                    new_token = None
178                else:
179                    if (len(self.tree.openElements) == 0 or
180                        currentNodeNamespace == self.tree.defaultNamespace or
181                        (self.isMathMLTextIntegrationPoint(currentNode) and
182                         ((type == StartTagToken and
183                           token["name"] not in frozenset(["mglyph", "malignmark"])) or
184                          type in (CharactersToken, SpaceCharactersToken))) or
185                        (currentNodeNamespace == namespaces["mathml"] and
186                         currentNodeName == "annotation-xml" and
187                         token["name"] == "svg") or
188                        (self.isHTMLIntegrationPoint(currentNode) and
189                         type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
190                        phase = self.phase
191                    else:
192                        phase = self.phases["inForeignContent"]
193
194                    if type == CharactersToken:
195                        new_token = phase.processCharacters(new_token)
196                    elif type == SpaceCharactersToken:
197                        new_token = phase.processSpaceCharacters(new_token)
198                    elif type == StartTagToken:
199                        new_token = phase.processStartTag(new_token)
200                    elif type == EndTagToken:
201                        new_token = phase.processEndTag(new_token)
202                    elif type == CommentToken:
203                        new_token = phase.processComment(new_token)
204                    elif type == DoctypeToken:
205                        new_token = phase.processDoctype(new_token)
206
207            if (type == StartTagToken and token["selfClosing"]
208                    and not token["selfClosingAcknowledged"]):
209                self.parseError("non-void-element-with-trailing-solidus",
210                                {"name": token["name"]})
211
212        # When the loop finishes it's EOF
213        reprocess = True
214        phases = []
215        while reprocess:
216            phases.append(self.phase)
217            reprocess = self.phase.processEOF()
218            if reprocess:
219                assert self.phase not in phases
220
221    def normalizedTokens(self):
222        for token in self.tokenizer:
223            yield self.normalizeToken(token)
224
225    def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
226        """Parse a HTML document into a well-formed tree
227
228        stream - a filelike object or string containing the HTML to be parsed
229
230        The optional encoding parameter must be a string that indicates
231        the encoding.  If specified, that encoding will be used,
232        regardless of any BOM or later declaration (such as in a meta
233        element)
234        """
235        self._parse(stream, innerHTML=False, encoding=encoding,
236                    parseMeta=parseMeta, useChardet=useChardet)
237        return self.tree.getDocument()
238
239    def parseFragment(self, stream, container="div", encoding=None,
240                      parseMeta=False, useChardet=True):
241        """Parse a HTML fragment into a well-formed tree fragment
242
243        container - name of the element we're setting the innerHTML property
244        if set to None, default to 'div'
245
246        stream - a filelike object or string containing the HTML to be parsed
247
248        The optional encoding parameter must be a string that indicates
249        the encoding.  If specified, that encoding will be used,
250        regardless of any BOM or later declaration (such as in a meta
251        element)
252        """
253        self._parse(stream, True, container=container, encoding=encoding)
254        return self.tree.getFragment()
255
256    def parseError(self, errorcode="XXX-undefined-error", datavars={}):
257        # XXX The idea is to make errorcode mandatory.
258        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
259        if self.strict:
260            raise ParseError(E[errorcode] % datavars)
261
262    def normalizeToken(self, token):
263        """ HTML5 specific normalizations to the token stream """
264
265        if token["type"] == tokenTypes["StartTag"]:
266            token["data"] = dict(token["data"][::-1])
267
268        return token
269
270    def adjustMathMLAttributes(self, token):
271        replacements = {"definitionurl": "definitionURL"}
272        for k, v in replacements.items():
273            if k in token["data"]:
274                token["data"][v] = token["data"][k]
275                del token["data"][k]
276
277    def adjustSVGAttributes(self, token):
278        replacements = {
279            "attributename": "attributeName",
280            "attributetype": "attributeType",
281            "basefrequency": "baseFrequency",
282            "baseprofile": "baseProfile",
283            "calcmode": "calcMode",
284            "clippathunits": "clipPathUnits",
285            "contentscripttype": "contentScriptType",
286            "contentstyletype": "contentStyleType",
287            "diffuseconstant": "diffuseConstant",
288            "edgemode": "edgeMode",
289            "externalresourcesrequired": "externalResourcesRequired",
290            "filterres": "filterRes",
291            "filterunits": "filterUnits",
292            "glyphref": "glyphRef",
293            "gradienttransform": "gradientTransform",
294            "gradientunits": "gradientUnits",
295            "kernelmatrix": "kernelMatrix",
296            "kernelunitlength": "kernelUnitLength",
297            "keypoints": "keyPoints",
298            "keysplines": "keySplines",
299            "keytimes": "keyTimes",
300            "lengthadjust": "lengthAdjust",
301            "limitingconeangle": "limitingConeAngle",
302            "markerheight": "markerHeight",
303            "markerunits": "markerUnits",
304            "markerwidth": "markerWidth",
305            "maskcontentunits": "maskContentUnits",
306            "maskunits": "maskUnits",
307            "numoctaves": "numOctaves",
308            "pathlength": "pathLength",
309            "patterncontentunits": "patternContentUnits",
310            "patterntransform": "patternTransform",
311            "patternunits": "patternUnits",
312            "pointsatx": "pointsAtX",
313            "pointsaty": "pointsAtY",
314            "pointsatz": "pointsAtZ",
315            "preservealpha": "preserveAlpha",
316            "preserveaspectratio": "preserveAspectRatio",
317            "primitiveunits": "primitiveUnits",
318            "refx": "refX",
319            "refy": "refY",
320            "repeatcount": "repeatCount",
321            "repeatdur": "repeatDur",
322            "requiredextensions": "requiredExtensions",
323            "requiredfeatures": "requiredFeatures",
324            "specularconstant": "specularConstant",
325            "specularexponent": "specularExponent",
326            "spreadmethod": "spreadMethod",
327            "startoffset": "startOffset",
328            "stddeviation": "stdDeviation",
329            "stitchtiles": "stitchTiles",
330            "surfacescale": "surfaceScale",
331            "systemlanguage": "systemLanguage",
332            "tablevalues": "tableValues",
333            "targetx": "targetX",
334            "targety": "targetY",
335            "textlength": "textLength",
336            "viewbox": "viewBox",
337            "viewtarget": "viewTarget",
338            "xchannelselector": "xChannelSelector",
339            "ychannelselector": "yChannelSelector",
340            "zoomandpan": "zoomAndPan"
341        }
342        for originalName in list(token["data"].keys()):
343            if originalName in replacements:
344                svgName = replacements[originalName]
345                token["data"][svgName] = token["data"][originalName]
346                del token["data"][originalName]
347
348    def adjustForeignAttributes(self, token):
349        replacements = adjustForeignAttributesMap
350
351        for originalName in token["data"].keys():
352            if originalName in replacements:
353                foreignName = replacements[originalName]
354                token["data"][foreignName] = token["data"][originalName]
355                del token["data"][originalName]
356
    def reparseTokenNormal(self, token):
        # NOTE(review): this calls self.parser.phase() and ignores `token`.
        # None of the HTMLParser methods visible here assigns self.parser,
        # so the call looks like it would fail -- confirm whether this
        # method is still used before relying on it.
        self.parser.phase()
359
360    def resetInsertionMode(self):
361        # The name of this method is mostly historical. (It's also used in the
362        # specification.)
363        last = False
364        newModes = {
365            "select": "inSelect",
366            "td": "inCell",
367            "th": "inCell",
368            "tr": "inRow",
369            "tbody": "inTableBody",
370            "thead": "inTableBody",
371            "tfoot": "inTableBody",
372            "caption": "inCaption",
373            "colgroup": "inColumnGroup",
374            "table": "inTable",
375            "head": "inBody",
376            "body": "inBody",
377            "frameset": "inFrameset",
378            "html": "beforeHead"
379        }
380        for node in self.tree.openElements[::-1]:
381            nodeName = node.name
382            new_phase = None
383            if node == self.tree.openElements[0]:
384                assert self.innerHTML
385                last = True
386                nodeName = self.innerHTML
387            # Check for conditions that should only happen in the innerHTML
388            # case
389            if nodeName in ("select", "colgroup", "head", "html"):
390                assert self.innerHTML
391
392            if not last and node.namespace != self.tree.defaultNamespace:
393                continue
394
395            if nodeName in newModes:
396                new_phase = self.phases[newModes[nodeName]]
397                break
398            elif last:
399                new_phase = self.phases["inBody"]
400                break
401
402        self.phase = new_phase
403
404    def parseRCDataRawtext(self, token, contentType):
405        """Generic RCDATA/RAWTEXT Parsing algorithm
406        contentType - RCDATA or RAWTEXT
407        """
408        assert contentType in ("RAWTEXT", "RCDATA")
409
410        self.tree.insertElement(token)
411
412        if contentType == "RAWTEXT":
413            self.tokenizer.state = self.tokenizer.rawtextState
414        else:
415            self.tokenizer.state = self.tokenizer.rcdataState
416
417        self.originalPhase = self.phase
418
419        self.phase = self.phases["text"]
420
421
422def getPhases(debug):
423    def log(function):
424        """Logger that records which phase processes each token"""
425        type_names = dict((value, key) for key, value in
426                          constants.tokenTypes.items())
427
428        def wrapped(self, *args, **kwargs):
429            if function.__name__.startswith("process") and len(args) > 0:
430                token = args[0]
431                try:
432                    info = {"type": type_names[token['type']]}
433                except:
434                    raise
435                if token['type'] in constants.tagTokenTypes:
436                    info["name"] = token['name']
437
438                self.parser.log.append((self.parser.tokenizer.state.__name__,
439                                        self.parser.phase.__class__.__name__,
440                                        self.__class__.__name__,
441                                        function.__name__,
442                                        info))
443                return function(self, *args, **kwargs)
444            else:
445                return function(self, *args, **kwargs)
446        return wrapped
447
448    def getMetaclass(use_metaclass, metaclass_func):
449        if use_metaclass:
450            return method_decorator_metaclass(metaclass_func)
451        else:
452            return type
453
454    class Phase(with_metaclass(getMetaclass(debug, log))):
455        """Base class for helper object that implements each phase of processing
456        """
457
458        def __init__(self, parser, tree):
459            self.parser = parser
460            self.tree = tree
461
462        def processEOF(self):
463            raise NotImplementedError
464
465        def processComment(self, token):
466            # For most phases the following is correct. Where it's not it will be
467            # overridden.
468            self.tree.insertComment(token, self.tree.openElements[-1])
469
470        def processDoctype(self, token):
471            self.parser.parseError("unexpected-doctype")
472
473        def processCharacters(self, token):
474            self.tree.insertText(token["data"])
475
476        def processSpaceCharacters(self, token):
477            self.tree.insertText(token["data"])
478
479        def processStartTag(self, token):
480            return self.startTagHandler[token["name"]](token)
481
482        def startTagHtml(self, token):
483            if not self.parser.firstStartTag and token["name"] == "html":
484                self.parser.parseError("non-html-root")
485            # XXX Need a check here to see if the first start tag token emitted is
486            # this token... If it's not, invoke self.parser.parseError().
487            for attr, value in token["data"].items():
488                if attr not in self.tree.openElements[0].attributes:
489                    self.tree.openElements[0].attributes[attr] = value
490            self.parser.firstStartTag = False
491
492        def processEndTag(self, token):
493            return self.endTagHandler[token["name"]](token)
494
495    class InitialPhase(Phase):
496        def processSpaceCharacters(self, token):
497            pass
498
499        def processComment(self, token):
500            self.tree.insertComment(token, self.tree.document)
501
502        def processDoctype(self, token):
503            name = token["name"]
504            publicId = token["publicId"]
505            systemId = token["systemId"]
506            correct = token["correct"]
507
508            if (name != "html" or publicId is not None or
509                    systemId is not None and systemId != "about:legacy-compat"):
510                self.parser.parseError("unknown-doctype")
511
512            if publicId is None:
513                publicId = ""
514
515            self.tree.insertDoctype(token)
516
517            if publicId != "":
518                publicId = publicId.translate(asciiUpper2Lower)
519
520            if (not correct or token["name"] != "html"
521                or publicId.startswith(
522                    ("+//silmaril//dtd html pro v0r11 19970101//",
523                     "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
524                     "-//as//dtd html 3.0 aswedit + extensions//",
525                     "-//ietf//dtd html 2.0 level 1//",
526                     "-//ietf//dtd html 2.0 level 2//",
527                     "-//ietf//dtd html 2.0 strict level 1//",
528                     "-//ietf//dtd html 2.0 strict level 2//",
529                     "-//ietf//dtd html 2.0 strict//",
530                     "-//ietf//dtd html 2.0//",
531                     "-//ietf//dtd html 2.1e//",
532                     "-//ietf//dtd html 3.0//",
533                     "-//ietf//dtd html 3.2 final//",
534                     "-//ietf//dtd html 3.2//",
535                     "-//ietf//dtd html 3//",
536                     "-//ietf//dtd html level 0//",
537                     "-//ietf//dtd html level 1//",
538                     "-//ietf//dtd html level 2//",
539                     "-//ietf//dtd html level 3//",
540                     "-//ietf//dtd html strict level 0//",
541                     "-//ietf//dtd html strict level 1//",
542                     "-//ietf//dtd html strict level 2//",
543                     "-//ietf//dtd html strict level 3//",
544                     "-//ietf//dtd html strict//",
545                     "-//ietf//dtd html//",
546                     "-//metrius//dtd metrius presentational//",
547                     "-//microsoft//dtd internet explorer 2.0 html strict//",
548                     "-//microsoft//dtd internet explorer 2.0 html//",
549                     "-//microsoft//dtd internet explorer 2.0 tables//",
550                     "-//microsoft//dtd internet explorer 3.0 html strict//",
551                     "-//microsoft//dtd internet explorer 3.0 html//",
552                     "-//microsoft//dtd internet explorer 3.0 tables//",
553                     "-//netscape comm. corp.//dtd html//",
554                     "-//netscape comm. corp.//dtd strict html//",
555                     "-//o'reilly and associates//dtd html 2.0//",
556                     "-//o'reilly and associates//dtd html extended 1.0//",
557                     "-//o'reilly and associates//dtd html extended relaxed 1.0//",
558                     "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
559                     "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
560                     "-//spyglass//dtd html 2.0 extended//",
561                     "-//sq//dtd html 2.0 hotmetal + extensions//",
562                     "-//sun microsystems corp.//dtd hotjava html//",
563                     "-//sun microsystems corp.//dtd hotjava strict html//",
564                     "-//w3c//dtd html 3 1995-03-24//",
565                     "-//w3c//dtd html 3.2 draft//",
566                     "-//w3c//dtd html 3.2 final//",
567                     "-//w3c//dtd html 3.2//",
568                     "-//w3c//dtd html 3.2s draft//",
569                     "-//w3c//dtd html 4.0 frameset//",
570                     "-//w3c//dtd html 4.0 transitional//",
571                     "-//w3c//dtd html experimental 19960712//",
572                     "-//w3c//dtd html experimental 970421//",
573                     "-//w3c//dtd w3 html//",
574                     "-//w3o//dtd w3 html 3.0//",
575                     "-//webtechs//dtd mozilla html 2.0//",
576                     "-//webtechs//dtd mozilla html//"))
577                or publicId in
578                    ("-//w3o//dtd w3 html strict 3.0//en//",
579                     "-/w3c/dtd html 4.0 transitional/en",
580                     "html")
581                or publicId.startswith(
582                    ("-//w3c//dtd html 4.01 frameset//",
583                     "-//w3c//dtd html 4.01 transitional//")) and
584                    systemId is None
585                    or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
586                self.parser.compatMode = "quirks"
587            elif (publicId.startswith(
588                    ("-//w3c//dtd xhtml 1.0 frameset//",
589                     "-//w3c//dtd xhtml 1.0 transitional//"))
590                  or publicId.startswith(
591                      ("-//w3c//dtd html 4.01 frameset//",
592                       "-//w3c//dtd html 4.01 transitional//")) and
593                  systemId is not None):
594                self.parser.compatMode = "limited quirks"
595
596            self.parser.phase = self.parser.phases["beforeHtml"]
597
598        def anythingElse(self):
599            self.parser.compatMode = "quirks"
600            self.parser.phase = self.parser.phases["beforeHtml"]
601
602        def processCharacters(self, token):
603            self.parser.parseError("expected-doctype-but-got-chars")
604            self.anythingElse()
605            return token
606
607        def processStartTag(self, token):
608            self.parser.parseError("expected-doctype-but-got-start-tag",
609                                   {"name": token["name"]})
610            self.anythingElse()
611            return token
612
613        def processEndTag(self, token):
614            self.parser.parseError("expected-doctype-but-got-end-tag",
615                                   {"name": token["name"]})
616            self.anythingElse()
617            return token
618
619        def processEOF(self):
620            self.parser.parseError("expected-doctype-but-got-eof")
621            self.anythingElse()
622            return True
623
624    class BeforeHtmlPhase(Phase):
625        # helper methods
626        def insertHtmlElement(self):
627            self.tree.insertRoot(impliedTagToken("html", "StartTag"))
628            self.parser.phase = self.parser.phases["beforeHead"]
629
630        # other
631        def processEOF(self):
632            self.insertHtmlElement()
633            return True
634
635        def processComment(self, token):
636            self.tree.insertComment(token, self.tree.document)
637
638        def processSpaceCharacters(self, token):
639            pass
640
641        def processCharacters(self, token):
642            self.insertHtmlElement()
643            return token
644
645        def processStartTag(self, token):
646            if token["name"] == "html":
647                self.parser.firstStartTag = True
648            self.insertHtmlElement()
649            return token
650
651        def processEndTag(self, token):
652            if token["name"] not in ("head", "body", "html", "br"):
653                self.parser.parseError("unexpected-end-tag-before-html",
654                                       {"name": token["name"]})
655            else:
656                self.insertHtmlElement()
657                return token
658
659    class BeforeHeadPhase(Phase):
660        def __init__(self, parser, tree):
661            Phase.__init__(self, parser, tree)
662
663            self.startTagHandler = utils.MethodDispatcher([
664                ("html", self.startTagHtml),
665                ("head", self.startTagHead)
666            ])
667            self.startTagHandler.default = self.startTagOther
668
669            self.endTagHandler = utils.MethodDispatcher([
670                (("head", "body", "html", "br"), self.endTagImplyHead)
671            ])
672            self.endTagHandler.default = self.endTagOther
673
674        def processEOF(self):
675            self.startTagHead(impliedTagToken("head", "StartTag"))
676            return True
677
678        def processSpaceCharacters(self, token):
679            pass
680
681        def processCharacters(self, token):
682            self.startTagHead(impliedTagToken("head", "StartTag"))
683            return token
684
685        def startTagHtml(self, token):
686            return self.parser.phases["inBody"].processStartTag(token)
687
688        def startTagHead(self, token):
689            self.tree.insertElement(token)
690            self.tree.headPointer = self.tree.openElements[-1]
691            self.parser.phase = self.parser.phases["inHead"]
692
693        def startTagOther(self, token):
694            self.startTagHead(impliedTagToken("head", "StartTag"))
695            return token
696
697        def endTagImplyHead(self, token):
698            self.startTagHead(impliedTagToken("head", "StartTag"))
699            return token
700
701        def endTagOther(self, token):
702            self.parser.parseError("end-tag-after-implied-root",
703                                   {"name": token["name"]})
704
705    class InHeadPhase(Phase):
706        def __init__(self, parser, tree):
707            Phase.__init__(self, parser, tree)
708
709            self.startTagHandler = utils.MethodDispatcher([
710                ("html", self.startTagHtml),
711                ("title", self.startTagTitle),
712                (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
713                ("script", self.startTagScript),
714                (("base", "basefont", "bgsound", "command", "link"),
715                 self.startTagBaseLinkCommand),
716                ("meta", self.startTagMeta),
717                ("head", self.startTagHead)
718            ])
719            self.startTagHandler.default = self.startTagOther
720
721            self. endTagHandler = utils.MethodDispatcher([
722                ("head", self.endTagHead),
723                (("br", "html", "body"), self.endTagHtmlBodyBr)
724            ])
725            self.endTagHandler.default = self.endTagOther
726
727        # the real thing
        def processEOF(self):
            """Handle end-of-file by calling anythingElse().

            Returns True, which makes the main loop run EOF processing
            again with whatever phase is current afterwards.
            """
            self.anythingElse()
            return True
731
        def processCharacters(self, token):
            """Handle a characters token by calling anythingElse().

            Returns the token, which makes the main loop process it again.
            """
            self.anythingElse()
            return token
735
736        def startTagHtml(self, token):
737            return self.parser.phases["inBody"].processStartTag(token)
738
        def startTagHead(self, token):
            """Report a head start tag seen in this phase as a parse error.

            Nothing is inserted and None is returned, so the token is not
            processed again.
            """
            self.parser.parseError("two-heads-are-not-better-than-one")
741
742        def startTagBaseLinkCommand(self, token):
743            self.tree.insertElement(token)
744            self.tree.openElements.pop()
745            token["selfClosingAcknowledged"] = True
746
747        def startTagMeta(self, token):
748            self.tree.insertElement(token)
749            self.tree.openElements.pop()
750            token["selfClosingAcknowledged"] = True
751
752            attributes = token["data"]
753            if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
754                if "charset" in attributes:
755                    self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
756                elif ("content" in attributes and
757                      "http-equiv" in attributes and
758                      attributes["http-equiv"].lower() == "content-type"):
759                    # Encoding it as UTF-8 here is a hack, as really we should pass
760                    # the abstract Unicode string, and just use the
761                    # ContentAttrParser on that, but using UTF-8 allows all chars
762                    # to be encoded and as a ASCII-superset works.
763                    data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
764                    parser = inputstream.ContentAttrParser(data)
765                    codec = parser.parse()
766                    self.parser.tokenizer.stream.changeEncoding(codec)
767
768        def startTagTitle(self, token):
769            self.parser.parseRCDataRawtext(token, "RCDATA")
770
771        def startTagNoScriptNoFramesStyle(self, token):
772            # Need to decide whether to implement the scripting-disabled case
773            self.parser.parseRCDataRawtext(token, "RAWTEXT")
774
775        def startTagScript(self, token):
776            self.tree.insertElement(token)
777            self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
778            self.parser.originalPhase = self.parser.phase
779            self.parser.phase = self.parser.phases["text"]
780
781        def startTagOther(self, token):
782            self.anythingElse()
783            return token
784
785        def endTagHead(self, token):
786            node = self.parser.tree.openElements.pop()
787            assert node.name == "head", "Expected head got %s" % node.name
788            self.parser.phase = self.parser.phases["afterHead"]
789
790        def endTagHtmlBodyBr(self, token):
791            self.anythingElse()
792            return token
793
794        def endTagOther(self, token):
795            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
796
797        def anythingElse(self):
798            self.endTagHead(impliedTagToken("head"))
799
800    # XXX If we implement a parser for which scripting is disabled we need to
801    # implement this phase.
802    #
803    # class InHeadNoScriptPhase(Phase):
804    class AfterHeadPhase(Phase):
805        def __init__(self, parser, tree):
806            Phase.__init__(self, parser, tree)
807
808            self.startTagHandler = utils.MethodDispatcher([
809                ("html", self.startTagHtml),
810                ("body", self.startTagBody),
811                ("frameset", self.startTagFrameset),
812                (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
813                  "style", "title"),
814                 self.startTagFromHead),
815                ("head", self.startTagHead)
816            ])
817            self.startTagHandler.default = self.startTagOther
818            self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"),
819                                                          self.endTagHtmlBodyBr)])
820            self.endTagHandler.default = self.endTagOther
821
822        def processEOF(self):
823            self.anythingElse()
824            return True
825
826        def processCharacters(self, token):
827            self.anythingElse()
828            return token
829
830        def startTagHtml(self, token):
831            return self.parser.phases["inBody"].processStartTag(token)
832
833        def startTagBody(self, token):
834            self.parser.framesetOK = False
835            self.tree.insertElement(token)
836            self.parser.phase = self.parser.phases["inBody"]
837
838        def startTagFrameset(self, token):
839            self.tree.insertElement(token)
840            self.parser.phase = self.parser.phases["inFrameset"]
841
842        def startTagFromHead(self, token):
843            self.parser.parseError("unexpected-start-tag-out-of-my-head",
844                                   {"name": token["name"]})
845            self.tree.openElements.append(self.tree.headPointer)
846            self.parser.phases["inHead"].processStartTag(token)
847            for node in self.tree.openElements[::-1]:
848                if node.name == "head":
849                    self.tree.openElements.remove(node)
850                    break
851
852        def startTagHead(self, token):
853            self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
854
855        def startTagOther(self, token):
856            self.anythingElse()
857            return token
858
859        def endTagHtmlBodyBr(self, token):
860            self.anythingElse()
861            return token
862
863        def endTagOther(self, token):
864            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
865
866        def anythingElse(self):
867            self.tree.insertElement(impliedTagToken("body", "StartTag"))
868            self.parser.phase = self.parser.phases["inBody"]
869            self.parser.framesetOK = True
870
871    class InBodyPhase(Phase):
872        # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
873        # the really-really-really-very crazy mode
874        def __init__(self, parser, tree):
875            Phase.__init__(self, parser, tree)
876
877            # Keep a ref to this for special handling of whitespace in <pre>
878            self.processSpaceCharactersNonPre = self.processSpaceCharacters
879
880            self.startTagHandler = utils.MethodDispatcher([
881                ("html", self.startTagHtml),
882                (("base", "basefont", "bgsound", "command", "link", "meta",
883                  "script", "style", "title"),
884                 self.startTagProcessInHead),
885                ("body", self.startTagBody),
886                ("frameset", self.startTagFrameset),
887                (("address", "article", "aside", "blockquote", "center", "details",
888                  "details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
889                  "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
890                  "section", "summary", "ul"),
891                 self.startTagCloseP),
892                (headingElements, self.startTagHeading),
893                (("pre", "listing"), self.startTagPreListing),
894                ("form", self.startTagForm),
895                (("li", "dd", "dt"), self.startTagListItem),
896                ("plaintext", self.startTagPlaintext),
897                ("a", self.startTagA),
898                (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
899                  "strong", "tt", "u"), self.startTagFormatting),
900                ("nobr", self.startTagNobr),
901                ("button", self.startTagButton),
902                (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
903                ("xmp", self.startTagXmp),
904                ("table", self.startTagTable),
905                (("area", "br", "embed", "img", "keygen", "wbr"),
906                 self.startTagVoidFormatting),
907                (("param", "source", "track"), self.startTagParamSource),
908                ("input", self.startTagInput),
909                ("hr", self.startTagHr),
910                ("image", self.startTagImage),
911                ("isindex", self.startTagIsIndex),
912                ("textarea", self.startTagTextarea),
913                ("iframe", self.startTagIFrame),
914                (("noembed", "noframes", "noscript"), self.startTagRawtext),
915                ("select", self.startTagSelect),
916                (("rp", "rt"), self.startTagRpRt),
917                (("option", "optgroup"), self.startTagOpt),
918                (("math"), self.startTagMath),
919                (("svg"), self.startTagSvg),
920                (("caption", "col", "colgroup", "frame", "head",
921                  "tbody", "td", "tfoot", "th", "thead",
922                  "tr"), self.startTagMisplaced)
923            ])
924            self.startTagHandler.default = self.startTagOther
925
926            self.endTagHandler = utils.MethodDispatcher([
927                ("body", self.endTagBody),
928                ("html", self.endTagHtml),
929                (("address", "article", "aside", "blockquote", "button", "center",
930                  "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
931                  "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
932                  "section", "summary", "ul"), self.endTagBlock),
933                ("form", self.endTagForm),
934                ("p", self.endTagP),
935                (("dd", "dt", "li"), self.endTagListItem),
936                (headingElements, self.endTagHeading),
937                (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
938                  "strike", "strong", "tt", "u"), self.endTagFormatting),
939                (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
940                ("br", self.endTagBr),
941            ])
942            self.endTagHandler.default = self.endTagOther
943
944        def isMatchingFormattingElement(self, node1, node2):
945            if node1.name != node2.name or node1.namespace != node2.namespace:
946                return False
947            elif len(node1.attributes) != len(node2.attributes):
948                return False
949            else:
950                attributes1 = sorted(node1.attributes.items())
951                attributes2 = sorted(node2.attributes.items())
952                for attr1, attr2 in zip(attributes1, attributes2):
953                    if attr1 != attr2:
954                        return False
955            return True
956
957        # helper
958        def addFormattingElement(self, token):
959            self.tree.insertElement(token)
960            element = self.tree.openElements[-1]
961
962            matchingElements = []
963            for node in self.tree.activeFormattingElements[::-1]:
964                if node is Marker:
965                    break
966                elif self.isMatchingFormattingElement(node, element):
967                    matchingElements.append(node)
968
969            assert len(matchingElements) <= 3
970            if len(matchingElements) == 3:
971                self.tree.activeFormattingElements.remove(matchingElements[-1])
972            self.tree.activeFormattingElements.append(element)
973
974        # the real deal
975        def processEOF(self):
976            allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
977                                          "tfoot", "th", "thead", "tr", "body",
978                                          "html"))
979            for node in self.tree.openElements[::-1]:
980                if node.name not in allowed_elements:
981                    self.parser.parseError("expected-closing-tag-but-got-eof")
982                    break
983            # Stop parsing
984
985        def processSpaceCharactersDropNewline(self, token):
986            # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
987            # want to drop leading newlines
988            data = token["data"]
989            self.processSpaceCharacters = self.processSpaceCharactersNonPre
990            if (data.startswith("\n") and
991                self.tree.openElements[-1].name in ("pre", "listing", "textarea")
992                    and not self.tree.openElements[-1].hasContent()):
993                data = data[1:]
994            if data:
995                self.tree.reconstructActiveFormattingElements()
996                self.tree.insertText(data)
997
998        def processCharacters(self, token):
999            if token["data"] == "\u0000":
1000                # The tokenizer should always emit null on its own
1001                return
1002            self.tree.reconstructActiveFormattingElements()
1003            self.tree.insertText(token["data"])
1004            # This must be bad for performance
1005            if (self.parser.framesetOK and
1006                any([char not in spaceCharacters
1007                     for char in token["data"]])):
1008                self.parser.framesetOK = False
1009
1010        def processSpaceCharacters(self, token):
1011            self.tree.reconstructActiveFormattingElements()
1012            self.tree.insertText(token["data"])
1013
1014        def startTagProcessInHead(self, token):
1015            return self.parser.phases["inHead"].processStartTag(token)
1016
1017        def startTagBody(self, token):
1018            self.parser.parseError("unexpected-start-tag", {"name": "body"})
1019            if (len(self.tree.openElements) == 1
1020                    or self.tree.openElements[1].name != "body"):
1021                assert self.parser.innerHTML
1022            else:
1023                self.parser.framesetOK = False
1024                for attr, value in token["data"].items():
1025                    if attr not in self.tree.openElements[1].attributes:
1026                        self.tree.openElements[1].attributes[attr] = value
1027
1028        def startTagFrameset(self, token):
1029            self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
1030            if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
1031                assert self.parser.innerHTML
1032            elif not self.parser.framesetOK:
1033                pass
1034            else:
1035                if self.tree.openElements[1].parent:
1036                    self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
1037                while self.tree.openElements[-1].name != "html":
1038                    self.tree.openElements.pop()
1039                self.tree.insertElement(token)
1040                self.parser.phase = self.parser.phases["inFrameset"]
1041
1042        def startTagCloseP(self, token):
1043            if self.tree.elementInScope("p", variant="button"):
1044                self.endTagP(impliedTagToken("p"))
1045            self.tree.insertElement(token)
1046
1047        def startTagPreListing(self, token):
1048            if self.tree.elementInScope("p", variant="button"):
1049                self.endTagP(impliedTagToken("p"))
1050            self.tree.insertElement(token)
1051            self.parser.framesetOK = False
1052            self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1053
1054        def startTagForm(self, token):
1055            if self.tree.formPointer:
1056                self.parser.parseError("unexpected-start-tag", {"name": "form"})
1057            else:
1058                if self.tree.elementInScope("p", variant="button"):
1059                    self.endTagP(impliedTagToken("p"))
1060                self.tree.insertElement(token)
1061                self.tree.formPointer = self.tree.openElements[-1]
1062
1063        def startTagListItem(self, token):
1064            self.parser.framesetOK = False
1065
1066            stopNamesMap = {"li": ["li"],
1067                            "dt": ["dt", "dd"],
1068                            "dd": ["dt", "dd"]}
1069            stopNames = stopNamesMap[token["name"]]
1070            for node in reversed(self.tree.openElements):
1071                if node.name in stopNames:
1072                    self.parser.phase.processEndTag(
1073                        impliedTagToken(node.name, "EndTag"))
1074                    break
1075                if (node.nameTuple in specialElements and
1076                        node.name not in ("address", "div", "p")):
1077                    break
1078
1079            if self.tree.elementInScope("p", variant="button"):
1080                self.parser.phase.processEndTag(
1081                    impliedTagToken("p", "EndTag"))
1082
1083            self.tree.insertElement(token)
1084
1085        def startTagPlaintext(self, token):
1086            if self.tree.elementInScope("p", variant="button"):
1087                self.endTagP(impliedTagToken("p"))
1088            self.tree.insertElement(token)
1089            self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
1090
1091        def startTagHeading(self, token):
1092            if self.tree.elementInScope("p", variant="button"):
1093                self.endTagP(impliedTagToken("p"))
1094            if self.tree.openElements[-1].name in headingElements:
1095                self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1096                self.tree.openElements.pop()
1097            self.tree.insertElement(token)
1098
1099        def startTagA(self, token):
1100            afeAElement = self.tree.elementInActiveFormattingElements("a")
1101            if afeAElement:
1102                self.parser.parseError("unexpected-start-tag-implies-end-tag",
1103                                       {"startName": "a", "endName": "a"})
1104                self.endTagFormatting(impliedTagToken("a"))
1105                if afeAElement in self.tree.openElements:
1106                    self.tree.openElements.remove(afeAElement)
1107                if afeAElement in self.tree.activeFormattingElements:
1108                    self.tree.activeFormattingElements.remove(afeAElement)
1109            self.tree.reconstructActiveFormattingElements()
1110            self.addFormattingElement(token)
1111
1112        def startTagFormatting(self, token):
1113            self.tree.reconstructActiveFormattingElements()
1114            self.addFormattingElement(token)
1115
1116        def startTagNobr(self, token):
1117            self.tree.reconstructActiveFormattingElements()
1118            if self.tree.elementInScope("nobr"):
1119                self.parser.parseError("unexpected-start-tag-implies-end-tag",
1120                                       {"startName": "nobr", "endName": "nobr"})
1121                self.processEndTag(impliedTagToken("nobr"))
1122                # XXX Need tests that trigger the following
1123                self.tree.reconstructActiveFormattingElements()
1124            self.addFormattingElement(token)
1125
1126        def startTagButton(self, token):
1127            if self.tree.elementInScope("button"):
1128                self.parser.parseError("unexpected-start-tag-implies-end-tag",
1129                                       {"startName": "button", "endName": "button"})
1130                self.processEndTag(impliedTagToken("button"))
1131                return token
1132            else:
1133                self.tree.reconstructActiveFormattingElements()
1134                self.tree.insertElement(token)
1135                self.parser.framesetOK = False
1136
1137        def startTagAppletMarqueeObject(self, token):
1138            self.tree.reconstructActiveFormattingElements()
1139            self.tree.insertElement(token)
1140            self.tree.activeFormattingElements.append(Marker)
1141            self.parser.framesetOK = False
1142
1143        def startTagXmp(self, token):
1144            if self.tree.elementInScope("p", variant="button"):
1145                self.endTagP(impliedTagToken("p"))
1146            self.tree.reconstructActiveFormattingElements()
1147            self.parser.framesetOK = False
1148            self.parser.parseRCDataRawtext(token, "RAWTEXT")
1149
1150        def startTagTable(self, token):
1151            if self.parser.compatMode != "quirks":
1152                if self.tree.elementInScope("p", variant="button"):
1153                    self.processEndTag(impliedTagToken("p"))
1154            self.tree.insertElement(token)
1155            self.parser.framesetOK = False
1156            self.parser.phase = self.parser.phases["inTable"]
1157
1158        def startTagVoidFormatting(self, token):
1159            self.tree.reconstructActiveFormattingElements()
1160            self.tree.insertElement(token)
1161            self.tree.openElements.pop()
1162            token["selfClosingAcknowledged"] = True
1163            self.parser.framesetOK = False
1164
1165        def startTagInput(self, token):
1166            framesetOK = self.parser.framesetOK
1167            self.startTagVoidFormatting(token)
1168            if ("type" in token["data"] and
1169                    token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1170                # input type=hidden doesn't change framesetOK
1171                self.parser.framesetOK = framesetOK
1172
1173        def startTagParamSource(self, token):
1174            self.tree.insertElement(token)
1175            self.tree.openElements.pop()
1176            token["selfClosingAcknowledged"] = True
1177
1178        def startTagHr(self, token):
1179            if self.tree.elementInScope("p", variant="button"):
1180                self.endTagP(impliedTagToken("p"))
1181            self.tree.insertElement(token)
1182            self.tree.openElements.pop()
1183            token["selfClosingAcknowledged"] = True
1184            self.parser.framesetOK = False
1185
1186        def startTagImage(self, token):
1187            # No really...
1188            self.parser.parseError("unexpected-start-tag-treated-as",
1189                                   {"originalName": "image", "newName": "img"})
1190            self.processStartTag(impliedTagToken("img", "StartTag",
1191                                                 attributes=token["data"],
1192                                                 selfClosing=token["selfClosing"]))
1193
1194        def startTagIsIndex(self, token):
1195            self.parser.parseError("deprecated-tag", {"name": "isindex"})
1196            if self.tree.formPointer:
1197                return
1198            form_attrs = {}
1199            if "action" in token["data"]:
1200                form_attrs["action"] = token["data"]["action"]
1201            self.processStartTag(impliedTagToken("form", "StartTag",
1202                                                 attributes=form_attrs))
1203            self.processStartTag(impliedTagToken("hr", "StartTag"))
1204            self.processStartTag(impliedTagToken("label", "StartTag"))
1205            # XXX Localization ...
1206            if "prompt" in token["data"]:
1207                prompt = token["data"]["prompt"]
1208            else:
1209                prompt = "This is a searchable index. Enter search keywords: "
1210            self.processCharacters(
1211                {"type": tokenTypes["Characters"], "data": prompt})
1212            attributes = token["data"].copy()
1213            if "action" in attributes:
1214                del attributes["action"]
1215            if "prompt" in attributes:
1216                del attributes["prompt"]
1217            attributes["name"] = "isindex"
1218            self.processStartTag(impliedTagToken("input", "StartTag",
1219                                                 attributes=attributes,
1220                                                 selfClosing=token["selfClosing"]))
1221            self.processEndTag(impliedTagToken("label"))
1222            self.processStartTag(impliedTagToken("hr", "StartTag"))
1223            self.processEndTag(impliedTagToken("form"))
1224
1225        def startTagTextarea(self, token):
1226            self.tree.insertElement(token)
1227            self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
1228            self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1229            self.parser.framesetOK = False
1230
1231        def startTagIFrame(self, token):
1232            self.parser.framesetOK = False
1233            self.startTagRawtext(token)
1234
1235        def startTagRawtext(self, token):
1236            """iframe, noembed noframes, noscript(if scripting enabled)"""
1237            self.parser.parseRCDataRawtext(token, "RAWTEXT")
1238
1239        def startTagOpt(self, token):
1240            if self.tree.openElements[-1].name == "option":
1241                self.parser.phase.processEndTag(impliedTagToken("option"))
1242            self.tree.reconstructActiveFormattingElements()
1243            self.parser.tree.insertElement(token)
1244
1245        def startTagSelect(self, token):
1246            self.tree.reconstructActiveFormattingElements()
1247            self.tree.insertElement(token)
1248            self.parser.framesetOK = False
1249            if self.parser.phase in (self.parser.phases["inTable"],
1250                                     self.parser.phases["inCaption"],
1251                                     self.parser.phases["inColumnGroup"],
1252                                     self.parser.phases["inTableBody"],
1253                                     self.parser.phases["inRow"],
1254                                     self.parser.phases["inCell"]):
1255                self.parser.phase = self.parser.phases["inSelectInTable"]
1256            else:
1257                self.parser.phase = self.parser.phases["inSelect"]
1258
1259        def startTagRpRt(self, token):
1260            if self.tree.elementInScope("ruby"):
1261                self.tree.generateImpliedEndTags()
1262                if self.tree.openElements[-1].name != "ruby":
1263                    self.parser.parseError()
1264            self.tree.insertElement(token)
1265
1266        def startTagMath(self, token):
1267            self.tree.reconstructActiveFormattingElements()
1268            self.parser.adjustMathMLAttributes(token)
1269            self.parser.adjustForeignAttributes(token)
1270            token["namespace"] = namespaces["mathml"]
1271            self.tree.insertElement(token)
1272            # Need to get the parse error right for the case where the token
1273            # has a namespace not equal to the xmlns attribute
1274            if token["selfClosing"]:
1275                self.tree.openElements.pop()
1276                token["selfClosingAcknowledged"] = True
1277
1278        def startTagSvg(self, token):
1279            self.tree.reconstructActiveFormattingElements()
1280            self.parser.adjustSVGAttributes(token)
1281            self.parser.adjustForeignAttributes(token)
1282            token["namespace"] = namespaces["svg"]
1283            self.tree.insertElement(token)
1284            # Need to get the parse error right for the case where the token
1285            # has a namespace not equal to the xmlns attribute
1286            if token["selfClosing"]:
1287                self.tree.openElements.pop()
1288                token["selfClosingAcknowledged"] = True
1289
1290        def startTagMisplaced(self, token):
1291            """ Elements that should be children of other elements that have a
1292            different insertion mode; here they are ignored
1293            "caption", "col", "colgroup", "frame", "frameset", "head",
1294            "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
1295            "tr", "noscript"
1296            """
1297            self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
1298
1299        def startTagOther(self, token):
1300            self.tree.reconstructActiveFormattingElements()
1301            self.tree.insertElement(token)
1302
1303        def endTagP(self, token):
1304            if not self.tree.elementInScope("p", variant="button"):
1305                self.startTagCloseP(impliedTagToken("p", "StartTag"))
1306                self.parser.parseError("unexpected-end-tag", {"name": "p"})
1307                self.endTagP(impliedTagToken("p", "EndTag"))
1308            else:
1309                self.tree.generateImpliedEndTags("p")
1310                if self.tree.openElements[-1].name != "p":
1311                    self.parser.parseError("unexpected-end-tag", {"name": "p"})
1312                node = self.tree.openElements.pop()
1313                while node.name != "p":
1314                    node = self.tree.openElements.pop()
1315
1316        def endTagBody(self, token):
1317            if not self.tree.elementInScope("body"):
1318                self.parser.parseError()
1319                return
1320            elif self.tree.openElements[-1].name != "body":
1321                for node in self.tree.openElements[2:]:
1322                    if node.name not in frozenset(("dd", "dt", "li", "optgroup",
1323                                                   "option", "p", "rp", "rt",
1324                                                   "tbody", "td", "tfoot",
1325                                                   "th", "thead", "tr", "body",
1326                                                   "html")):
1327                        # Not sure this is the correct name for the parse error
1328                        self.parser.parseError(
1329                            "expected-one-end-tag-but-got-another",
1330                            {"expectedName": "body", "gotName": node.name})
1331                        break
1332            self.parser.phase = self.parser.phases["afterBody"]
1333
1334        def endTagHtml(self, token):
1335            # We repeat the test for the body end tag token being ignored here
1336            if self.tree.elementInScope("body"):
1337                self.endTagBody(impliedTagToken("body"))
1338                return token
1339
1340        def endTagBlock(self, token):
1341            # Put us back in the right whitespace handling mode
1342            if token["name"] == "pre":
1343                self.processSpaceCharacters = self.processSpaceCharactersNonPre
1344            inScope = self.tree.elementInScope(token["name"])
1345            if inScope:
1346                self.tree.generateImpliedEndTags()
1347            if self.tree.openElements[-1].name != token["name"]:
1348                self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1349            if inScope:
1350                node = self.tree.openElements.pop()
1351                while node.name != token["name"]:
1352                    node = self.tree.openElements.pop()
1353
1354        def endTagForm(self, token):
1355            node = self.tree.formPointer
1356            self.tree.formPointer = None
1357            if node is None or not self.tree.elementInScope(node):
1358                self.parser.parseError("unexpected-end-tag",
1359                                       {"name": "form"})
1360            else:
1361                self.tree.generateImpliedEndTags()
1362                if self.tree.openElements[-1] != node:
1363                    self.parser.parseError("end-tag-too-early-ignored",
1364                                           {"name": "form"})
1365                self.tree.openElements.remove(node)
1366
1367        def endTagListItem(self, token):
1368            if token["name"] == "li":
1369                variant = "list"
1370            else:
1371                variant = None
1372            if not self.tree.elementInScope(token["name"], variant=variant):
1373                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1374            else:
1375                self.tree.generateImpliedEndTags(exclude=token["name"])
1376                if self.tree.openElements[-1].name != token["name"]:
1377                    self.parser.parseError(
1378                        "end-tag-too-early",
1379                        {"name": token["name"]})
1380                node = self.tree.openElements.pop()
1381                while node.name != token["name"]:
1382                    node = self.tree.openElements.pop()
1383
1384        def endTagHeading(self, token):
1385            for item in headingElements:
1386                if self.tree.elementInScope(item):
1387                    self.tree.generateImpliedEndTags()
1388                    break
1389            if self.tree.openElements[-1].name != token["name"]:
1390                self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1391
1392            for item in headingElements:
1393                if self.tree.elementInScope(item):
1394                    item = self.tree.openElements.pop()
1395                    while item.name not in headingElements:
1396                        item = self.tree.openElements.pop()
1397                    break
1398
        def endTagFormatting(self, token):
            """Handle the end tag of a formatting element (the much-feared
            adoption agency algorithm).

            The element named by ``token`` is looked up in the list of
            active formatting elements.  Depending on where that element
            sits in the stack of open elements, the method either delegates
            to ``endTagOther``, reports a parse error and stops, simply pops
            the stack, or re-parents the mis-nested content underneath
            clones of the formatting elements involved.

            The outer loop runs at most 8 times and the inner loop at most
            3 times per outer iteration.  Nothing is returned.
            """
            # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
            # XXX Better parseError messages appreciated.

            # Step 1
            outerLoopCounter = 0

            # Step 2
            while outerLoopCounter < 8:

                # Step 3
                outerLoopCounter += 1

                # Step 4:

                # Let the formatting element be the last element in
                # the list of active formatting elements that:
                # - is between the end of the list and the last scope
                # marker in the list, if any, or the start of the list
                # otherwise, and
                # - has the same tag name as the token.
                formattingElement = self.tree.elementInActiveFormattingElements(
                    token["name"])
                if (not formattingElement or
                    (formattingElement in self.tree.openElements and
                     not self.tree.elementInScope(formattingElement.name))):
                    # If there is no such node, then abort these steps
                    # and instead act as described in the "any other
                    # end tag" entry below.
                    # (The same path is taken when the node is on the stack
                    # of open elements but not in scope.)
                    self.endTagOther(token)
                    return

                # Otherwise, if there is such a node, but that node is
                # not in the stack of open elements, then this is a
                # parse error; remove the element from the list, and
                # abort these steps.
                elif formattingElement not in self.tree.openElements:
                    self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
                    self.tree.activeFormattingElements.remove(formattingElement)
                    return

                # Otherwise, if there is such a node, and that node is
                # also in the stack of open elements, but the element
                # is not in scope, then this is a parse error; ignore
                # the token, and abort these steps.
                # NOTE(review): the first condition above already catches
                # "in the stack but not in scope", so this branch looks
                # unreachable unless elementInScope changes its answer
                # between the two calls -- confirm before relying on it.
                elif not self.tree.elementInScope(formattingElement.name):
                    self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
                    return

                # Otherwise, there is a formatting element and that
                # element is in the stack and is in scope. If the
                # element is not the current node, this is a parse
                # error. In any case, proceed with the algorithm as
                # written in the following steps.
                else:
                    if formattingElement != self.tree.openElements[-1]:
                        self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

                # Step 5:

                # Let the furthest block be the topmost node in the
                # stack of open elements that is lower in the stack
                # than the formatting element, and is an element in
                # the special category. There might not be one.
                # The scan starts at the formatting element itself and
                # stops at the first element found in specialElements.
                afeIndex = self.tree.openElements.index(formattingElement)
                furthestBlock = None
                for element in self.tree.openElements[afeIndex:]:
                    if element.nameTuple in specialElements:
                        furthestBlock = element
                        break

                # Step 6:

                # If there is no furthest block, then the UA must
                # first pop all the nodes from the bottom of the stack
                # of open elements, from the current node up to and
                # including the formatting element, then remove the
                # formatting element from the list of active
                # formatting elements, and finally abort these steps.
                if furthestBlock is None:
                    element = self.tree.openElements.pop()
                    while element != formattingElement:
                        element = self.tree.openElements.pop()
                    self.tree.activeFormattingElements.remove(element)
                    return

                # Step 7
                # The common ancestor is the element immediately before the
                # formatting element in the stack of open elements.
                commonAncestor = self.tree.openElements[afeIndex - 1]

                # Step 8:
                # The bookmark is supposed to help us identify where to reinsert
                # nodes in step 15. We have to ensure that we reinsert nodes after
                # the node before the active formatting element. Note the bookmark
                # can move in step 9.7
                bookmark = self.tree.activeFormattingElements.index(formattingElement)

                # Step 9
                lastNode = node = furthestBlock
                innerLoopCounter = 0

                # Walk the stack from the furthest block towards the
                # formatting element, at most three elements per pass.
                index = self.tree.openElements.index(node)
                while innerLoopCounter < 3:
                    innerLoopCounter += 1
                    # Node is element before node in open elements
                    index -= 1
                    node = self.tree.openElements[index]
                    if node not in self.tree.activeFormattingElements:
                        # Not a formatting element: drop it from the stack.
                        # The following entries shift down by one, so the
                        # next "index -= 1" lands on the element that used
                        # to precede the removed one.
                        self.tree.openElements.remove(node)
                        continue
                    # Step 9.6
                    if node == formattingElement:
                        break
                    # Step 9.7
                    if lastNode == furthestBlock:
                        bookmark = self.tree.activeFormattingElements.index(node) + 1
                    # Step 9.8
                    clone = node.cloneNode()
                    # Replace node with clone
                    self.tree.activeFormattingElements[
                        self.tree.activeFormattingElements.index(node)] = clone
                    self.tree.openElements[
                        self.tree.openElements.index(node)] = clone
                    node = clone
                    # Step 9.9
                    # Remove lastNode from its parents, if any
                    if lastNode.parent:
                        lastNode.parent.removeChild(lastNode)
                    node.appendChild(lastNode)
                    # Step 9.10
                    lastNode = node

                # Step 10
                # Foster parent lastNode if commonAncestor is a
                # table, tbody, tfoot, thead, or tr we need to foster
                # parent the lastNode
                if lastNode.parent:
                    lastNode.parent.removeChild(lastNode)

                if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
                    parent, insertBefore = self.tree.getTableMisnestedNodePosition()
                    parent.insertBefore(lastNode, insertBefore)
                else:
                    commonAncestor.appendChild(lastNode)

                # Step 11
                clone = formattingElement.cloneNode()

                # Step 12
                # Move the furthest block's children under the new clone
                furthestBlock.reparentChildren(clone)

                # Step 13
                furthestBlock.appendChild(clone)

                # Step 14
                # Swap the formatting element for its clone in the list of
                # active formatting elements, at the bookmarked position.
                self.tree.activeFormattingElements.remove(formattingElement)
                self.tree.activeFormattingElements.insert(bookmark, clone)

                # Step 15
                # On the stack of open elements the clone goes directly
                # after the furthest block.
                self.tree.openElements.remove(formattingElement)
                self.tree.openElements.insert(
                    self.tree.openElements.index(furthestBlock) + 1, clone)
1561
1562        def endTagAppletMarqueeObject(self, token):
1563            if self.tree.elementInScope(token["name"]):
1564                self.tree.generateImpliedEndTags()
1565            if self.tree.openElements[-1].name != token["name"]:
1566                self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1567
1568            if self.tree.elementInScope(token["name"]):
1569                element = self.tree.openElements.pop()
1570                while element.name != token["name"]:
1571                    element = self.tree.openElements.pop()
1572                self.tree.clearActiveFormattingElements()
1573
1574        def endTagBr(self, token):
1575            self.parser.parseError("unexpected-end-tag-treated-as",
1576                                   {"originalName": "br", "newName": "br element"})
1577            self.tree.reconstructActiveFormattingElements()
1578            self.tree.insertElement(impliedTagToken("br", "StartTag"))
1579            self.tree.openElements.pop()
1580
1581        def endTagOther(self, token):
1582            for node in self.tree.openElements[::-1]:
1583                if node.name == token["name"]:
1584                    self.tree.generateImpliedEndTags(exclude=token["name"])
1585                    if self.tree.openElements[-1].name != token["name"]:
1586                        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1587                    while self.tree.openElements.pop() != node:
1588                        pass
1589                    break
1590                else:
1591                    if node.nameTuple in specialElements:
1592                        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1593                        break
1594
1595    class TextPhase(Phase):
1596        def __init__(self, parser, tree):
1597            Phase.__init__(self, parser, tree)
1598            self.startTagHandler = utils.MethodDispatcher([])
1599            self.startTagHandler.default = self.startTagOther
1600            self.endTagHandler = utils.MethodDispatcher([
1601                ("script", self.endTagScript)])
1602            self.endTagHandler.default = self.endTagOther
1603
1604        def processCharacters(self, token):
1605            self.tree.insertText(token["data"])
1606
1607        def processEOF(self):
1608            self.parser.parseError("expected-named-closing-tag-but-got-eof",
1609                                   {"name": self.tree.openElements[-1].name})
1610            self.tree.openElements.pop()
1611            self.parser.phase = self.parser.originalPhase
1612            return True
1613
1614        def startTagOther(self, token):
1615            assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
1616
1617        def endTagScript(self, token):
1618            node = self.tree.openElements.pop()
1619            assert node.name == "script"
1620            self.parser.phase = self.parser.originalPhase
1621            # The rest of this method is all stuff that only happens if
1622            # document.write works
1623
1624        def endTagOther(self, token):
1625            self.tree.openElements.pop()
1626            self.parser.phase = self.parser.originalPhase
1627
1628    class InTablePhase(Phase):
1629        # http://www.whatwg.org/specs/web-apps/current-work/#in-table
1630        def __init__(self, parser, tree):
1631            Phase.__init__(self, parser, tree)
1632            self.startTagHandler = utils.MethodDispatcher([
1633                ("html", self.startTagHtml),
1634                ("caption", self.startTagCaption),
1635                ("colgroup", self.startTagColgroup),
1636                ("col", self.startTagCol),
1637                (("tbody", "tfoot", "thead"), self.startTagRowGroup),
1638                (("td", "th", "tr"), self.startTagImplyTbody),
1639                ("table", self.startTagTable),
1640                (("style", "script"), self.startTagStyleScript),
1641                ("input", self.startTagInput),
1642                ("form", self.startTagForm)
1643            ])
1644            self.startTagHandler.default = self.startTagOther
1645
1646            self.endTagHandler = utils.MethodDispatcher([
1647                ("table", self.endTagTable),
1648                (("body", "caption", "col", "colgroup", "html", "tbody", "td",
1649                  "tfoot", "th", "thead", "tr"), self.endTagIgnore)
1650            ])
1651            self.endTagHandler.default = self.endTagOther
1652
1653        # helper methods
1654        def clearStackToTableContext(self):
1655            # "clear the stack back to a table context"
1656            while self.tree.openElements[-1].name not in ("table", "html"):
1657                # self.parser.parseError("unexpected-implied-end-tag-in-table",
1658                #  {"name":  self.tree.openElements[-1].name})
1659                self.tree.openElements.pop()
1660            # When the current node is <html> it's an innerHTML case
1661
1662        # processing methods
1663        def processEOF(self):
1664            if self.tree.openElements[-1].name != "html":
1665                self.parser.parseError("eof-in-table")
1666            else:
1667                assert self.parser.innerHTML
1668            # Stop parsing
1669
1670        def processSpaceCharacters(self, token):
1671            originalPhase = self.parser.phase
1672            self.parser.phase = self.parser.phases["inTableText"]
1673            self.parser.phase.originalPhase = originalPhase
1674            self.parser.phase.processSpaceCharacters(token)
1675
1676        def processCharacters(self, token):
1677            originalPhase = self.parser.phase
1678            self.parser.phase = self.parser.phases["inTableText"]
1679            self.parser.phase.originalPhase = originalPhase
1680            self.parser.phase.processCharacters(token)
1681
1682        def insertText(self, token):
1683            # If we get here there must be at least one non-whitespace character
1684            # Do the table magic!
1685            self.tree.insertFromTable = True
1686            self.parser.phases["inBody"].processCharacters(token)
1687            self.tree.insertFromTable = False
1688
1689        def startTagCaption(self, token):
1690            self.clearStackToTableContext()
1691            self.tree.activeFormattingElements.append(Marker)
1692            self.tree.insertElement(token)
1693            self.parser.phase = self.parser.phases["inCaption"]
1694
1695        def startTagColgroup(self, token):
1696            self.clearStackToTableContext()
1697            self.tree.insertElement(token)
1698            self.parser.phase = self.parser.phases["inColumnGroup"]
1699
1700        def startTagCol(self, token):
1701            self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
1702            return token
1703
1704        def startTagRowGroup(self, token):
1705            self.clearStackToTableContext()
1706            self.tree.insertElement(token)
1707            self.parser.phase = self.parser.phases["inTableBody"]
1708
1709        def startTagImplyTbody(self, token):
1710            self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
1711            return token
1712
1713        def startTagTable(self, token):
1714            self.parser.parseError("unexpected-start-tag-implies-end-tag",
1715                                   {"startName": "table", "endName": "table"})
1716            self.parser.phase.processEndTag(impliedTagToken("table"))
1717            if not self.parser.innerHTML:
1718                return token
1719
1720        def startTagStyleScript(self, token):
1721            return self.parser.phases["inHead"].processStartTag(token)
1722
1723        def startTagInput(self, token):
1724            if ("type" in token["data"] and
1725                    token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1726                self.parser.parseError("unexpected-hidden-input-in-table")
1727                self.tree.insertElement(token)
1728                # XXX associate with form
1729                self.tree.openElements.pop()
1730            else:
1731                self.startTagOther(token)
1732
1733        def startTagForm(self, token):
1734            self.parser.parseError("unexpected-form-in-table")
1735            if self.tree.formPointer is None:
1736                self.tree.insertElement(token)
1737                self.tree.formPointer = self.tree.openElements[-1]
1738                self.tree.openElements.pop()
1739
1740        def startTagOther(self, token):
1741            self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
1742            # Do the table magic!
1743            self.tree.insertFromTable = True
1744            self.parser.phases["inBody"].processStartTag(token)
1745            self.tree.insertFromTable = False
1746
1747        def endTagTable(self, token):
1748            if self.tree.elementInScope("table", variant="table"):
1749                self.tree.generateImpliedEndTags()
1750                if self.tree.openElements[-1].name != "table":
1751                    self.parser.parseError("end-tag-too-early-named",
1752                                           {"gotName": "table",
1753                                            "expectedName": self.tree.openElements[-1].name})
1754                while self.tree.openElements[-1].name != "table":
1755                    self.tree.openElements.pop()
1756                self.tree.openElements.pop()
1757                self.parser.resetInsertionMode()
1758            else:
1759                # innerHTML case
1760                assert self.parser.innerHTML
1761                self.parser.parseError()
1762
1763        def endTagIgnore(self, token):
1764            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1765
1766        def endTagOther(self, token):
1767            self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
1768            # Do the table magic!
1769            self.tree.insertFromTable = True
1770            self.parser.phases["inBody"].processEndTag(token)
1771            self.tree.insertFromTable = False
1772
1773    class InTableTextPhase(Phase):
1774        def __init__(self, parser, tree):
1775            Phase.__init__(self, parser, tree)
1776            self.originalPhase = None
1777            self.characterTokens = []
1778
1779        def flushCharacters(self):
1780            data = "".join([item["data"] for item in self.characterTokens])
1781            if any([item not in spaceCharacters for item in data]):
1782                token = {"type": tokenTypes["Characters"], "data": data}
1783                self.parser.phases["inTable"].insertText(token)
1784            elif data:
1785                self.tree.insertText(data)
1786            self.characterTokens = []
1787
1788        def processComment(self, token):
1789            self.flushCharacters()
1790            self.parser.phase = self.originalPhase
1791            return token
1792
1793        def processEOF(self):
1794            self.flushCharacters()
1795            self.parser.phase = self.originalPhase
1796            return True
1797
1798        def processCharacters(self, token):
1799            if token["data"] == "\u0000":
1800                return
1801            self.characterTokens.append(token)
1802
1803        def processSpaceCharacters(self, token):
1804            # pretty sure we should never reach here
1805            self.characterTokens.append(token)
1806    #        assert False
1807
1808        def processStartTag(self, token):
1809            self.flushCharacters()
1810            self.parser.phase = self.originalPhase
1811            return token
1812
1813        def processEndTag(self, token):
1814            self.flushCharacters()
1815            self.parser.phase = self.originalPhase
1816            return token
1817
1818    class InCaptionPhase(Phase):
1819        # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
1820        def __init__(self, parser, tree):
1821            Phase.__init__(self, parser, tree)
1822
1823            self.startTagHandler = utils.MethodDispatcher([
1824                ("html", self.startTagHtml),
1825                (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
1826                  "thead", "tr"), self.startTagTableElement)
1827            ])
1828            self.startTagHandler.default = self.startTagOther
1829
1830            self.endTagHandler = utils.MethodDispatcher([
1831                ("caption", self.endTagCaption),
1832                ("table", self.endTagTable),
1833                (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
1834                  "thead", "tr"), self.endTagIgnore)
1835            ])
1836            self.endTagHandler.default = self.endTagOther
1837
1838        def ignoreEndTagCaption(self):
1839            return not self.tree.elementInScope("caption", variant="table")
1840
1841        def processEOF(self):
1842            self.parser.phases["inBody"].processEOF()
1843
1844        def processCharacters(self, token):
1845            return self.parser.phases["inBody"].processCharacters(token)
1846
1847        def startTagTableElement(self, token):
1848            self.parser.parseError()
1849            # XXX Have to duplicate logic here to find out if the tag is ignored
1850            ignoreEndTag = self.ignoreEndTagCaption()
1851            self.parser.phase.processEndTag(impliedTagToken("caption"))
1852            if not ignoreEndTag:
1853                return token
1854
1855        def startTagOther(self, token):
1856            return self.parser.phases["inBody"].processStartTag(token)
1857
1858        def endTagCaption(self, token):
1859            if not self.ignoreEndTagCaption():
1860                # AT this code is quite similar to endTagTable in "InTable"
1861                self.tree.generateImpliedEndTags()
1862                if self.tree.openElements[-1].name != "caption":
1863                    self.parser.parseError("expected-one-end-tag-but-got-another",
1864                                           {"gotName": "caption",
1865                                            "expectedName": self.tree.openElements[-1].name})
1866                while self.tree.openElements[-1].name != "caption":
1867                    self.tree.openElements.pop()
1868                self.tree.openElements.pop()
1869                self.tree.clearActiveFormattingElements()
1870                self.parser.phase = self.parser.phases["inTable"]
1871            else:
1872                # innerHTML case
1873                assert self.parser.innerHTML
1874                self.parser.parseError()
1875
1876        def endTagTable(self, token):
1877            self.parser.parseError()
1878            ignoreEndTag = self.ignoreEndTagCaption()
1879            self.parser.phase.processEndTag(impliedTagToken("caption"))
1880            if not ignoreEndTag:
1881                return token
1882
1883        def endTagIgnore(self, token):
1884            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1885
1886        def endTagOther(self, token):
1887            return self.parser.phases["inBody"].processEndTag(token)
1888
1889    class InColumnGroupPhase(Phase):
1890        # http://www.whatwg.org/specs/web-apps/current-work/#in-column
1891
1892        def __init__(self, parser, tree):
1893            Phase.__init__(self, parser, tree)
1894
1895            self.startTagHandler = utils.MethodDispatcher([
1896                ("html", self.startTagHtml),
1897                ("col", self.startTagCol)
1898            ])
1899            self.startTagHandler.default = self.startTagOther
1900
1901            self.endTagHandler = utils.MethodDispatcher([
1902                ("colgroup", self.endTagColgroup),
1903                ("col", self.endTagCol)
1904            ])
1905            self.endTagHandler.default = self.endTagOther
1906
1907        def ignoreEndTagColgroup(self):
1908            return self.tree.openElements[-1].name == "html"
1909
1910        def processEOF(self):
1911            if self.tree.openElements[-1].name == "html":
1912                assert self.parser.innerHTML
1913                return
1914            else:
1915                ignoreEndTag = self.ignoreEndTagColgroup()
1916                self.endTagColgroup(impliedTagToken("colgroup"))
1917                if not ignoreEndTag:
1918                    return True
1919
1920        def processCharacters(self, token):
1921            ignoreEndTag = self.ignoreEndTagColgroup()
1922            self.endTagColgroup(impliedTagToken("colgroup"))
1923            if not ignoreEndTag:
1924                return token
1925
1926        def startTagCol(self, token):
1927            self.tree.insertElement(token)
1928            self.tree.openElements.pop()
1929
1930        def startTagOther(self, token):
1931            ignoreEndTag = self.ignoreEndTagColgroup()
1932            self.endTagColgroup(impliedTagToken("colgroup"))
1933            if not ignoreEndTag:
1934                return token
1935
1936        def endTagColgroup(self, token):
1937            if self.ignoreEndTagColgroup():
1938                # innerHTML case
1939                assert self.parser.innerHTML
1940                self.parser.parseError()
1941            else:
1942                self.tree.openElements.pop()
1943                self.parser.phase = self.parser.phases["inTable"]
1944
1945        def endTagCol(self, token):
1946            self.parser.parseError("no-end-tag", {"name": "col"})
1947
1948        def endTagOther(self, token):
1949            ignoreEndTag = self.ignoreEndTagColgroup()
1950            self.endTagColgroup(impliedTagToken("colgroup"))
1951            if not ignoreEndTag:
1952                return token
1953
1954    class InTableBodyPhase(Phase):
1955        # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
1956        def __init__(self, parser, tree):
1957            Phase.__init__(self, parser, tree)
1958            self.startTagHandler = utils.MethodDispatcher([
1959                ("html", self.startTagHtml),
1960                ("tr", self.startTagTr),
1961                (("td", "th"), self.startTagTableCell),
1962                (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
1963                 self.startTagTableOther)
1964            ])
1965            self.startTagHandler.default = self.startTagOther
1966
1967            self.endTagHandler = utils.MethodDispatcher([
1968                (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
1969                ("table", self.endTagTable),
1970                (("body", "caption", "col", "colgroup", "html", "td", "th",
1971                  "tr"), self.endTagIgnore)
1972            ])
1973            self.endTagHandler.default = self.endTagOther
1974
1975        # helper methods
1976        def clearStackToTableBodyContext(self):
1977            while self.tree.openElements[-1].name not in ("tbody", "tfoot",
1978                                                          "thead", "html"):
1979                # self.parser.parseError("unexpected-implied-end-tag-in-table",
1980                #  {"name": self.tree.openElements[-1].name})
1981                self.tree.openElements.pop()
1982            if self.tree.openElements[-1].name == "html":
1983                assert self.parser.innerHTML
1984
1985        # the rest
1986        def processEOF(self):
1987            self.parser.phases["inTable"].processEOF()
1988
1989        def processSpaceCharacters(self, token):
1990            return self.parser.phases["inTable"].processSpaceCharacters(token)
1991
1992        def processCharacters(self, token):
1993            return self.parser.phases["inTable"].processCharacters(token)
1994
1995        def startTagTr(self, token):
1996            self.clearStackToTableBodyContext()
1997            self.tree.insertElement(token)
1998            self.parser.phase = self.parser.phases["inRow"]
1999
2000        def startTagTableCell(self, token):
2001            self.parser.parseError("unexpected-cell-in-table-body",
2002                                   {"name": token["name"]})
2003            self.startTagTr(impliedTagToken("tr", "StartTag"))
2004            return token
2005
2006        def startTagTableOther(self, token):
2007            # XXX AT Any ideas on how to share this with endTagTable?
2008            if (self.tree.elementInScope("tbody", variant="table") or
2009                self.tree.elementInScope("thead", variant="table") or
2010                    self.tree.elementInScope("tfoot", variant="table")):
2011                self.clearStackToTableBodyContext()
2012                self.endTagTableRowGroup(
2013                    impliedTagToken(self.tree.openElements[-1].name))
2014                return token
2015            else:
2016                # innerHTML case
2017                assert self.parser.innerHTML
2018                self.parser.parseError()
2019
2020        def startTagOther(self, token):
2021            return self.parser.phases["inTable"].processStartTag(token)
2022
2023        def endTagTableRowGroup(self, token):
2024            if self.tree.elementInScope(token["name"], variant="table"):
2025                self.clearStackToTableBodyContext()
2026                self.tree.openElements.pop()
2027                self.parser.phase = self.parser.phases["inTable"]
2028            else:
2029                self.parser.parseError("unexpected-end-tag-in-table-body",
2030                                       {"name": token["name"]})
2031
2032        def endTagTable(self, token):
2033            if (self.tree.elementInScope("tbody", variant="table") or
2034                self.tree.elementInScope("thead", variant="table") or
2035                    self.tree.elementInScope("tfoot", variant="table")):
2036                self.clearStackToTableBodyContext()
2037                self.endTagTableRowGroup(
2038                    impliedTagToken(self.tree.openElements[-1].name))
2039                return token
2040            else:
2041                # innerHTML case
2042                assert self.parser.innerHTML
2043                self.parser.parseError()
2044
2045        def endTagIgnore(self, token):
2046            self.parser.parseError("unexpected-end-tag-in-table-body",
2047                                   {"name": token["name"]})
2048
2049        def endTagOther(self, token):
2050            return self.parser.phases["inTable"].processEndTag(token)
2051
2052    class InRowPhase(Phase):
2053        # http://www.whatwg.org/specs/web-apps/current-work/#in-row
2054        def __init__(self, parser, tree):
2055            Phase.__init__(self, parser, tree)
2056            self.startTagHandler = utils.MethodDispatcher([
2057                ("html", self.startTagHtml),
2058                (("td", "th"), self.startTagTableCell),
2059                (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
2060                  "tr"), self.startTagTableOther)
2061            ])
2062            self.startTagHandler.default = self.startTagOther
2063
2064            self.endTagHandler = utils.MethodDispatcher([
2065                ("tr", self.endTagTr),
2066                ("table", self.endTagTable),
2067                (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
2068                (("body", "caption", "col", "colgroup", "html", "td", "th"),
2069                 self.endTagIgnore)
2070            ])
2071            self.endTagHandler.default = self.endTagOther
2072
2073        # helper methods (XXX unify this with other table helper methods)
2074        def clearStackToTableRowContext(self):
2075            while self.tree.openElements[-1].name not in ("tr", "html"):
2076                self.parser.parseError("unexpected-implied-end-tag-in-table-row",
2077                                       {"name": self.tree.openElements[-1].name})
2078                self.tree.openElements.pop()
2079
2080        def ignoreEndTagTr(self):
2081            return not self.tree.elementInScope("tr", variant="table")
2082
2083        # the rest
2084        def processEOF(self):
2085            self.parser.phases["inTable"].processEOF()
2086
2087        def processSpaceCharacters(self, token):
2088            return self.parser.phases["inTable"].processSpaceCharacters(token)
2089
2090        def processCharacters(self, token):
2091            return self.parser.phases["inTable"].processCharacters(token)
2092
2093        def startTagTableCell(self, token):
2094            self.clearStackToTableRowContext()
2095            self.tree.insertElement(token)
2096            self.parser.phase = self.parser.phases["inCell"]
2097            self.tree.activeFormattingElements.append(Marker)
2098
2099        def startTagTableOther(self, token):
2100            ignoreEndTag = self.ignoreEndTagTr()
2101            self.endTagTr(impliedTagToken("tr"))
2102            # XXX how are we sure it's always ignored in the innerHTML case?
2103            if not ignoreEndTag:
2104                return token
2105
2106        def startTagOther(self, token):
2107            return self.parser.phases["inTable"].processStartTag(token)
2108
2109        def endTagTr(self, token):
2110            if not self.ignoreEndTagTr():
2111                self.clearStackToTableRowContext()
2112                self.tree.openElements.pop()
2113                self.parser.phase = self.parser.phases["inTableBody"]
2114            else:
2115                # innerHTML case
2116                assert self.parser.innerHTML
2117                self.parser.parseError()
2118
2119        def endTagTable(self, token):
2120            ignoreEndTag = self.ignoreEndTagTr()
2121            self.endTagTr(impliedTagToken("tr"))
2122            # Reprocess the current tag if the tr end tag was not ignored
2123            # XXX how are we sure it's always ignored in the innerHTML case?
2124            if not ignoreEndTag:
2125                return token
2126
2127        def endTagTableRowGroup(self, token):
2128            if self.tree.elementInScope(token["name"], variant="table"):
2129                self.endTagTr(impliedTagToken("tr"))
2130                return token
2131            else:
2132                self.parser.parseError()
2133
2134        def endTagIgnore(self, token):
2135            self.parser.parseError("unexpected-end-tag-in-table-row",
2136                                   {"name": token["name"]})
2137
2138        def endTagOther(self, token):
2139            return self.parser.phases["inTable"].processEndTag(token)
2140
2141    class InCellPhase(Phase):
2142        # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
2143        def __init__(self, parser, tree):
2144            Phase.__init__(self, parser, tree)
2145            self.startTagHandler = utils.MethodDispatcher([
2146                ("html", self.startTagHtml),
2147                (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
2148                  "thead", "tr"), self.startTagTableOther)
2149            ])
2150            self.startTagHandler.default = self.startTagOther
2151
2152            self.endTagHandler = utils.MethodDispatcher([
2153                (("td", "th"), self.endTagTableCell),
2154                (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
2155                (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
2156            ])
2157            self.endTagHandler.default = self.endTagOther
2158
2159        # helper
2160        def closeCell(self):
2161            if self.tree.elementInScope("td", variant="table"):
2162                self.endTagTableCell(impliedTagToken("td"))
2163            elif self.tree.elementInScope("th", variant="table"):
2164                self.endTagTableCell(impliedTagToken("th"))
2165
2166        # the rest
2167        def processEOF(self):
2168            self.parser.phases["inBody"].processEOF()
2169
2170        def processCharacters(self, token):
2171            return self.parser.phases["inBody"].processCharacters(token)
2172
2173        def startTagTableOther(self, token):
2174            if (self.tree.elementInScope("td", variant="table") or
2175                    self.tree.elementInScope("th", variant="table")):
2176                self.closeCell()
2177                return token
2178            else:
2179                # innerHTML case
2180                assert self.parser.innerHTML
2181                self.parser.parseError()
2182
2183        def startTagOther(self, token):
2184            return self.parser.phases["inBody"].processStartTag(token)
2185
2186        def endTagTableCell(self, token):
2187            if self.tree.elementInScope(token["name"], variant="table"):
2188                self.tree.generateImpliedEndTags(token["name"])
2189                if self.tree.openElements[-1].name != token["name"]:
2190                    self.parser.parseError("unexpected-cell-end-tag",
2191                                           {"name": token["name"]})
2192                    while True:
2193                        node = self.tree.openElements.pop()
2194                        if node.name == token["name"]:
2195                            break
2196                else:
2197                    self.tree.openElements.pop()
2198                self.tree.clearActiveFormattingElements()
2199                self.parser.phase = self.parser.phases["inRow"]
2200            else:
2201                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2202
2203        def endTagIgnore(self, token):
2204            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2205
2206        def endTagImply(self, token):
2207            if self.tree.elementInScope(token["name"], variant="table"):
2208                self.closeCell()
2209                return token
2210            else:
2211                # sometimes innerHTML case
2212                self.parser.parseError()
2213
2214        def endTagOther(self, token):
2215            return self.parser.phases["inBody"].processEndTag(token)
2216
2217    class InSelectPhase(Phase):
2218        def __init__(self, parser, tree):
2219            Phase.__init__(self, parser, tree)
2220
2221            self.startTagHandler = utils.MethodDispatcher([
2222                ("html", self.startTagHtml),
2223                ("option", self.startTagOption),
2224                ("optgroup", self.startTagOptgroup),
2225                ("select", self.startTagSelect),
2226                (("input", "keygen", "textarea"), self.startTagInput),
2227                ("script", self.startTagScript)
2228            ])
2229            self.startTagHandler.default = self.startTagOther
2230
2231            self.endTagHandler = utils.MethodDispatcher([
2232                ("option", self.endTagOption),
2233                ("optgroup", self.endTagOptgroup),
2234                ("select", self.endTagSelect)
2235            ])
2236            self.endTagHandler.default = self.endTagOther
2237
2238        # http://www.whatwg.org/specs/web-apps/current-work/#in-select
2239        def processEOF(self):
2240            if self.tree.openElements[-1].name != "html":
2241                self.parser.parseError("eof-in-select")
2242            else:
2243                assert self.parser.innerHTML
2244
2245        def processCharacters(self, token):
2246            if token["data"] == "\u0000":
2247                return
2248            self.tree.insertText(token["data"])
2249
2250        def startTagOption(self, token):
2251            # We need to imply </option> if <option> is the current node.
2252            if self.tree.openElements[-1].name == "option":
2253                self.tree.openElements.pop()
2254            self.tree.insertElement(token)
2255
2256        def startTagOptgroup(self, token):
2257            if self.tree.openElements[-1].name == "option":
2258                self.tree.openElements.pop()
2259            if self.tree.openElements[-1].name == "optgroup":
2260                self.tree.openElements.pop()
2261            self.tree.insertElement(token)
2262
2263        def startTagSelect(self, token):
2264            self.parser.parseError("unexpected-select-in-select")
2265            self.endTagSelect(impliedTagToken("select"))
2266
2267        def startTagInput(self, token):
2268            self.parser.parseError("unexpected-input-in-select")
2269            if self.tree.elementInScope("select", variant="select"):
2270                self.endTagSelect(impliedTagToken("select"))
2271                return token
2272            else:
2273                assert self.parser.innerHTML
2274
2275        def startTagScript(self, token):
2276            return self.parser.phases["inHead"].processStartTag(token)
2277
2278        def startTagOther(self, token):
2279            self.parser.parseError("unexpected-start-tag-in-select",
2280                                   {"name": token["name"]})
2281
2282        def endTagOption(self, token):
2283            if self.tree.openElements[-1].name == "option":
2284                self.tree.openElements.pop()
2285            else:
2286                self.parser.parseError("unexpected-end-tag-in-select",
2287                                       {"name": "option"})
2288
2289        def endTagOptgroup(self, token):
2290            # </optgroup> implicitly closes <option>
2291            if (self.tree.openElements[-1].name == "option" and
2292                    self.tree.openElements[-2].name == "optgroup"):
2293                self.tree.openElements.pop()
2294            # It also closes </optgroup>
2295            if self.tree.openElements[-1].name == "optgroup":
2296                self.tree.openElements.pop()
2297            # But nothing else
2298            else:
2299                self.parser.parseError("unexpected-end-tag-in-select",
2300                                       {"name": "optgroup"})
2301
2302        def endTagSelect(self, token):
2303            if self.tree.elementInScope("select", variant="select"):
2304                node = self.tree.openElements.pop()
2305                while node.name != "select":
2306                    node = self.tree.openElements.pop()
2307                self.parser.resetInsertionMode()
2308            else:
2309                # innerHTML case
2310                assert self.parser.innerHTML
2311                self.parser.parseError()
2312
2313        def endTagOther(self, token):
2314            self.parser.parseError("unexpected-end-tag-in-select",
2315                                   {"name": token["name"]})
2316
2317    class InSelectInTablePhase(Phase):
2318        def __init__(self, parser, tree):
2319            Phase.__init__(self, parser, tree)
2320
2321            self.startTagHandler = utils.MethodDispatcher([
2322                (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
2323                 self.startTagTable)
2324            ])
2325            self.startTagHandler.default = self.startTagOther
2326
2327            self.endTagHandler = utils.MethodDispatcher([
2328                (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
2329                 self.endTagTable)
2330            ])
2331            self.endTagHandler.default = self.endTagOther
2332
2333        def processEOF(self):
2334            self.parser.phases["inSelect"].processEOF()
2335
2336        def processCharacters(self, token):
2337            return self.parser.phases["inSelect"].processCharacters(token)
2338
2339        def startTagTable(self, token):
2340            self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
2341            self.endTagOther(impliedTagToken("select"))
2342            return token
2343
2344        def startTagOther(self, token):
2345            return self.parser.phases["inSelect"].processStartTag(token)
2346
2347        def endTagTable(self, token):
2348            self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
2349            if self.tree.elementInScope(token["name"], variant="table"):
2350                self.endTagOther(impliedTagToken("select"))
2351                return token
2352
2353        def endTagOther(self, token):
2354            return self.parser.phases["inSelect"].processEndTag(token)
2355
2356    class InForeignContentPhase(Phase):
2357        breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
2358                                      "center", "code", "dd", "div", "dl", "dt",
2359                                      "em", "embed", "h1", "h2", "h3",
2360                                      "h4", "h5", "h6", "head", "hr", "i", "img",
2361                                      "li", "listing", "menu", "meta", "nobr",
2362                                      "ol", "p", "pre", "ruby", "s", "small",
2363                                      "span", "strong", "strike", "sub", "sup",
2364                                      "table", "tt", "u", "ul", "var"])
2365
2366        def __init__(self, parser, tree):
2367            Phase.__init__(self, parser, tree)
2368
2369        def adjustSVGTagNames(self, token):
2370            replacements = {"altglyph": "altGlyph",
2371                            "altglyphdef": "altGlyphDef",
2372                            "altglyphitem": "altGlyphItem",
2373                            "animatecolor": "animateColor",
2374                            "animatemotion": "animateMotion",
2375                            "animatetransform": "animateTransform",
2376                            "clippath": "clipPath",
2377                            "feblend": "feBlend",
2378                            "fecolormatrix": "feColorMatrix",
2379                            "fecomponenttransfer": "feComponentTransfer",
2380                            "fecomposite": "feComposite",
2381                            "feconvolvematrix": "feConvolveMatrix",
2382                            "fediffuselighting": "feDiffuseLighting",
2383                            "fedisplacementmap": "feDisplacementMap",
2384                            "fedistantlight": "feDistantLight",
2385                            "feflood": "feFlood",
2386                            "fefunca": "feFuncA",
2387                            "fefuncb": "feFuncB",
2388                            "fefuncg": "feFuncG",
2389                            "fefuncr": "feFuncR",
2390                            "fegaussianblur": "feGaussianBlur",
2391                            "feimage": "feImage",
2392                            "femerge": "feMerge",
2393                            "femergenode": "feMergeNode",
2394                            "femorphology": "feMorphology",
2395                            "feoffset": "feOffset",
2396                            "fepointlight": "fePointLight",
2397                            "fespecularlighting": "feSpecularLighting",
2398                            "fespotlight": "feSpotLight",
2399                            "fetile": "feTile",
2400                            "feturbulence": "feTurbulence",
2401                            "foreignobject": "foreignObject",
2402                            "glyphref": "glyphRef",
2403                            "lineargradient": "linearGradient",
2404                            "radialgradient": "radialGradient",
2405                            "textpath": "textPath"}
2406
2407            if token["name"] in replacements:
2408                token["name"] = replacements[token["name"]]
2409
2410        def processCharacters(self, token):
2411            if token["data"] == "\u0000":
2412                token["data"] = "\uFFFD"
2413            elif (self.parser.framesetOK and
2414                  any(char not in spaceCharacters for char in token["data"])):
2415                self.parser.framesetOK = False
2416            Phase.processCharacters(self, token)
2417
2418        def processStartTag(self, token):
2419            currentNode = self.tree.openElements[-1]
2420            if (token["name"] in self.breakoutElements or
2421                (token["name"] == "font" and
2422                 set(token["data"].keys()) & set(["color", "face", "size"]))):
2423                self.parser.parseError("unexpected-html-element-in-foreign-content",
2424                                       {"name": token["name"]})
2425                while (self.tree.openElements[-1].namespace !=
2426                       self.tree.defaultNamespace and
2427                       not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
2428                       not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
2429                    self.tree.openElements.pop()
2430                return token
2431
2432            else:
2433                if currentNode.namespace == namespaces["mathml"]:
2434                    self.parser.adjustMathMLAttributes(token)
2435                elif currentNode.namespace == namespaces["svg"]:
2436                    self.adjustSVGTagNames(token)
2437                    self.parser.adjustSVGAttributes(token)
2438                self.parser.adjustForeignAttributes(token)
2439                token["namespace"] = currentNode.namespace
2440                self.tree.insertElement(token)
2441                if token["selfClosing"]:
2442                    self.tree.openElements.pop()
2443                    token["selfClosingAcknowledged"] = True
2444
2445        def processEndTag(self, token):
2446            nodeIndex = len(self.tree.openElements) - 1
2447            node = self.tree.openElements[-1]
2448            if node.name != token["name"]:
2449                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2450
2451            while True:
2452                if node.name.translate(asciiUpper2Lower) == token["name"]:
2453                    # XXX this isn't in the spec but it seems necessary
2454                    if self.parser.phase == self.parser.phases["inTableText"]:
2455                        self.parser.phase.flushCharacters()
2456                        self.parser.phase = self.parser.phase.originalPhase
2457                    while self.tree.openElements.pop() != node:
2458                        assert self.tree.openElements
2459                    new_token = None
2460                    break
2461                nodeIndex -= 1
2462
2463                node = self.tree.openElements[nodeIndex]
2464                if node.namespace != self.tree.defaultNamespace:
2465                    continue
2466                else:
2467                    new_token = self.parser.phase.processEndTag(token)
2468                    break
2469            return new_token
2470
2471    class AfterBodyPhase(Phase):
2472        def __init__(self, parser, tree):
2473            Phase.__init__(self, parser, tree)
2474
2475            self.startTagHandler = utils.MethodDispatcher([
2476                ("html", self.startTagHtml)
2477            ])
2478            self.startTagHandler.default = self.startTagOther
2479
2480            self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
2481            self.endTagHandler.default = self.endTagOther
2482
2483        def processEOF(self):
2484            # Stop parsing
2485            pass
2486
2487        def processComment(self, token):
2488            # This is needed because data is to be appended to the <html> element
2489            # here and not to whatever is currently open.
2490            self.tree.insertComment(token, self.tree.openElements[0])
2491
2492        def processCharacters(self, token):
2493            self.parser.parseError("unexpected-char-after-body")
2494            self.parser.phase = self.parser.phases["inBody"]
2495            return token
2496
2497        def startTagHtml(self, token):
2498            return self.parser.phases["inBody"].processStartTag(token)
2499
2500        def startTagOther(self, token):
2501            self.parser.parseError("unexpected-start-tag-after-body",
2502                                   {"name": token["name"]})
2503            self.parser.phase = self.parser.phases["inBody"]
2504            return token
2505
2506        def endTagHtml(self, name):
2507            if self.parser.innerHTML:
2508                self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
2509            else:
2510                self.parser.phase = self.parser.phases["afterAfterBody"]
2511
2512        def endTagOther(self, token):
2513            self.parser.parseError("unexpected-end-tag-after-body",
2514                                   {"name": token["name"]})
2515            self.parser.phase = self.parser.phases["inBody"]
2516            return token
2517
2518    class InFramesetPhase(Phase):
2519        # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
2520        def __init__(self, parser, tree):
2521            Phase.__init__(self, parser, tree)
2522
2523            self.startTagHandler = utils.MethodDispatcher([
2524                ("html", self.startTagHtml),
2525                ("frameset", self.startTagFrameset),
2526                ("frame", self.startTagFrame),
2527                ("noframes", self.startTagNoframes)
2528            ])
2529            self.startTagHandler.default = self.startTagOther
2530
2531            self.endTagHandler = utils.MethodDispatcher([
2532                ("frameset", self.endTagFrameset)
2533            ])
2534            self.endTagHandler.default = self.endTagOther
2535
2536        def processEOF(self):
2537            if self.tree.openElements[-1].name != "html":
2538                self.parser.parseError("eof-in-frameset")
2539            else:
2540                assert self.parser.innerHTML
2541
2542        def processCharacters(self, token):
2543            self.parser.parseError("unexpected-char-in-frameset")
2544
2545        def startTagFrameset(self, token):
2546            self.tree.insertElement(token)
2547
2548        def startTagFrame(self, token):
2549            self.tree.insertElement(token)
2550            self.tree.openElements.pop()
2551
2552        def startTagNoframes(self, token):
2553            return self.parser.phases["inBody"].processStartTag(token)
2554
2555        def startTagOther(self, token):
2556            self.parser.parseError("unexpected-start-tag-in-frameset",
2557                                   {"name": token["name"]})
2558
2559        def endTagFrameset(self, token):
2560            if self.tree.openElements[-1].name == "html":
2561                # innerHTML case
2562                self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
2563            else:
2564                self.tree.openElements.pop()
2565            if (not self.parser.innerHTML and
2566                    self.tree.openElements[-1].name != "frameset"):
2567                # If we're not in innerHTML mode and the the current node is not a
2568                # "frameset" element (anymore) then switch.
2569                self.parser.phase = self.parser.phases["afterFrameset"]
2570
2571        def endTagOther(self, token):
2572            self.parser.parseError("unexpected-end-tag-in-frameset",
2573                                   {"name": token["name"]})
2574
2575    class AfterFramesetPhase(Phase):
2576        # http://www.whatwg.org/specs/web-apps/current-work/#after3
2577        def __init__(self, parser, tree):
2578            Phase.__init__(self, parser, tree)
2579
2580            self.startTagHandler = utils.MethodDispatcher([
2581                ("html", self.startTagHtml),
2582                ("noframes", self.startTagNoframes)
2583            ])
2584            self.startTagHandler.default = self.startTagOther
2585
2586            self.endTagHandler = utils.MethodDispatcher([
2587                ("html", self.endTagHtml)
2588            ])
2589            self.endTagHandler.default = self.endTagOther
2590
2591        def processEOF(self):
2592            # Stop parsing
2593            pass
2594
2595        def processCharacters(self, token):
2596            self.parser.parseError("unexpected-char-after-frameset")
2597
2598        def startTagNoframes(self, token):
2599            return self.parser.phases["inHead"].processStartTag(token)
2600
2601        def startTagOther(self, token):
2602            self.parser.parseError("unexpected-start-tag-after-frameset",
2603                                   {"name": token["name"]})
2604
2605        def endTagHtml(self, token):
2606            self.parser.phase = self.parser.phases["afterAfterFrameset"]
2607
2608        def endTagOther(self, token):
2609            self.parser.parseError("unexpected-end-tag-after-frameset",
2610                                   {"name": token["name"]})
2611
2612    class AfterAfterBodyPhase(Phase):
2613        def __init__(self, parser, tree):
2614            Phase.__init__(self, parser, tree)
2615
2616            self.startTagHandler = utils.MethodDispatcher([
2617                ("html", self.startTagHtml)
2618            ])
2619            self.startTagHandler.default = self.startTagOther
2620
2621        def processEOF(self):
2622            pass
2623
2624        def processComment(self, token):
2625            self.tree.insertComment(token, self.tree.document)
2626
2627        def processSpaceCharacters(self, token):
2628            return self.parser.phases["inBody"].processSpaceCharacters(token)
2629
2630        def processCharacters(self, token):
2631            self.parser.parseError("expected-eof-but-got-char")
2632            self.parser.phase = self.parser.phases["inBody"]
2633            return token
2634
2635        def startTagHtml(self, token):
2636            return self.parser.phases["inBody"].processStartTag(token)
2637
2638        def startTagOther(self, token):
2639            self.parser.parseError("expected-eof-but-got-start-tag",
2640                                   {"name": token["name"]})
2641            self.parser.phase = self.parser.phases["inBody"]
2642            return token
2643
2644        def processEndTag(self, token):
2645            self.parser.parseError("expected-eof-but-got-end-tag",
2646                                   {"name": token["name"]})
2647            self.parser.phase = self.parser.phases["inBody"]
2648            return token
2649
2650    class AfterAfterFramesetPhase(Phase):
2651        def __init__(self, parser, tree):
2652            Phase.__init__(self, parser, tree)
2653
2654            self.startTagHandler = utils.MethodDispatcher([
2655                ("html", self.startTagHtml),
2656                ("noframes", self.startTagNoFrames)
2657            ])
2658            self.startTagHandler.default = self.startTagOther
2659
2660        def processEOF(self):
2661            pass
2662
2663        def processComment(self, token):
2664            self.tree.insertComment(token, self.tree.document)
2665
2666        def processSpaceCharacters(self, token):
2667            return self.parser.phases["inBody"].processSpaceCharacters(token)
2668
2669        def processCharacters(self, token):
2670            self.parser.parseError("expected-eof-but-got-char")
2671
2672        def startTagHtml(self, token):
2673            return self.parser.phases["inBody"].processStartTag(token)
2674
2675        def startTagNoFrames(self, token):
2676            return self.parser.phases["inHead"].processStartTag(token)
2677
2678        def startTagOther(self, token):
2679            self.parser.parseError("expected-eof-but-got-start-tag",
2680                                   {"name": token["name"]})
2681
2682        def processEndTag(self, token):
2683            self.parser.parseError("expected-eof-but-got-end-tag",
2684                                   {"name": token["name"]})
2685
    # Registry of phase classes keyed by phase name.  The handlers above
    # look phases up by these same keys (e.g. self.parser.phases["inTable"]);
    # presumably the parser instantiates each class from this mapping --
    # confirm against the caller.
    return {
        "initial": InitialPhase,
        "beforeHtml": BeforeHtmlPhase,
        "beforeHead": BeforeHeadPhase,
        "inHead": InHeadPhase,
        # XXX "inHeadNoscript": InHeadNoScriptPhase,
        "afterHead": AfterHeadPhase,
        "inBody": InBodyPhase,
        "text": TextPhase,
        "inTable": InTablePhase,
        "inTableText": InTableTextPhase,
        "inCaption": InCaptionPhase,
        "inColumnGroup": InColumnGroupPhase,
        "inTableBody": InTableBodyPhase,
        "inRow": InRowPhase,
        "inCell": InCellPhase,
        "inSelect": InSelectPhase,
        "inSelectInTable": InSelectInTablePhase,
        "inForeignContent": InForeignContentPhase,
        "afterBody": AfterBodyPhase,
        "inFrameset": InFramesetPhase,
        "afterFrameset": AfterFramesetPhase,
        "afterAfterBody": AfterAfterBodyPhase,
        "afterAfterFrameset": AfterAfterFramesetPhase,
        # XXX after after frameset
        # NOTE(review): "afterAfterFrameset" is registered just above, so the
        # XXX marker on the previous line looks stale -- confirm and remove.
    }
2712
2713
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict for a tag the parser implies itself.

    ``type`` is a key into ``tokenTypes`` (an end tag by default).
    ``attributes`` becomes the token's "data"; a fresh empty dict is used
    when it is None, so callers never share one.
    """
    data = {} if attributes is None else attributes
    token = {"type": tokenTypes[type], "name": name}
    token["data"] = data
    token["selfClosing"] = selfClosing
    return token
2720
2721
class ParseError(Exception):
    """Exception type representing an error in the parsed document."""
2725