1from __future__ import absolute_import, division, unicode_literals
2from six import text_type
3from six.moves import http_client
4
5import codecs
6import re
7
8from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
9from .constants import encodings, ReparseException
10from . import utils
11
12from io import StringIO
13
14try:
15    from io import BytesIO
16except ImportError:
17    BytesIO = StringIO
18
19try:
20    from io import BufferedIOBase
21except ImportError:
22    class BufferedIOBase(object):
23        pass
24
# Non-unicode (bytes) versions of constants for use in the pre-parser,
# which scans raw bytes before the character encoding is known
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])


# Regexp character class matching codepoints that are invalid in HTML
# (controls and non-characters), excluding the surrogate range, which is
# appended below only on platforms that can represent lone surrogates
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"

if utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # unichr. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # surrogates.
    # The surrogate range must be spliced *inside* the [...] character
    # class (before its closing "]").  Appending it after the "]" would
    # produce a pattern that only matches an invalid character followed
    # by the literal three-character sequence U+D800, "-", U+DFFF, i.e.
    # a pattern that effectively never matches anything.
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

# Non-character codepoints outside the BMP; used by characterErrorsUCS2
# to validate surrogate pairs on narrow builds
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])

# ASCII whitespace and punctuation; codecName() strips these when
# canonicalising an encoding label
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}
56
57
class BufferedStream(object):
    """Buffering wrapper for streams that do not buffer on their own.

    Everything read from the wrapped stream is retained as a list of
    chunks (repeatedly concatenating strings would be O(n**2)), so the
    stream can be rewound with seek() and re-read from the buffer.
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        # [index of the current chunk, offset within that chunk]
        self.position = [-1, 0]

    def tell(self):
        """Return the current position, as a byte offset from the start."""
        # Length of every chunk before the current one, plus the offset
        # inside the current chunk
        return (sum(len(chunk) for chunk in self.buffer[:self.position[0]]) +
                self.position[1])

    def seek(self, pos):
        """Move to *pos*; only already-buffered positions are reachable."""
        assert pos <= self._bufferedBytes()
        remaining = pos
        index = 0
        while len(self.buffer[index]) < remaining:
            remaining -= len(self.buffer[index])
            index += 1
        self.position = [index, remaining]

    def read(self, bytes):
        """Read up to *bytes* bytes, serving from the buffer first."""
        if not self.buffer:
            return self._readStream(bytes)
        atBufferEnd = (self.position[0] == len(self.buffer) and
                       self.position[1] == len(self.buffer[-1]))
        if atBufferEnd:
            return self._readStream(bytes)
        return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        # Total number of bytes held in the buffer
        return sum(len(chunk) for chunk in self.buffer)

    def _readStream(self, bytes):
        # Pull fresh data from the wrapped stream and remember it
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        # Serve as much as possible from buffered chunks, then fall back
        # to the wrapped stream for the remainder
        remaining = bytes
        parts = []
        chunkIndex, chunkOffset = self.position
        while chunkIndex < len(self.buffer) and remaining != 0:
            assert remaining > 0
            chunk = self.buffer[chunkIndex]
            available = len(chunk) - chunkOffset
            if remaining <= available:
                take = remaining
                self.position = [chunkIndex, chunkOffset + take]
            else:
                take = available
                self.position = [chunkIndex, len(chunk)]
                chunkIndex += 1
            parts.append(chunk[chunkOffset:chunkOffset + take])
            remaining -= take
            # Subsequent chunks are consumed from their start
            chunkOffset = 0

        if remaining:
            parts.append(self._readStream(remaining))

        return b"".join(parts)
130
131
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    """Return a unicode or binary input stream wrapping *source*.

    A text (unicode) source yields an HTMLUnicodeInputStream; anything
    byte-oriented yields an HTMLBinaryInputStream, which performs
    encoding detection using *encoding*, *parseMeta* and *chardet*.
    """
    if isinstance(source, http_client.HTTPResponse):
        # Work around Python bug #20007: read(0) closes the connection,
        # so don't probe HTTPResponse objects with read(0).
        # http://bugs.python.org/issue20007
        isUnicode = False
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)

    if not isUnicode:
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)

    # A decoded string has no encoding left to override
    if encoding is not None:
        raise TypeError("Cannot explicitly set an encoding with a unicode string")
    return HTMLUnicodeInputStream(source)
149
150
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    # Number of characters to read from the underlying stream at a time
    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        """

        if not utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
            self.replaceCharactersRegexp = None
        elif len("\U0010FFFF") == 1:
            # Wide (UCS-4) build: any surrogate is invalid.  eval() is
            # used so the source contains no literal lone surrogate,
            # which would be illegal on platforms without such support.
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
        else:
            # Narrow (UCS-2) build: surrogate *pairs* are how astral
            # characters are represented, so only unpaired halves are
            # matched for replacement.
            self.reportCharacterErrors = self.characterErrorsUCS2
            self.replaceCharactersRegexp = re.compile(
                eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = ("utf-8", "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        """Reset all chunk and position state to the start of the data."""
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        """Return (line, column), both 0-based, of the position *offset*
        characters into the current chunk."""
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            # No newline before offset in this chunk: the column count
            # continues from the end of the previous chunk
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream.

        The line number is 1-based; the column is 0-based."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        """Read the next chunk, normalising line endings and reporting /
        replacing invalid characters.  Returns False at EOF."""
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        # Record line/column totals for the chunk being discarded so
        # _position() stays correct across chunk boundaries
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            # Hold back a trailing CR (might be half of a CR LF pair) or
            # high surrogate (might be half of a surrogate pair) until
            # the next chunk arrives
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

            # Replace invalid characters
            # Note U+0000 is dealt with in the tokenizer
            data = self.replaceCharactersRegexp.sub("\ufffd", data)

        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        """Append one "invalid-codepoint" error per invalid character in
        *data* (wide-build variant)."""
        for i in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        """Append "invalid-codepoint" errors for *data* on narrow
        builds, where astral characters appear as surrogate pairs."""
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                # The previous match was the high half of a surrogate
                # pair and this match is its low half, which has already
                # been accounted for.  Reset the flag so that only this
                # single match is skipped; previously `skip` was never
                # cleared here, silently swallowing every later invalid
                # codepoint in the chunk.
                skip = False
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                # Lone surrogate at the very end of the data
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                # "^" inside the class negates it: match runs of
                # characters *not* in the set
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
389
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        chardet - Use charade/chardet (if installed) to guess the encoding
                  when nothing else determines it

        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        #              self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # codecName() canonicalises the label and yields None when the
        # encoding is unknown
        self.charEncoding = (codecName(encoding), "certain")

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # Rebuild the decoding reader now that the encoding is known
        self.reset()

    def reset(self):
        # (Re)create the decoding reader for the currently selected
        # encoding, then reset the unicode-level chunk/position state
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            # Encoding detection needs to rewind the stream; if it cannot
            # seek, wrap it in a buffering layer that can
            stream.seek(stream.tell())
        except Exception:
            # Narrowed from a bare except so that KeyboardInterrupt and
            # SystemExit are not swallowed; any stream-specific failure
            # of seek/tell still triggers the fallback
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Return (encoding, confidence), trying BOM, meta prescan,
        charade/chardet and finally the default, in that order."""
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if avaliable
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                # Neither charade nor chardet is installed; carry on
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to *newEncoding* (e.g. from a late <meta> declaration).

        When the encoding actually changes, the stream is rewound and a
        ReparseException is raised so the caller restarts parsing."""
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            # A utf-16 family declaration is mapped to utf-8, mirroring
            # the handling in detectEncodingMeta below
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            # Remember the old name for the exception message, and set
            # charEncoding *before* reset() so the new decoding reader
            # is built with the new encoding.  Previously reset() ran
            # first (rebuilding the reader with the stale encoding) and
            # the message formatted the new encoding twice.
            oldEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise ReparseException("Encoding changed from %s to %s" %
                                   (oldEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])         # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)         # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(encoding and seek or 0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            # A utf-16 family declaration found by the prescan is
            # mapped to utf-8 (same rule as changeEncoding)
            encoding = "utf-8"

        return encoding
566
567
class EncodingBytes(bytes):
    """A bytes subclass that carries a current position plus scanning
    helpers for the encoding pre-parser.

    The stored value is lowercased on construction.  Accessors raise
    StopIteration whenever the position has run past the end of the
    data, which terminates the pre-parser's driving loop.
    """

    def __new__(self, value):
        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        # Start one byte before the data; the first next() lands on 0
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        self._position += 1
        p = self._position
        if p >= len(self):
            raise StopIteration
        if p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Python 2 iterator protocol
        return self.__next__()

    def previous(self):
        """Step the position back one byte and return the byte there."""
        p = self._position
        if p >= len(self):
            raise StopIteration
        if p < 0:
            raise TypeError
        p -= 1
        self._position = p
        return self[p:p + 1]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Advance past consecutive bytes in *chars*; return the first
        byte not in the set, or None once the data is exhausted."""
        p = self.position  # the property performs the error-checking
        size = len(self)
        while p < size and self[p:p + 1] in chars:
            p += 1
        self._position = p
        return self[p:p + 1] if p < size else None

    def skipUntil(self, chars):
        """Advance until a byte in *chars* is found and return it, or
        None once the data is exhausted."""
        p = self.position
        size = len(self)
        while p < size and self[p:p + 1] not in chars:
            p += 1
        self._position = p
        return self[p:p + 1] if p < size else None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        matched = self[p:p + len(bytes)].startswith(bytes)
        if matched:
            self.position = p + len(bytes)
        return matched

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        offset = self[self.position:].find(bytes)
        if offset == -1:
            raise StopIteration
        # XXX: This is ugly, but I can't see a nicer way to fix this.
        if self._position == -1:
            self._position = 0
        self._position += offset + len(bytes) - 1
        return True
669
670
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        # Set by handleMeta() when a usable declaration is found
        self.encoding = None

    def getEncoding(self):
        """Scan the data for a <meta> encoding declaration; return the
        codec name found, or None."""
        # Most specific byte prefixes first: b"<!--" must be tried
        # before b"<!", which must be tried before b"<"
        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        # Iterating self.data advances its internal position one byte at
        # a time; the handlers advance it further
        for byte in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        # Handlers return False to stop the scan
                        keepParsing = method()
                        break
                    except StopIteration:
                        # Ran off the end of the data
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        """Inspect a <meta> tag's attributes for http-equiv, charset and
        content declarations; return False once an encoding is chosen."""
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        # A content attribute seen earlier becomes
                        # usable now that the pragma is confirmed
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = codecName(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                # Hold onto it in case the http-equiv
                                # pragma appears later
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        """Skip over a tag, consuming (and discarding) its attributes."""
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        """Skip to the end of the current markup construct."""
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None

        The numbered steps mirror the "get an attribute" part of the
        HTML encoding-sniffing prescan algorithm."""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
850
851
class ContentAttrParser(object):
    """Extracts the charset from the value of a meta "content"
    attribute, e.g. b"text/html; charset=utf-8"."""

    def __init__(self, data):
        # data is expected to support the EncodingBytes API (jumpTo,
        # skip, position) used in parse(); the isinstance check passes
        # because EncodingBytes subclasses bytes
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        """Return the declared charset as bytes, or None if there is no
        well-formed charset=... component."""
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            # Ran off the end of the data while scanning
            return None
889
890
def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if isinstance(encoding, bytes):
        # Encoding labels must be ASCII; anything else cannot name a
        # valid encoding
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None
    if not encoding:
        return None
    # Canonicalise the label by stripping ASCII whitespace/punctuation
    # and lowercasing, then look it up in the known-encodings table
    canonicalName = ascii_punctuation_re.sub("", encoding).lower()
    return encodings.get(canonicalName, None)
904