146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom __future__ import absolute_import, division, unicode_literals
246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom six import text_type
346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom six.moves import http_client
446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangimport codecs
646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangimport re
746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom .constants import encodings, ReparseException
1046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom . import utils
1146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
1246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom io import StringIO
1346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
1446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangtry:
1546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    from io import BytesIO
1646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangexcept ImportError:
1746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    BytesIO = StringIO
1846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
1946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangtry:
2046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    from io import BufferedIOBase
2146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangexcept ImportError:
2246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    class BufferedIOBase(object):
2346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        pass
2446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
# Byte-string counterparts of the character constants, for use in the
# pre-parser (which works on undecoded input)
spaceCharactersBytes = frozenset(ch.encode("ascii") for ch in spaceCharacters)
asciiLettersBytes = frozenset(ch.encode("ascii") for ch in asciiLetters)
asciiUppercaseBytes = frozenset(ch.encode("ascii") for ch in asciiUppercase)
spacesAngleBrackets = spaceCharactersBytes.union([b">", b"<"])
3046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
3146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
# Character class (as regex source text) matching every code point that must
# be reported as "invalid-codepoint", except for the surrogate range, which is
# added separately below on platforms that can represent lone surrogates.
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"

if utils.supports_lone_surrogates:
    # Use one extra step of indirection and create the surrogates with
    # eval. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # surrogates.
    #
    # The surrogate range has to go *inside* the character class: simply
    # appending it after the closing "]" would make the pattern require the
    # literal sequence U+D800, "-", U+DFFF after every match.
    assert (invalid_unicode_no_surrogate[-1] == "]" and
            invalid_unicode_no_surrogate.count("]") == 1)
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
4346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
# The two noncharacters (U+xFFFE and U+xFFFF) that close each of the sixteen
# supplementary planes, 0x01 through 0x10.
non_bmp_invalid_codepoints = set(
    (plane << 16) | tail
    for plane in range(0x01, 0x11)
    for tail in (0xFFFE, 0xFFFF))

ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Compiled-pattern cache for charsUntil(), keyed by (characters, opposite)
charsUntilRegEx = {}
5646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
5746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    Everything read from the wrapped stream is remembered, so that callers
    can seek() back into data that has already been consumed.

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        """Wrap ``stream``, an object with a ``read(n)`` method returning bytes."""
        self.stream = stream
        # One entry per call made to stream.read(), in order
        self.buffer = []
        # [chunk number, offset within that chunk]; chunk number is -1
        # until the first chunk has been read
        self.position = [-1, 0]

    def tell(self):
        """Return the current position as an offset into the buffered data."""
        # max() keeps the initial chunk number of -1 from being used as a
        # negative slice bound.
        chunksBefore = self.buffer[:max(self.position[0], 0)]
        return sum(len(chunk) for chunk in chunksBefore) + self.position[1]

    def seek(self, pos):
        """Move to offset ``pos``, which must lie within the buffered data."""
        assert 0 <= pos <= self._bufferedBytes()
        if not self.buffer:
            # Nothing has been read yet, so the only valid target is 0 and
            # there is no chunk to index into; stay in the initial state.
            self.position = [-1, 0]
            return
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Return up to ``bytes`` bytes, replaying buffered data first."""
        if not self.buffer:
            return self._readStream(bytes)
        # position[0] is a chunk *index*, so the end of the buffer is the
        # last index (len - 1) together with the end of that chunk.
        atBufferEnd = (self.position[0] == len(self.buffer) - 1 and
                       self.position[1] == len(self.buffer[-1]))
        if atBufferEnd:
            return self._readStream(bytes)
        return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Return the total number of bytes held in the buffer."""
        return sum(len(item) for item in self.buffer)

    def _readStream(self, bytes):
        """Read from the wrapped stream, remember the data and return it."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        # The new chunk is fully consumed: point just past its end
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from buffered chunks, topping up from the stream."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]
            available = len(bufferedData) - bufferOffset

            if remainingBytes <= available:
                # The request ends inside this chunk
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Take the rest of this chunk and move on to the next one
                bytesToRead = available
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            # Only the first chunk visited starts part-way through
            bufferOffset = 0

        if remainingBytes:
            # Buffer exhausted; fetch whatever is still owed from the stream
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)
13046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
13146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    """Build the input stream object matching the type of ``source``.

    Unicode sources (a text string, or a file-like object whose ``read``
    returns text) yield an HTMLUnicodeInputStream; anything else yields an
    HTMLBinaryInputStream, which receives ``encoding``, ``parseMeta`` and
    ``chardet``.

    Raises TypeError if ``encoding`` is given together with a unicode source.
    """
    if isinstance(source, http_client.HTTPResponse):
        # Work around Python bug #20007: read(0) closes the connection.
        # http://bugs.python.org/issue20007
        # An HTTP response is treated as bytes without probing it.
        isUnicode = False
    elif hasattr(source, "read"):
        # A zero-length read reveals the stream's data type
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)

    if not isUnicode:
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)

    if encoding is not None:
        raise TypeError("Cannot explicitly set an encoding with a unicode string")

    return HTMLUnicodeInputStream(source)
14946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
15046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    The source is read in chunks.  For each chunk this class normalises
    CR LF and CR to LF, records an "invalid-codepoint" error for every
    invalid code point, replaces lone surrogates with U+FFFD (on platforms
    where they can occur) and keeps enough state to report the current
    line and column.

    """

    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLUnicodeInputStream.

        source can be either a file-like object (anything with a ``read``
        method, which is expected to return unicode text) or a unicode
        string holding the document itself.

        """

        if not utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
            self.replaceCharactersRegexp = None
        elif len("\U0010FFFF") == 1:
            # Wide build: one code point per string element
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
        else:
            # Narrow build: astral characters are surrogate pairs, so only
            # unpaired surrogates may be replaced
            self.reportCharacterErrors = self.characterErrorsUCS2
            self.replaceCharactersRegexp = re.compile(
                eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = ("utf-8", "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        """Discard the current chunk, recorded errors and position state."""
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object or a string; a string is wrapped
        in a StringIO so that it can be read like a file.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        """Returns (line, col) for ``offset`` in the current chunk.

        The line number is zero-based and counts the lines of all previous
        chunks as well.
        """
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            # Still on the line the previous chunk ended on
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        """Replace the current chunk with the next one from the data stream.

        chunkSize is the number of characters requested from the stream and
        defaults to _defaultChunkSize.  Returns False once there is no more
        data, True otherwise.
        """
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        # Fold the chunk being discarded into the running line/column totals
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            # A trailing CR or high surrogate may be the first half of a
            # CR LF or a surrogate pair; hold it back for the next chunk.
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

            # Replace invalid characters
            # Note U+0000 is dealt with in the tokenizer
            data = self.replaceCharactersRegexp.sub("\ufffd", data)

        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        """Record one "invalid-codepoint" error per invalid character in data."""
        invalidCount = len(invalid_unicode_re.findall(data))
        self.errors.extend(["invalid-codepoint"] * invalidCount)

    def characterErrorsUCS2(self, data):
        """Record "invalid-codepoint" errors on builds using surrogate pairs.

        A surrogate pair is judged once, by the code point it encodes; every
        other match of invalid_unicode_re is an error in its own right.
        """
        # Offset of a low surrogate already handled as the second half of a
        # pair.  Tracking the exact offset, rather than a flag, means only
        # that one match is skipped and later ones are still reported.
        pairedLowPos = -1
        for match in invalid_unicode_re.finditer(data):
            pos = match.start()
            if pos == pairedLowPos:
                continue
            # Pretty sure there should be endianness issues here
            if utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                pairedLowPos = pos + 1
            else:
                # Unpaired surrogate or any other invalid code point
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.

        If 'opposite' is true the test is inverted: characters are consumed
        for as long as they *are* in 'characters'.  'characters' is also used
        as part of a dictionary key, so it must be hashable.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert ord(c) < 128
            # Hex escapes, so that no character is special inside the class
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        """Push the character most recently returned by char() back.

        Passing None is a no-op.
        """
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
38846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
38946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-like object (anything with a ``read``
        attribute) or a byte string; anything that is not file-like is
        wrapped in a BytesIO and used as the document bytes (see openStream).

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified (and recognised by codecName), that
        encoding will be used, regardless of any BOM or later declaration
        (such as in a meta element)

        parseMeta - Look for a <meta> element containing encoding information

        chardet - When neither a BOM nor a <meta> element yields an
                  encoding, try the charade/chardet detector if one of
                  them can be imported

        """
        # Raw byte stream.  openStream wraps it in a BufferedStream when
        # seek()/tell() do not work on it, because encoding detection needs
        # to rewind.
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        self.charEncoding = (codecName(encoding), "certain")

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to feed to chardet at a time when detecting the
        # encoding
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # Build the decoding reader now that the final encoding is known
        self.reset()

    def reset(self):
        """Recreate the decoding reader for the current encoding and reset
        the inherited stream state.

        Undecodable byte sequences are replaced rather than raising (the
        codec reader is created with the 'replace' error handler).
        """
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file-like object (it has a ``read``
        attribute) or a byte string, which is wrapped in a BytesIO.

        The returned stream is wrapped in a BufferedStream when a
        seek()/tell() round trip on it fails.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except Exception:
            # Any failure here means the stream cannot be rewound directly.
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate instead of being swallowed.
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Work out the document encoding.

        Sources are tried in order: BOM, <meta> declaration (if parseMeta),
        charade/chardet (if chardet is true and one of them is importable),
        and finally self.defaultEncoding.

        Returns an (encoding, confidence) tuple; confidence is "certain"
        only when a BOM was found, otherwise "tentative".
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                # Feed the detector fixed-size pieces until it has made up
                # its mind or the stream is exhausted.
                while not detector.done:
                    chunk = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(chunk, bytes)
                    if not chunk:
                        break
                    detector.feed(chunk)
                detector.close()
                encoding = detector.result['encoding']
                # Rewind so the real parse starts from the first byte again
                self.rawStream.seek(0)
            except ImportError:
                # Neither detector is installed; fall through to the default
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to newEncoding after a late encoding declaration.

        Must only be called while the current encoding is not "certain".
        UTF-16 variants are replaced by utf-8.  Nothing happens when
        codecName does not recognise newEncoding.  When the new encoding
        equals the current one, it is simply marked "certain".

        Raises ReparseException when the encoding really changes: the raw
        stream is rewound and the decoding reader rebuilt for the new
        encoding first.
        """
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            # Remember the old name for the message before it is overwritten
            oldEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            # The new encoding has to be stored before reset(), because
            # reset() builds the codec reader from self.charEncoding[0].
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect a BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Read 4 bytes from the current position (no seek is done first, so
        # this assumes the stream is still positioned at its start)
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])         # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)         # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(seek if encoding else 0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element

        Only the first self.numBytesMeta bytes are examined; the raw stream
        is rewound to the start afterwards.  UTF-16 declarations are
        reported as utf-8.  Returns None when no encoding is found.
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding
56646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
56746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
class EncodingBytes(bytes):
    """Lower-cased byte string that carries its own read cursor.

    The cursor starts just before the first byte.  Once it has moved to or
    past the end of the data, reading or writing the position raises
    StopIteration."""
    def __new__(self, value):
        assert isinstance(value, bytes)
        lowered = value.lower()
        return bytes.__new__(self, lowered)

    def __init__(self, value):
        # -1 means "before the first byte": nothing has been consumed yet
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        """Advance the cursor by one and return the byte now under it."""
        self._position += 1
        cursor = self._position
        if cursor < 0:
            raise TypeError
        if cursor >= len(self):
            raise StopIteration
        return self[cursor:cursor + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        """Move the cursor back by one and return the byte now under it."""
        cursor = self._position
        if cursor < 0:
            raise TypeError
        if cursor >= len(self):
            raise StopIteration
        cursor -= 1
        self._position = cursor
        return self[cursor:cursor + 1]

    def setPosition(self, position):
        # The check is on the *current* cursor, not the requested one: once
        # the cursor has run off the end it cannot be moved again.
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        cursor = self._position
        if cursor >= len(self):
            raise StopIteration
        # A negative cursor (nothing consumed yet) is reported as None
        return cursor if cursor >= 0 else None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        here = self.position
        return self[here:here + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Move forward to the first byte that is NOT in chars.

        Returns that byte, or None (with the cursor left at the end of the
        data) when every remaining byte is in chars."""
        start = self.position           # use property for the error-checking
        size = len(self)
        for index in range(start, size):
            byte = self[index:index + 1]
            if byte not in chars:
                self._position = index
                return byte
        self._position = size
        return None

    def skipUntil(self, chars):
        """Move forward to the first byte that IS in chars.

        Returns that byte, or None (with the cursor left at the end of the
        data) when no remaining byte is in chars."""
        start = self.position
        size = len(self)
        for index in range(start, size):
            byte = self[index:index + 1]
            if byte in chars:
                self._position = index
                return byte
        self._position = size
        return None

    def matchBytes(self, bytes):
        """Test whether the data at the cursor begins with the given bytes.

        On a match the cursor is moved to the byte after the match and True
        is returned; otherwise the cursor stays put and False is returned"""
        wanted = len(bytes)
        start = self.position
        found = self[start:start + wanted].startswith(bytes)
        if found:
            self.position = start + wanted
        return found

    def jumpTo(self, bytes):
        """Search forward from the cursor for the given byte sequence.

        On success the cursor is moved to the last byte of the match and
        True is returned; if the sequence does not occur, StopIteration is
        raised"""
        offset = self[self.position:].find(bytes)
        if offset < 0:
            raise StopIteration
        # XXX: a cursor still at its initial -1 has to be treated as 0 so
        # the relative offset below lands on the right byte.
        if self._position == -1:
            self._position = 0
        self._position = self._position + offset + len(bytes) - 1
        return True
66946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
67046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
67146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangclass EncodingParser(object):
67246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    """Mini parser for detecting character encoding from meta elements"""
67346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
67446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    def __init__(self, data):
67546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        """string - the data to work on for encoding detection"""
67646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        self.data = EncodingBytes(data)
67746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        self.encoding = None
67846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
67946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    def getEncoding(self):
68046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        methodDispatch = (
68146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            (b"<!--", self.handleComment),
68246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            (b"<meta", self.handleMeta),
68346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            (b"</", self.handlePossibleEndTag),
68446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            (b"<!", self.handleOther),
68546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            (b"<?", self.handleOther),
68646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            (b"<", self.handlePossibleStartTag))
68746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        for byte in self.data:
68846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            keepParsing = True
68946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            for key, method in methodDispatch:
69046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                if self.data.matchBytes(key):
69146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    try:
69246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        keepParsing = method()
69346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        break
69446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    except StopIteration:
69546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        keepParsing = False
69646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        break
69746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            if not keepParsing:
69846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                break
69946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
70046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        return self.encoding
70146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
70246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    def handleComment(self):
70346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        """Skip over comments"""
70446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        return self.data.jumpTo(b"-->")
70546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
70646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    def handleMeta(self):
70746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        if self.data.currentByte not in spaceCharactersBytes:
70846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # if we have <meta not followed by a space so just keep going
70946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            return True
71046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # We have a valid meta element we want to search for attributes
71146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        hasPragma = False
71246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        pendingEncoding = None
71346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        while True:
71446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # Try to find the next attribute after the current position
71546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            attr = self.getAttribute()
71646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            if attr is None:
71746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                return True
71846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            else:
71946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                if attr[0] == b"http-equiv":
72046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    hasPragma = attr[1] == b"content-type"
72146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    if hasPragma and pendingEncoding is not None:
72246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        self.encoding = pendingEncoding
72346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        return False
72446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                elif attr[0] == b"charset":
72546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    tentativeEncoding = attr[1]
72646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    codec = codecName(tentativeEncoding)
72746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    if codec is not None:
72846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        self.encoding = codec
72946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        return False
73046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                elif attr[0] == b"content":
73146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
73246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    tentativeEncoding = contentParser.parse()
73346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    if tentativeEncoding is not None:
73446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        codec = codecName(tentativeEncoding)
73546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                        if codec is not None:
73646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                            if hasPragma:
73746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                                self.encoding = codec
73846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                                return False
73946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                            else:
74046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                                pendingEncoding = codec
74146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
74246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    def handlePossibleStartTag(self):
74346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        return self.handlePossibleTag(False)
74446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
74546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    def handlePossibleEndTag(self):
74646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        next(self.data)
74746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        return self.handlePossibleTag(True)
74846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
74946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    def handlePossibleTag(self, endTag):
75046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        data = self.data
75146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        if data.currentByte not in asciiLettersBytes:
75246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # If the next byte is not an ascii letter either ignore this
75346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # fragment (possible start tag case) or treat it according to
75446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # handleOther
75546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            if endTag:
75646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                data.previous()
75746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                self.handleOther()
75846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            return True
75946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
76046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        c = data.skipUntil(spacesAngleBrackets)
76146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        if c == b"<":
76246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # return to the first step in the overall "two step" algorithm
76346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # reprocessing the < byte
76446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            data.previous()
76546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        else:
76646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # Read all attributes
76746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            attr = self.getAttribute()
76846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            while attr is not None:
76946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                attr = self.getAttribute()
77046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        return True
77146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
77246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    def handleOther(self):
77346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        return self.data.jumpTo(b">")
77446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
77546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang    def getAttribute(self):
77646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        """Return a name,value pair for the next attribute in the stream,
77746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        if one is found, or None"""
77846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        data = self.data
77946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # Step 1 (skip chars)
78046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
78146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        assert c is None or len(c) == 1
78246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # Step 2
78346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        if c in (b">", None):
78446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            return None
78546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # Step 3
78646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        attrName = []
78746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        attrValue = []
78846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # Step 4 attribute name
78946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        while True:
79046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            if c == b"=" and attrName:
79146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                break
79246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            elif c in spaceCharactersBytes:
79346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                # Step 6!
79446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                c = data.skip()
79546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                break
79646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            elif c in (b"/", b">"):
79746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                return b"".join(attrName), b""
79846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            elif c in asciiUppercaseBytes:
79946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                attrName.append(c.lower())
80046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            elif c is None:
80146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                return None
80246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            else:
80346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                attrName.append(c)
80446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # Step 5
80546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            c = next(data)
80646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # Step 7
80746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        if c != b"=":
80846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            data.previous()
80946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            return b"".join(attrName), b""
81046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # Step 8
81146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        next(data)
81246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # Step 9
81346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        c = data.skip()
81446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # Step 10
81546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        if c in (b"'", b'"'):
81646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            # 10.1
81746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            quoteChar = c
81846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            while True:
81946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                # 10.2
82046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                c = next(data)
82146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                # 10.3
82246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                if c == quoteChar:
82346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    next(data)
82446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    return b"".join(attrName), b"".join(attrValue)
82546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                # 10.4
82646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                elif c in asciiUppercaseBytes:
82746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    attrValue.append(c.lower())
82846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                # 10.5
82946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                else:
83046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                    attrValue.append(c)
83146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        elif c == b">":
83246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            return b"".join(attrName), b""
83346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        elif c in asciiUppercaseBytes:
83446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            attrValue.append(c.lower())
83546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        elif c is None:
83646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            return None
83746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        else:
83846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            attrValue.append(c)
83946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        # Step 11
84046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang        while True:
84146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            c = next(data)
84246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            if c in spacesAngleBrackets:
84346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                return b"".join(attrName), b"".join(attrValue)
84446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            elif c in asciiUppercaseBytes:
84546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                attrValue.append(c.lower())
84646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            elif c is None:
84746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                return None
84846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang            else:
84946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang                attrValue.append(c)
85046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
85146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
class ContentAttrParser(object):
    """Extracts the value following ``charset`` from the bytes of a
    ``content`` attribute."""

    def __init__(self, data):
        """Store the attribute bytes to be parsed.

        data must be a bytes instance. parse() additionally relies on the
        stream helpers jumpTo, skip, skipUntil, position and currentByte, so
        in practice a bytes subclass that provides them is passed in.
        """
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        """Return the raw charset value found in the data, or None.

        Looks for b"charset" followed (after skip()) by b"=". The value is
        either the text between a matching pair of quote marks, or, when
        unquoted, everything up to the position reached by
        skipUntil(spaceCharactersBytes). None is returned when there is no
        "=", when jumpTo reports no closing quote, or when the stream raises
        StopIteration.
        """
        stream = self.data
        try:
            # Only a "charset" name is of interest; bail out otherwise
            stream.jumpTo(b"charset")
            stream.position += 1
            stream.skip()
            if stream.currentByte != b"=":
                # Without an = sign there is no value to extract
                return None
            stream.position += 1
            stream.skip()

            if stream.currentByte in (b'"', b"'"):
                # Quoted value: the encoding sits between matching quotes
                quote = stream.currentByte
                stream.position += 1
                start = stream.position
                if not stream.jumpTo(quote):
                    return None
                return stream[start:stream.position]

            # Unquoted value
            start = stream.position
            try:
                stream.skipUntil(spaceCharactersBytes)
                return stream[start:stream.position]
            except StopIteration:
                # Everything that is left belongs to the value
                return stream[start:]
        except StopIteration:
            return None
88946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
89046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang
def codecName(encoding):
    """Map an encoding label to the codec name registered in `encodings`.

    A bytes label is decoded as ASCII first. The label then has every match
    of `ascii_punctuation_re` removed and is lower-cased before being looked
    up in `encodings`.

    Returns None for an empty/false label, for bytes that are not valid
    ASCII, and for labels that have no entry in `encodings`.
    """
    label = encoding
    if isinstance(label, bytes):
        try:
            label = label.decode("ascii")
        except UnicodeDecodeError:
            # Bytes outside ASCII cannot match any known label
            return None

    if not label:
        return None

    normalized = ascii_punctuation_re.sub("", label).lower()
    return encodings.get(normalized)
904