146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom __future__ import absolute_import, division, unicode_literals 246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom six import text_type 346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom six.moves import http_client 446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangimport codecs 646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangimport re 746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase 946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom .constants import encodings, ReparseException 1046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom . import utils 1146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 1246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangfrom io import StringIO 1346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 1446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangtry: 1546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang from io import BytesIO 1646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangexcept ImportError: 1746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang BytesIO = StringIO 1846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 1946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangtry: 2046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang from io import BufferedIOBase 2146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangexcept ImportError: 2246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang class BufferedIOBase(object): 2346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang pass 2446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 2546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang# Non-unicode versions of constants for use in the pre-parser 2646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen WangspaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters]) 2746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen 
WangasciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters]) 2846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen WangasciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase]) 2946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen WangspacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) 3046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 3146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 3246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wanginvalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" 3346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 3446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangif utils.supports_lone_surrogates: 3546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # Use one extra step of indirection and create surrogates with 3646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # unichr. Not using this indirection would introduce an illegal 3746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # unicode literal on platforms not supporting such lone 3846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # surrogates. 
class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    Everything read from the wrapped stream is retained, so earlier data
    can be re-read after a ``seek``.  The buffer is implemented as a list
    of chunks on the assumption that joining many strings will be slow
    since it is O(n**2).
    """

    def __init__(self, stream):
        """Wrap ``stream``, an object providing ``read(n)``."""
        self.stream = stream
        self.buffer = []
        # [chunk number, offset within that chunk]; chunk -1 means that
        # nothing has been read from the underlying stream yet.
        self.position = [-1, 0]

    def tell(self):
        """Return the current position as an offset from the stream start."""
        pos = sum(len(chunk) for chunk in self.buffer[:self.position[0]])
        pos += self.position[1]
        return pos

    def seek(self, pos):
        """Move to absolute offset ``pos``, which must already be buffered."""
        assert pos <= self._bufferedBytes()
        if not self.buffer:
            # Nothing buffered yet (so pos is 0): stay in the initial state
            # rather than indexing into an empty chunk list.
            self.position = [-1, 0]
            return
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Return up to ``bytes`` bytes, replaying buffered data first."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) - 1 and
              self.position[1] == len(self.buffer[-1])):
            # Positioned at the very end of the last chunk: there is nothing
            # left to replay, so go straight to the underlying stream.
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Return the total number of bytes held in the buffer."""
        return sum(len(item) for item in self.buffer)

    def _readStream(self, bytes):
        """Read from the underlying stream and record the data as a new chunk."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from buffered chunks, topping up from the stream."""
        remainingBytes = bytes
        rv = []
        bufferIndex, bufferOffset = self.position
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]
            available = len(bufferedData) - bufferOffset

            if remainingBytes <= available:
                # The request is satisfied inside this chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Take the rest of this chunk and move on to the next one.
                bytesToRead = available
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            # Only the first chunk is entered part-way through.
            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    """Build the input stream class matching the type of ``source``.

    Text input (a unicode string, or an object whose ``read(0)`` returns
    one) yields an HTMLUnicodeInputStream; anything else yields an
    HTMLBinaryInputStream, which receives ``encoding``, ``parseMeta`` and
    ``chardet``.

    Raises TypeError if ``encoding`` is given together with text input.
    """
    if isinstance(source, http_client.HTTPResponse):
        # Work around Python bug #20007: read(0) closes the connection.
        # http://bugs.python.org/issue20007
        is_text = False
    else:
        # File-like objects are probed with an empty read; plain values are
        # inspected directly.
        sample = source.read(0) if hasattr(source, "read") else source
        is_text = isinstance(sample, text_type)

    if not is_text:
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)

    if encoding is not None:
        raise TypeError("Cannot explicitly set an encoding with a unicode string")

    return HTMLUnicodeInputStream(source)
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of replacing invalid characters, normalising
    line endings and tracking line and column positions.

    """

    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLUnicodeInputStream.

        source is either an object with a ``read`` method or a string; a
        string is wrapped in a StringIO (see openStream).

        """

        if not utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
            self.replaceCharactersRegexp = None
        elif len("\U0010FFFF") == 1:
            # One string element per code point: any surrogate is a lone one.
            # NOTE: eval is applied to constant literals only, to avoid
            # surrogate literals in the source text.
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
        else:
            # Non-BMP characters are stored as surrogate pairs: only
            # unpaired surrogates are replaced.
            self.reportCharacterErrors = self.characterErrorsUCS2
            self.replaceCharactersRegexp = re.compile(
                eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = ("utf-8", "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        """Discard the current chunk, recorded errors and position state."""
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object (anything with a ``read``
        attribute, returned unchanged) or a string (wrapped in a StringIO).

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        """Return the zero-based (line, column) of ``offset`` in the chunk."""
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            # Still on the line the previous chunk ended on.
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        """Load the next chunk; return False once no data is left."""
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        # Carry the line/column reached at the end of the old chunk forward.
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                # A trailing CR or high surrogate may be completed by the
                # first character of the next chunk, so hold it back.
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        # Note U+0000 is dealt with in the tokenizer
        # The regexp is None on platforms without lone surrogates (see
        # __init__), where there is nothing to replace.
        if self.replaceCharactersRegexp is not None:
            data = self.replaceCharactersRegexp.sub("\ufffd", data)

        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        """Record one "invalid-codepoint" error per invalid character."""
        self.errors.extend(
            ["invalid-codepoint"] * len(invalid_unicode_re.findall(data)))

    def characterErrorsUCS2(self, data):
        """Record "invalid-codepoint" errors when non-BMP characters are
        represented as surrogate pairs.
        """
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                # This match is the second half of the pair handled on the
                # previous iteration; skip it once and resume checking.
                skip = False
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.  It is also used as part of a cache key, so it has to
        be hashable.  With 'opposite' set, the characters that ARE in
        'characters' are consumed instead.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        """Push ``char`` back so the next read returns it again.

        A ``char`` of None is ignored.
        """
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-like object (anything with a ``read``
        attribute) or a byte string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        chardet - Fall back to the charade/chardet detector (when one of
        them is importable) if neither a BOM nor a <meta> declaration
        yields an encoding

        """
        # Raw (byte) stream; guaranteed seekable by openStream
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # An explicitly supplied ("transport level") encoding is authoritative
        self.charEncoding = (codecName(encoding), "certain")

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes fed to chardet per read when detecting the encoding
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding iff no explicit "transport level" encoding is supplied
        if self.charEncoding[0] is None:
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # Build the decoder for the final encoding and reset the stream state
        self.reset()

    def reset(self):
        """Rebuild the decoding reader for the current encoding and reset.

        Undecodable byte sequences are replaced rather than raising
        (``'replace'`` error handler).
        """
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a seekable file object from source.

        source can be either a file-like object or a byte string; a byte
        string is wrapped in a BytesIO.  A stream that cannot ``tell``/``seek``
        is wrapped in a BufferedStream.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        # Probe for seekability; encoding detection needs to rewind.
        try:
            stream.seek(stream.tell())
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit.
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Work out the encoding of the raw stream.

        Tries, in order: a BOM, a <meta> declaration (if parseMeta), the
        charade/chardet detector (if chardet and importable), and finally
        ``self.defaultEncoding``.

        Returns an ``(encoding, confidence)`` tuple where confidence is
        "certain" only when a BOM was found and "tentative" otherwise.
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    detector.feed(buffer)
                detector.close()
                # May be None when the detector could not decide
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                # Detection is best-effort: no detector library installed
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}
        encoding = encodingSub.get(encoding.lower(), encoding)

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to newEncoding after a late encoding declaration.

        Must only be called while the current encoding is not "certain".
        UTF-16 variants are mapped to utf-8.  An unrecognised encoding is
        ignored.  If the new encoding equals the current one it is simply
        marked "certain"; otherwise the raw stream is rewound, the decoder
        is rebuilt for the new encoding and ReparseException is raised so
        that the caller restarts parsing.
        """
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return

        oldEncoding = self.charEncoding[0]
        if newEncoding == oldEncoding:
            self.charEncoding = (oldEncoding, "certain")
            return

        self.rawStream.seek(0)
        # The encoding must be updated *before* reset(): reset() builds the
        # decoder from self.charEncoding, so the previous order re-created
        # the reader with the old encoding.
        self.charEncoding = (newEncoding, "certain")
        self.reset()
        raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None

        Leaves the raw stream positioned just past the BOM when one is
        found, otherwise at offset 0.
        """
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Read up to 4 bytes from the current position (the longest BOM)
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        encoding = None
        seek = 0
        # Probe order: UTF-8 (3 bytes), then UTF-32 (4 bytes) before UTF-16
        # (2 bytes), because the UTF-32-LE BOM begins with the UTF-16-LE BOM.
        for bomLength in (3, 4, 2):
            candidate = string[:bomLength]
            encoding = bomDict.get(candidate)
            if encoding:
                # Use the matched length: a stream shorter than the probe
                # can still match a shorter BOM.
                seek = len(candidate)
                break

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(seek)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element

        Scans the first ``self.numBytesMeta`` bytes with EncodingParser and
        rewinds the raw stream to offset 0.  A declared UTF-16 variant is
        reported as utf-8.  Returns None when nothing is declared.
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding
class EncodingBytes(bytes):
    """Lower-cased byte string that carries a cursor into itself.

    The cursor starts at -1 (before the first byte).  Once the cursor has
    reached or passed the end of the data, reading or assigning the
    position raises StopIteration."""
    def __new__(cls, value):
        assert isinstance(value, bytes)
        lowered = value.lower()
        return bytes.__new__(cls, lowered)

    def __init__(self, value):
        # -1 means "before the first byte"
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        # The cursor is advanced even when the step runs off the end
        self._position += 1
        cursor = self._position
        if cursor >= len(self):
            raise StopIteration
        if cursor < 0:
            raise TypeError
        return self[cursor:cursor + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        """Step the cursor back one byte and return the byte now under it"""
        cursor = self._position
        if cursor >= len(self):
            raise StopIteration
        if cursor < 0:
            raise TypeError
        cursor -= 1
        self._position = cursor
        return self[cursor:cursor + 1]

    def setPosition(self, position):
        # The check is against the *current* cursor, not the new value
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        cursor = self._position
        if cursor >= len(self):
            raise StopIteration
        return cursor if cursor >= 0 else None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        start = self.position
        stop = self.position + 1
        return self[start:stop]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Advance over bytes contained in chars; return the first byte that
        is not, or None if the end of the data is reached"""
        index = self.position  # read via the property for its error-checking
        while index < len(self):
            current = self[index:index + 1]
            if current not in chars:
                self._position = index
                return current
            index += 1
        self._position = index
        return None

    def skipUntil(self, chars):
        """Advance to the first byte contained in chars and return it, or
        None if the end of the data is reached"""
        index = self.position
        while index < len(self):
            current = self[index:index + 1]
            if current in chars:
                self._position = index
                return current
            index += 1
        self._position = index
        return None

    def matchBytes(self, bytes):
        """Test whether the data at the cursor begins with the given bytes.
        On a match move the cursor to the byte after the match and return
        True; otherwise return False and leave the cursor where it was"""
        start = self.position
        window = self[start:start + len(bytes)]
        matched = window.startswith(bytes)
        if matched:
            self.position = self.position + len(bytes)
        return matched

    def jumpTo(self, bytes):
        """Search forward from the cursor for the given bytes.  If found,
        move the cursor onto the last byte of the match and return True;
        otherwise raise StopIteration"""
        offset = self[self.position:].find(bytes)
        if offset <= -1:
            raise StopIteration
        # A cursor still at -1 searched from index 0, so normalise it first
        if self._position == -1:
            self._position = 0
        self._position += (offset + len(bytes) - 1)
        return True
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection

        data must be a bytes object; it is wrapped in an EncodingBytes,
        which lower-cases it and tracks the scan position.
        """
        self.data = EncodingBytes(data)
        # Set by handleMeta once an encoding declaration has been accepted
        self.encoding = None

    def getEncoding(self):
        """Scan the data for an encoding declaration.

        Returns the value stored in self.encoding by handleMeta, or None
        if the scan finishes without one.
        """
        # Order matters: the first matching prefix wins, so the more
        # specific markers ("<!--", "<meta", "</", "<!", "<?") must be
        # tried before the bare "<".
        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        # Each pass of the loop advances the data cursor by one byte
        for byte in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        # A handler ran off the end of the data: stop scanning
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        """Examine the attributes of a <meta element for an encoding.

        Returns False (stop scanning) once self.encoding has been set, and
        True (keep scanning) otherwise.
        """
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        # Encoding taken from a content attribute seen before the
        # http-equiv="content-type" pragma; only used once the pragma appears
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = codecName(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        """Handle "<" as a possible start tag; see handlePossibleTag"""
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        """Handle "</" as a possible end tag; see handlePossibleTag"""
        # Advance one byte before inspecting the tag name
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        """Skip over a start or end tag together with its attributes.

        endTag - True when called for "</", False when called for "<".
        Always returns True so that scanning continues.
        """
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        """Skip forward to the next ">"; jumpTo raises StopIteration if
        there is none"""
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None

        Name and value are bytes with ASCII upper-case letters lower-cased;
        the value is b"" when the attribute has none.  StopIteration from
        next(data) at the end of the data is not caught here.
        """
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            # Not a name=value attribute: step back so the byte is reprocessed
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
84846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang else: 84946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang attrValue.append(c) 85046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 85146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 85246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangclass ContentAttrParser(object): 85346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang def __init__(self, data): 85446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang assert isinstance(data, bytes) 85546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang self.data = data 85646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 85746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang def parse(self): 85846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang try: 85946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # Check if the attr name is charset 86046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # otherwise return 86146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang self.data.jumpTo(b"charset") 86246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang self.data.position += 1 86346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang self.data.skip() 86446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang if not self.data.currentByte == b"=": 86546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # If there is no = sign keep looking for attrs 86646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang return None 86746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang self.data.position += 1 86846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang self.data.skip() 86946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # Look for an encoding between matching quote marks 87046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang if self.data.currentByte in (b'"', b"'"): 87146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang quoteMark = self.data.currentByte 87246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang self.data.position += 1 87346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang oldPosition = self.data.position 
87446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang if self.data.jumpTo(quoteMark): 87546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang return self.data[oldPosition:self.data.position] 87646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang else: 87746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang return None 87846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang else: 87946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # Unquoted value 88046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang oldPosition = self.data.position 88146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang try: 88246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang self.data.skipUntil(spaceCharactersBytes) 88346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang return self.data[oldPosition:self.data.position] 88446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang except StopIteration: 88546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang # Return the whole remaining value 88646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang return self.data[oldPosition:] 88746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang except StopIteration: 88846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang return None 88946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 89046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang 89146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wangdef codecName(encoding): 89246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang """Return the python codec name corresponding to an encoding or None if the 89346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang string doesn't correspond to a valid encoding.""" 89446b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang if isinstance(encoding, bytes): 89546b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang try: 89646b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang encoding = encoding.decode("ascii") 89746b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang except UnicodeDecodeError: 89846b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang return None 
89946b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang if encoding: 90046b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang canonicalName = ascii_punctuation_re.sub("", encoding).lower() 90146b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang return encodings.get(canonicalName, None) 90246b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang else: 90346b43bff003ceda46cf9a5d40a47f7674996d2e0Zhen Wang return None 904