from __future__ import absolute_import, division, unicode_literals
from six import text_type
from six.moves import http_client

import codecs
import re

from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import encodings, ReparseException
from . import utils

from io import StringIO

try:
    from io import BytesIO
except ImportError:
    BytesIO = StringIO

try:
    from io import BufferedIOBase
except ImportError:
    class BufferedIOBase(object):
        pass

# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])


invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"

if utils.supports_lone_surrogates:
    # Use one extra step of indirection and create the surrogate range
    # with eval. Spelling the lone surrogates as literals would be illegal
    # on platforms that do not support them. The range must be spliced
    # inside the closing "]" of the character class above.
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])

ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}


class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n ** 2)
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset

    def tell(self):
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)
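

# A minimal usage sketch (illustrative only, not part of the module API):
# wrapping an object that has read() but no tell()/seek() lets callers rewind
# into already-read data. "UnseekableFile" is a hypothetical stand-in:
#
#   class UnseekableFile(object):
#       def __init__(self, data):
#           self._data, self._pos = data, 0
#       def read(self, n):
#           chunk = self._data[self._pos:self._pos + n]
#           self._pos += len(chunk)
#           return chunk
#
#   stream = BufferedStream(UnseekableFile(b"<!doctype html>"))
#   first = stream.read(4)        # b"<!do", kept as a buffered chunk
#   stream.seek(0)                # rewinds within the buffer
#   assert stream.read(4) == first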


def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    if isinstance(source, http_client.HTTPResponse):
        # Work around Python bug #20007: read(0) closes the connection.
        # http://bugs.python.org/issue20007
        isUnicode = False
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)

    if isUnicode:
        if encoding is not None:
            raise TypeError("Cannot explicitly set an encoding with a unicode string")

        return HTMLUnicodeInputStream(source)
    else:
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
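

# Sketch of the dispatch above (inputs illustrative): text in, unicode stream
# out; bytes in, binary stream out. An explicit encoding is only legal for
# bytes, where the stream still has to be decoded.
#
#   HTMLInputStream("<p>spam")                     # -> HTMLUnicodeInputStream
#   HTMLInputStream(b"<p>spam")                    # -> HTMLBinaryInputStream
#   HTMLInputStream(b"<p>spam", encoding="utf-8")  # forces that encoding
#   HTMLInputStream("<p>spam", encoding="utf-8")   # TypeError: already unicode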


class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLUnicodeInputStream.

        HTMLUnicodeInputStream(source) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object or a unicode string. Encoding
        detection is handled by HTMLBinaryInputStream; this stream assumes
        its input is already decoded text.

        """

        if not utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
            self.replaceCharactersRegexp = None
        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
            self.replaceCharactersRegexp = re.compile(
                eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = ("utf-8", "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)
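
    # Worked example of the position bookkeeping above (illustrative only):
    #
    #   stream = HTMLUnicodeInputStream("ab\ncd")
    #   for _ in range(4):
    #       stream.char()     # consumes "a", "b", "\n", "c"
    #   stream.position()     # (2, 1): second line, one column consumed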

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

            # Replace invalid characters
            # Note U+0000 is dealt with in the tokenizer
            data = self.replaceCharactersRegexp.sub("\ufffd", data)

        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        for _ in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                # Skip the low surrogate of the pair handled on the previous
                # iteration, then resume normal processing.
                skip = False
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")
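
    # Chunk-boundary behaviour of readChunk, sketched (illustrative only): a
    # trailing CR or lead surrogate is held back in _bufferedCharacter so that
    # "\r\n" pairs and surrogate pairs split across reads are rejoined:
    #
    #   stream = HTMLUnicodeInputStream("a\r\nb")
    #   stream.readChunk(2)   # True; stream.chunk == "a" (the "\r" is held)
    #   stream.readChunk(2)   # True; stream.chunk == "\nb" (CR LF -> LF)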

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert ord(c) < 128
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
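

# Short sketch of the character-level API defined above (illustrative only):
#
#   stream = HTMLUnicodeInputStream("<title>Hi</title>")
#   stream.char()           # "<"
#   stream.charsUntil(">")  # "title" - stops before the first ">"
#   stream.char()           # ">"
#   stream.unget(">")       # push the last character back ...
#   stream.char()           # ... and consume it again
#   stream.position()       # (line, column), maintained across chunks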


class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element).

        parseMeta - Look for a <meta> element containing encoding information

        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        self.charEncoding = (codecName(encoding), "certain")

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to use when detecting encoding with chardet
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding only if no explicit "transport level" encoding is
        # supplied
        if self.charEncoding[0] is None:
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # Call superclass
        self.reset()

    def reset(self):
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except Exception:
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            # Record the old name before switching, then reset so the new
            # decoder is used for the reparse.
            oldEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise ReparseException("Encoding changed from %s to %s" %
                                   (oldEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect a BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])  # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)  # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(encoding and seek or 0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding
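

# Sketch of the resulting detection order (BOM, then <meta> pre-parse, then
# chardet if installed, then the windows-1252 default); inputs illustrative:
#
#   HTMLBinaryInputStream(codecs.BOM_UTF8 + b"<p>x").charEncoding
#       # ("utf-8", "certain")    - a BOM is authoritative
#   HTMLBinaryInputStream(b'<meta charset="utf-8"><p>x').charEncoding
#       # ("utf-8", "tentative")  - declared in a meta element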


class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods.
    If the position is ever greater than the string length then an exception is
    raised"""
    def __new__(cls, value):
        assert isinstance(value, bytes)
        return bytes.__new__(cls, value.lower())

    def __init__(self, value):
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        p = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        data = self[p:p + len(bytes)]
        rv = data.startswith(bytes)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        if newPosition > -1:
            # XXX: This is ugly, but I can't see a nicer way to fix this.
            if self._position == -1:
                self._position = 0
            self._position += (newPosition + len(bytes) - 1)
            return True
        else:
            raise StopIteration
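

# Illustrative walk-through of the byte cursor (values assume this input;
# note that EncodingBytes lower-cases on construction):
#
#   eb = EncodingBytes(b"<META charset=utf-8>")
#   next(eb)                # b"<" - the position starts before the string
#   next(eb)                # b"m"
#   eb.matchBytes(b"meta")  # True; position advances past the match
#   eb.skip()               # skips the space, returns b"c"
#   eb.jumpTo(b"=")         # True; position lands on the b"=" byte
#   eb.currentByte          # b"="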


class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for byte in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # if <meta is not followed by a space, just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = codecName(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)


class ContentAttrParser(object):
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            return None


def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None
    if encoding:
        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
        return encodings.get(canonicalName, None)
    else:
        return None
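

# End-to-end sketch of the meta pre-parser and codec lookup (illustrative):
#
#   EncodingParser(b'<meta charset="UTF-8">').getEncoding()
#       # "utf-8"
#   ContentAttrParser(EncodingBytes(b"text/html; charset=utf-8")).parse()
#       # b"utf-8"
#   codecName(b" UTF_8 ")
#       # "utf-8" - ASCII punctuation is stripped and the name lower-cased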