""" Python 'utf-8-sig' Codec

This works similarly to UTF-8 with the following changes:

* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
import codecs

### Codec APIs

def encode(input, errors='strict'):
    """Encode *input* as UTF-8 with a leading UTF-8 BOM.

    Returns a ``(encoded_bytes, consumed)`` tuple where ``consumed`` is
    ``len(input)``; the three BOM bytes are not counted.
    """
    encoded = codecs.utf_8_encode(input, errors)[0]
    return (codecs.BOM_UTF8 + encoded, len(input))


def decode(input, errors='strict'):
    """Decode UTF-8 bytes, skipping a single leading UTF-8 BOM if present.

    Returns a ``(text, consumed)`` tuple.  When a BOM was skipped its three
    bytes are included in ``consumed``.  The data is decoded with
    ``final=True``, i.e. as one complete chunk.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        input = input[3:]
        prefix = 3
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed + prefix)


class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that emits the BOM in front of the first chunk."""

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # 1 until the first encode() call has written the BOM, then 0.
        # Kept as an int because it is also the value returned by getstate().
        self.first = 1

    def encode(self, input, final=False):
        """Encode *input*; only the first call after init/reset adds the BOM.

        *final* is accepted for interface compatibility and is not used.
        """
        encoded = codecs.utf_8_encode(input, self.errors)[0]
        if self.first:
            self.first = 0
            return codecs.BOM_UTF8 + encoded
        return encoded

    def reset(self):
        """Reset the encoder so that the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag (1 = BOM not yet written)."""
        return self.first

    def setstate(self, state):
        """Restore a flag previously returned by getstate()."""
        self.first = state


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that skips one BOM at the start of the input."""

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # True while it is still undecided whether the input starts
        # with a BOM.
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, dropping a leading BOM while ``self.first`` is set.

        Returns a ``(text, consumed)`` tuple.
        """
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                else:
                    # The bytes seen so far cannot be the start of a BOM.
                    self.first = False
            else:
                self.first = False
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    # Count the skipped BOM bytes as consumed.
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so that a leading BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_bytes, flag)``; flag is 1 while a BOM may follow.

        The base class result alone does not record ``self.first``, so the
        flag is stored in the integer slot of the state tuple.
        """
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        return (state[0], int(bool(self.first)))

    def setstate(self, state):
        """Restore a state tuple previously returned by getstate()."""
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = bool(state[1])


class StreamWriter(codecs.StreamWriter):
    """Stream writer that writes the BOM in front of the first encoded data."""

    def reset(self):
        """Reset the writer so that the next encode() call emits a BOM."""
        codecs.StreamWriter.reset(self)
        # Remove the instance-level override installed by encode(), if any,
        # so the class-level encode() (which adds the BOM) is used again.
        try:
            del self.encode
        except AttributeError:
            pass

    def encode(self, input, errors='strict'):
        """Encode with a BOM, then switch this instance to plain UTF-8."""
        # Shadow this method on the instance: later calls skip the BOM.
        self.encode = codecs.utf_8_encode
        return encode(input, errors)


class StreamReader(codecs.StreamReader):
    """Stream reader that skips one BOM at the start of the stream."""

    def reset(self):
        """Reset the reader so that a leading BOM is looked for again."""
        codecs.StreamReader.reset(self)
        # Remove the instance-level override installed by decode(), if any,
        # so the class-level decode() (which checks for a BOM) is used again.
        try:
            del self.decode
        except AttributeError:
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM.

        Once it is known whether a BOM is present, this instance is switched
        to ``codecs.utf_8_decode`` for all later calls.  Returns a
        ``(text, consumed)`` tuple.
        """
        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
        elif input[:3] == codecs.BOM_UTF8:
            self.decode = codecs.utf_8_decode
            (output, consumed) = codecs.utf_8_decode(input[3:], errors)
            # Count the skipped BOM bytes as consumed.
            return (output, consumed + 3)
        # (else) no BOM present
        self.decode = codecs.utf_8_decode
        return codecs.utf_8_decode(input, errors)


### encodings module API

def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing this codec."""
    return codecs.CodecInfo(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )