""" Python 'utf-8-sig' Codec

This works like UTF-8 with the following changes:

* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
import codecs

### Codec APIs

def encode(input, errors='strict'):
    """Encode *input* as UTF-8 with a leading BOM.

    Returns a ``(encoded_bytes, length_consumed)`` tuple where
    ``length_consumed`` is ``len(input)``; the BOM is not counted because it
    does not correspond to any input character.
    """
    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
            len(input))

def decode(input, errors='strict'):
    """Decode UTF-8 *input*, skipping a leading BOM if one is present.

    Returns a ``(text, bytes_consumed)`` tuple.  ``bytes_consumed`` includes
    the three BOM bytes when they were skipped.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        input = input[3:]
        prefix = 3
    # The third argument marks the input as final: this is the stateless
    # API, so there is no later call that could complete a sequence.
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed+prefix)

class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that emits the BOM in front of the first chunk."""

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # 1 while the BOM still has to be written, 0 afterwards.
        self.first = 1

    def encode(self, input, final=False):
        """Encode one chunk; only the first chunk after a reset gets a BOM."""
        if self.first:
            self.first = 0
            return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
        else:
            return codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Return to the initial state: the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the "BOM still pending" flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        self.first = state

class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that drops a BOM at the start of the stream."""

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # 1 while the start of the stream has not been examined for a BOM,
        # 0 afterwards.  Kept as an integer so it can be used as the second
        # element of the getstate() tuple.
        self.first = 1

    def _buffer_decode(self, input, errors, final):
        """Decode the buffered bytes, returning ``(text, bytes_consumed)``."""
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                else:
                    self.first = 0
            else:
                self.first = 0
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = \
                       codecs.utf_8_decode(input[3:], errors, final)
                    # Count the skipped BOM bytes as consumed.
                    return (output, consumed+3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Return to the initial state: a leading BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return ``(buffered_bytes, first_flag)``.

        The base class only reports the buffered bytes; the BOM flag is
        added so that a later setstate() restores BOM handling as well.
        """
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        return (state[0], self.first)

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        # The base class restores the byte buffer from state[0].
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = state[1]

class StreamWriter(codecs.StreamWriter):
    """Stream writer that writes the BOM before the first encoded data."""

    def reset(self):
        """Reset the writer so that the next encode() emits a BOM again."""
        codecs.StreamWriter.reset(self)
        try:
            # Drop the instance-level shortcut installed by encode() so the
            # BOM-writing method defined on the class is used again.
            del self.encode
        except AttributeError:
            pass

    def encode(self, input, errors='strict'):
        """Encode the first chunk with a BOM.

        The instance attribute ``encode`` is rebound to the plain UTF-8
        encoder, so subsequent calls bypass this method and add no BOM.
        """
        self.encode = codecs.utf_8_encode
        return encode(input, errors)

class StreamReader(codecs.StreamReader):
    """Stream reader that skips a BOM at the start of the stream."""

    def reset(self):
        """Reset the reader so that a leading BOM is looked for again."""
        codecs.StreamReader.reset(self)
        try:
            # Drop the instance-level shortcut installed by decode() so the
            # BOM-detecting method defined on the class is used again.
            del self.decode
        except AttributeError:
            pass

    def decode(self, input, errors='strict'):
        """Decode the start of the stream, skipping a BOM if present.

        Once the presence or absence of a BOM has been decided, the instance
        attribute ``decode`` is rebound to the plain UTF-8 decoder.
        """
        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
        elif input[:3] == codecs.BOM_UTF8:
            self.decode = codecs.utf_8_decode
            (output, consumed) = codecs.utf_8_decode(input[3:],errors)
            return (output, consumed+3)
        # (else) no BOM present
        self.decode = codecs.utf_8_decode
        return codecs.utf_8_decode(input, errors)

### encodings module API

def getregentry():
    """Return the CodecInfo entry describing the 'utf-8-sig' codec."""
    return codecs.CodecInfo(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )