""" Python 'utf-8-sig' Codec

This works similarly to UTF-8, with the following changes:

* On encoding/writing, a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading, if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
import codecs

### Codec APIs

def encode(input, errors='strict'):
    """Encode *input* as UTF-8 with a leading BOM.

    Returns a ``(data, length)`` tuple where *data* is ``codecs.BOM_UTF8``
    followed by the UTF-8 encoding of *input*, and *length* is
    ``len(input)``.
    """
    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
            len(input))


def decode(input, errors='strict'):
    """Decode UTF-8 *input*, skipping a leading BOM if one is present.

    Returns an ``(output, consumed)`` tuple.  *consumed* counts the three
    BOM bytes when they were skipped.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        input = input[3:]
        prefix = 3
    # Stateless decoding always treats the data as complete (final=True).
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed + prefix)


class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that emits the BOM before the first chunk only."""

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # Truthy until the BOM has been written.
        self.first = 1

    def encode(self, input, final=False):
        """Encode *input*; the first call after construction/reset adds the BOM."""
        if self.first:
            self.first = 0
            return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
        else:
            return codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Reset the encoder so the next chunk is prefixed with a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the "BOM still pending" flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        self.first = state


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that skips a BOM at the very start of the data."""

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # True while the start of the data has not been examined for a BOM;
        # set to None once that decision has been made.
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, returning ``(output, consumed)``.

        While the BOM decision is pending and fewer than three bytes are
        available, nothing is consumed as long as the bytes could still be
        the beginning of a BOM.
        """
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                else:
                    self.first = None
            else:
                self.first = None
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so a leading BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_input, flag)``.

        *flag* is 1 while the BOM decision is still pending and 0 afterwards,
        so that a snapshot taken mid-stream does not lose that information.
        """
        buffered = codecs.BufferedIncrementalDecoder.getstate(self)[0]
        return (buffered, 1 if self.first else 0)

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        # The base class restores the buffered bytes from state[0].
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = True if state[1] else None


class StreamWriter(codecs.StreamWriter):
    """Stream writer that prepends the BOM to the first encoded chunk."""

    def reset(self):
        """Reset the writer so the next encode() call emits a BOM again."""
        codecs.StreamWriter.reset(self)
        # Drop the instance-level override installed by encode(), if any, so
        # that the class-level encode() below becomes visible again.
        try:
            del self.encode
        except AttributeError:
            pass

    def encode(self, input, errors='strict'):
        """Encode the first chunk with a BOM, then switch to plain UTF-8."""
        # Shadow this method on the instance: later calls go straight to the
        # plain UTF-8 encoder and therefore add no further BOM.
        self.encode = codecs.utf_8_encode
        return encode(input, errors)


class StreamReader(codecs.StreamReader):
    """Stream reader that skips a BOM at the start of the stream."""

    def reset(self):
        """Reset the reader so a leading BOM is looked for again."""
        codecs.StreamReader.reset(self)
        # Drop the instance-level override installed by decode(), if any.
        try:
            del self.decode
        except AttributeError:
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM; returns ``(output, consumed)``."""
        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
        elif input[:3] == codecs.BOM_UTF8:
            # BOM found: skip it and use plain UTF-8 decoding from now on.
            self.decode = codecs.utf_8_decode
            (output, consumed) = codecs.utf_8_decode(input[3:], errors)
            return (output, consumed + 3)
        # (else) no BOM present
        self.decode = codecs.utf_8_decode
        return codecs.utf_8_decode(input, errors)


### encodings module API

def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing this codec."""
    return codecs.CodecInfo(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )