""" Python 'utf-8-sig' Codec
This works similarly to UTF-8, with the following changes:

* On encoding/writing, a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading, if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
import codecs

### Codec APIs

def encode(input, errors='strict'):
    """Encode *input* as UTF-8 with a leading UTF-8 encoded BOM.

    input  -- the text to encode.
    errors -- error handling scheme passed on to codecs.utf_8_encode().

    Returns a tuple ``(data, length)`` where *data* is ``codecs.BOM_UTF8``
    followed by the UTF-8 encoding of *input* and *length* is ``len(input)``.
    """
    encoded = codecs.utf_8_encode(input, errors)[0]
    return (codecs.BOM_UTF8 + encoded, len(input))

def decode(input, errors='strict'):
    """Decode UTF-8 *input*, skipping one leading UTF-8 encoded BOM if present.

    input  -- the bytes to decode.
    errors -- error handling scheme passed on to codecs.utf_8_decode().

    The data is decoded as a complete unit (``final`` is True).

    Returns a tuple ``(text, consumed)``; *consumed* counts the three BOM
    bytes when they were skipped.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        # Only the first three bytes are treated as a signature.
        input = input[3:]
        prefix = 3
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed + prefix)

class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-8 encoder that emits the BOM before its first output.

    ``self.first`` is 1 while the BOM is still pending and 0 once it has
    been written; the same value is exposed via getstate()/setstate().
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = 1

    def encode(self, input, final=False):
        """Return *input* encoded as UTF-8, BOM-prefixed on the first call."""
        # Encode before clearing the flag: if encoding raises, nothing has
        # been emitted, so the BOM must stay pending for the next call.
        data = codecs.utf_8_encode(input, self.errors)[0]
        if self.first:
            self.first = 0
            return codecs.BOM_UTF8 + data
        return data

    def reset(self):
        """Reset the encoder so that the next output starts with a BOM."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the state: 1 if the BOM is still pending, else 0."""
        return self.first

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        self.first = state

class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-8 decoder that skips one leading UTF-8 encoded BOM.

    ``self.first`` is true until enough input has been seen to decide
    whether the stream starts with a BOM.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode the buffered *input*; return ``(text, bytes_consumed)``."""
        if self.first:
            if len(input) < 3:
                # Fewer than three bytes that could still grow into a BOM:
                # wait for more data.  Empty input is always deferred.  A
                # non-empty partial BOM with final=True is truncated data
                # and is handed to the UTF-8 decoder below instead of
                # being silently dropped.
                if codecs.BOM_UTF8.startswith(input) and not (final and input):
                    return (u"", 0)
                self.first = False
            else:
                self.first = False
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so that a leading BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_bytes, flag)``; flag is 1 while the BOM check
        is still pending and 0 afterwards."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # The base class only supplies the buffer; the flag is ours.
        return (state[0], 1 if self.first else 0)

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        # The base class restores the buffer from state[0].
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = bool(state[1])

class StreamWriter(codecs.StreamWriter):
    """Stream writer whose first encode() call emits the BOM.

    After the first successful call, encode() shadows itself on the instance
    with codecs.utf_8_encode; reset() removes that shadow again.
    """

    def reset(self):
        """Reset the writer so that the next encode() emits a BOM again."""
        codecs.StreamWriter.reset(self)
        try:
            # Drop the instance-level shadow installed by encode(), if any,
            # so the class-level BOM-writing method is used again.
            del self.encode
        except AttributeError:
            # No shadow installed: nothing has been encoded since the last
            # reset, so there is nothing to undo.
            pass

    def encode(self, input, errors='strict'):
        """Encode *input* with a leading BOM, then switch to plain UTF-8."""
        # Encode first: if this raises, the BOM has not been produced and
        # this method must stay in place for the next attempt.
        result = encode(input, errors)
        self.encode = codecs.utf_8_encode
        return result

class StreamReader(codecs.StreamReader):
    """Stream reader that skips one leading UTF-8 encoded BOM.

    Once decode() has decided whether a BOM is present and decoded
    successfully, it shadows itself on the instance with
    codecs.utf_8_decode; reset() removes that shadow again.
    """

    def reset(self):
        """Reset the reader so that a leading BOM is looked for again."""
        codecs.StreamReader.reset(self)
        try:
            # Drop the instance-level shadow installed by decode(), if any.
            del self.decode
        except AttributeError:
            # No shadow installed: BOM detection is still active.
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM; return ``(text, consumed)``.

        *consumed* counts the three BOM bytes when they were skipped.
        """
        if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return (u"", 0)
        skip = 3 if input[:3] == codecs.BOM_UTF8 else 0
        (output, consumed) = codecs.utf_8_decode(input[skip:], errors)
        # Switch to plain UTF-8 only after a successful decode: if the call
        # above raises, BOM detection stays active for a retry.
        self.decode = codecs.utf_8_decode
        return (output, consumed + skip)

### encodings module API

def getregentry():
    """Return the codecs.CodecInfo entry describing the 'utf-8-sig' codec.

    The entry bundles this module's stateless encode/decode functions with
    its incremental and stream codec classes.
    """
    return codecs.CodecInfo(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
118