""" Python 'utf-8-sig' Codec

This works similarly to UTF-8, with the following changes:

* On encoding/writing, a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading, if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
103257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDanielimport codecs
113257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
### Codec APIs
133257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
def encode(input, errors='strict'):
    """Stateless 'utf-8-sig' encoder.

    Encodes *input* as UTF-8 using the *errors* handler and puts the
    UTF-8 BOM in front of the result.

    Returns a ``(encoded_bytes, length_consumed)`` tuple, where the second
    item is ``len(input)``.
    """
    payload, _ = codecs.utf_8_encode(input, errors)
    return codecs.BOM_UTF8 + payload, len(input)
163257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
def decode(input, errors='strict'):
    """Stateless 'utf-8-sig' decoder.

    If *input* begins with the three-byte UTF-8 BOM, those bytes are
    dropped before the remainder is decoded as UTF-8 (always as a final
    chunk, so incomplete sequences go through the *errors* handler).

    Returns a ``(text, bytes_consumed)`` tuple; the consumed count
    includes the three BOM bytes when they were skipped.
    """
    has_bom = input[:3] == codecs.BOM_UTF8
    skipped = 3 if has_bom else 0
    body = input[3:] if has_bom else input
    text, used = codecs.utf_8_decode(body, errors, True)
    return (text, used + skipped)
243257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental 'utf-8-sig' encoder.

    The UTF-8 BOM is emitted once, in front of the output of the first
    ``encode()`` call after construction or ``reset()``.  ``self.first``
    is 1 while the BOM is still pending and 0 afterwards; it is also the
    value exchanged through ``getstate()``/``setstate()``.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = 1

    def encode(self, input, final=False):
        """Encode *input* as UTF-8, prefixing the BOM on the first call."""
        bom_pending = self.first
        if bom_pending:
            # Clear the flag before encoding, so it stays cleared even if
            # the encode call below raises.
            self.first = 0
        payload = codecs.utf_8_encode(input, self.errors)[0]
        return codecs.BOM_UTF8 + payload if bom_pending else payload

    def reset(self):
        """Return to the initial state: the next encode() emits a BOM."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore the BOM-pending flag from *state*."""
        self.first = state
463257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental 'utf-8-sig' decoder that skips one leading UTF-8 BOM.

    ``self.first`` is true until the decoder has seen enough input to
    decide whether the stream starts with a BOM; after that it is false
    and all input is decoded as plain UTF-8.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode buffered *input*, dropping a leading BOM if present.

        Returns ``(text, bytes_consumed)``.  While fewer than three bytes
        are available and they could still be the start of a BOM, nothing
        is consumed so the decision is retried with more data.
        """
        if self.first:
            if len(input) < 3:
                # Wait for more data only if more data can still arrive.
                # A non-empty BOM prefix on the final call is truncated
                # input and must go through the error handler instead of
                # being silently left in the buffer.  Empty input never
                # settles the BOM decision.
                if (codecs.BOM_UTF8.startswith(input)
                        and not (final and input)):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                self.first = False
            else:
                self.first = False
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Return to the initial state: a leading BOM is skipped again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_bytes, flag)``; flag is 1 while the BOM
        decision is still pending, else 0."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # The base class only tracks the byte buffer; carry the BOM flag
        # in the second slot as a plain int.
        return (state[0], 1 if self.first else 0)

    def setstate(self, state):
        """Restore the byte buffer and the BOM-pending flag from *state*."""
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = bool(state[1])
713257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
class StreamWriter(codecs.StreamWriter):
    """Stream writer that emits a UTF-8 BOM with the first encode() call.

    The first call to ``encode()`` installs ``codecs.utf_8_encode`` as an
    instance attribute that shadows this method, so later calls encode
    plain UTF-8 without a BOM.  ``reset()`` removes that instance
    attribute, re-arming the BOM.
    """

    def reset(self):
        """Reset the writer so the next encode() call emits a BOM again."""
        codecs.StreamWriter.reset(self)
        # Drop the per-instance override if one was installed; if none
        # exists there is nothing to undo.
        vars(self).pop("encode", None)

    def encode(self, input, errors='strict'):
        """Encode *input* with a leading BOM; only reached on first use."""
        # From now on this instance encodes without a BOM.
        self.encode = codecs.utf_8_encode
        return encode(input, errors)
833257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a UTF-8 BOM at the start of the input.

    Once ``decode()`` has decided whether a BOM is present, it installs
    ``codecs.utf_8_decode`` as an instance attribute that shadows this
    method, so later calls decode plain UTF-8.  ``reset()`` removes that
    instance attribute, re-enabling BOM detection.
    """

    def reset(self):
        """Reset the reader so the next decode() call looks for a BOM."""
        codecs.StreamReader.reset(self)
        # Drop the per-instance override if one was installed; if none
        # exists there is nothing to undo.
        vars(self).pop("decode", None)

    def decode(self, input, errors='strict'):
        """Decode *input*, dropping a leading BOM if one is present.

        Returns ``(text, bytes_consumed)``.  Only reached until the BOM
        decision has been made for this instance.
        """
        bom = codecs.BOM_UTF8
        too_short = len(input) < 3
        if too_short and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return (u"", 0)

        has_bom = (not too_short) and input[:3] == bom
        # Decision made: every later call goes straight to plain UTF-8.
        self.decode = codecs.utf_8_decode
        if not has_bom:
            return codecs.utf_8_decode(input, errors)
        (text, used) = codecs.utf_8_decode(input[3:], errors)
        return (text, used + 3)
1053257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
### encodings module API
1073257aa99321d745773a6bd1bd4ce7f6fafe74411Daryl McDaniel
def getregentry():
    """Return the ``codecs.CodecInfo`` record for the 'utf-8-sig' codec.

    The record bundles this module's stateless encode/decode functions
    with its incremental and stream classes.
    """
    entry = {
        'name': 'utf-8-sig',
        'encode': encode,
        'decode': decode,
        'incrementalencoder': IncrementalEncoder,
        'incrementaldecoder': IncrementalDecoder,
        'streamreader': StreamReader,
        'streamwriter': StreamWriter,
    }
    return codecs.CodecInfo(**entry)
118