""" Python 'utf-8-sig' Codec
This works similarly to UTF-8, with the following changes:

* On encoding/writing, a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading, if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
1083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsiehimport codecs
1183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
1283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### Codec APIs
1383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 with a leading BOM.

    Returns a ``(data, length)`` pair: the BOM followed by the UTF-8 encoded
    form of *input*, and ``len(input)`` as the amount of input consumed.
    """
    payload, _ = codecs.utf_8_encode(input, errors)
    return (codecs.BOM_UTF8 + payload, len(input))
1683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def decode(input, errors='strict'):
    """Decode UTF-8 *input*, dropping one leading BOM if it is present.

    The whole input is treated as final. Returns ``(text, consumed)``, where
    *consumed* counts the three BOM bytes whenever they were skipped.
    """
    if input[:3] != codecs.BOM_UTF8:
        # No signature: behave exactly like plain UTF-8.
        return codecs.utf_8_decode(input, errors, True)
    text, used = codecs.utf_8_decode(input[3:], errors, True)
    return (text, used + 3)
2483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-8 encoder that emits a BOM ahead of the first chunk.

    ``first`` is 1 while the BOM is still pending and 0 once it has been
    written; the same value is used as the encoder state.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # The BOM has not been written yet.
        self.first = 1

    def encode(self, input, final=False):
        """Encode *input*, prefixing the BOM on the first call only.

        *final* is accepted for interface compatibility and is not consulted.
        """
        if not self.first:
            return codecs.utf_8_encode(input, self.errors)[0]
        # Clear the flag before encoding so it stays cleared even if the
        # encode call below raises.
        self.first = 0
        return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Return to the initial state: the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore the BOM-pending flag from *state*."""
        self.first = state
4683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-8 decoder that skips one leading BOM, if present.

    ``first`` is true until enough input has been seen to decide whether the
    stream starts with a BOM. It is part of the decoder state, so it is
    reported by ``getstate()`` and restored by ``setstate()``.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # Still waiting to find out whether the stream starts with a BOM.
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input* (buffered leftovers plus new data).

        Returns ``(text, consumed)``. While the BOM decision is pending and
        *input* is a proper prefix of the BOM, nothing is consumed so the
        bytes are offered again on the next call.
        """
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    # NOTE(review): `final` is not consulted here, so a
                    # truncated BOM at end of input is left in the buffer
                    # rather than reported; confirm this is intended.
                    return (u"", 0)
                else:
                    self.first = False
            else:
                self.first = False
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
                    # account for the three skipped BOM bytes
                    return (output, consumed+3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Drop buffered input and look for a BOM again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_bytes, flag)``; flag is 1 while the BOM check is pending."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # Only the buffer is taken from the base class; the second slot
        # carries our own flag, as an int as the codecs state contract requires.
        return (state[0], int(bool(self.first)))

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``."""
        # The base class restores the byte buffer from state[0].
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = bool(state[1])
7183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class StreamWriter(codecs.StreamWriter):
    """Stream writer that puts a BOM in front of the first data written.

    The first call to ``encode`` shadows itself with the plain UTF-8 encoder
    on the instance, so later writes carry no BOM; ``reset`` removes that
    shadow again.
    """

    def reset(self):
        codecs.StreamWriter.reset(self)
        # Drop the per-instance override (if any) so the class-level,
        # BOM-emitting encode() is used for the next write.
        self.__dict__.pop('encode', None)

    def encode(self, input, errors='strict'):
        # From now on this instance encodes as plain UTF-8.
        self.encode = codecs.utf_8_encode
        # This one call goes through the module-level encode(), which adds the BOM.
        return encode(input, errors)
8383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a BOM at the start of the stream.

    Once the BOM question is settled, ``decode`` shadows itself with the
    plain UTF-8 decoder on the instance; ``reset`` removes that shadow so
    the BOM check happens again.
    """

    def reset(self):
        codecs.StreamReader.reset(self)
        # Remove the per-instance override (if any) to re-enable BOM detection.
        self.__dict__.pop('decode', None)

    def decode(self, input, errors='strict'):
        bom = codecs.BOM_UTF8
        if len(input) < 3 and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return (u"", 0)
        # Decision can be made now: every later call is plain UTF-8.
        self.decode = codecs.utf_8_decode
        if input[:3] == bom:
            text, used = codecs.utf_8_decode(input[3:], errors)
            # count the skipped BOM bytes as consumed
            return (text, used + 3)
        # no BOM present
        return codecs.utf_8_decode(input, errors)
10583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
10683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### encodings module API
10783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def getregentry():
    """Return the ``codecs.CodecInfo`` record describing the 'utf-8-sig' codec."""
    entry = dict(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
    return codecs.CodecInfo(**entry)
118