1""" Python 'utf-8-sig' Codec
This codec works like UTF-8, with the following changes:
3
4* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
5  first three bytes.
6
7* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
8  bytes will be skipped.
9"""
10import codecs
11
12### Codec APIs
13
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 with a leading UTF-8 BOM.

    Returns a ``(data, length)`` tuple: the BOM followed by the UTF-8
    encoding of *input*, and the length of *input* itself (the BOM is
    not counted).  *errors* is passed through to the UTF-8 encoder.
    """
    encoded, _ = codecs.utf_8_encode(input, errors)
    return (codecs.BOM_UTF8 + encoded, len(input))
16
def decode(input, errors='strict'):
    """Decode complete UTF-8 *input*, skipping one leading UTF-8 BOM.

    Returns a ``(text, consumed)`` tuple.  When the first three bytes are
    the BOM they are dropped from the text but still counted in
    ``consumed``.  The input is always decoded as final data, and *errors*
    is passed through to the UTF-8 decoder.
    """
    # 3 when the input starts with the BOM, otherwise 0.
    bom_len = 3 if input[:3] == codecs.BOM_UTF8 else 0
    if bom_len:
        input = input[bom_len:]
    text, used = codecs.utf_8_decode(input, errors, True)
    return (text, used + bom_len)
24
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental 'utf-8-sig' encoder.

    The first chunk produced after construction (or after ``reset()``) is
    prefixed with the UTF-8 BOM; later chunks are plain UTF-8.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # Truthy while the BOM has not been emitted yet.
        self.first = 1

    def encode(self, input, final=False):
        """Return the UTF-8 bytes for *input*, BOM-prefixed on the first call.

        *final* is accepted for interface compatibility and is not used.
        """
        if not self.first:
            return codecs.utf_8_encode(input, self.errors)[0]
        # The flag is cleared before encoding, so it stays cleared even if
        # the encoder raises on this chunk.
        self.first = 0
        return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Return to the initial state: the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore the BOM-pending flag from a value given by getstate()."""
        self.first = state
46
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental 'utf-8-sig' decoder.

    A UTF-8 BOM at the very start of the input is skipped; everything else
    is decoded as plain UTF-8.  ``first`` is 1 until enough input has been
    seen to decide whether the data starts with a BOM, and 0 afterwards.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = 1

    def _buffer_decode(self, input, errors, final):
        """Decode *input* (buffered leftovers plus new data).

        Returns ``(text, consumed)``; unconsumed bytes are kept by the base
        class and handed back on the next call.
        """
        if self.first:
            pending = len(input)
            if pending < 3:
                undecided = codecs.BOM_UTF8.startswith(input)
                # Hold the bytes back only while more data may still arrive.
                # On the final call a partial BOM is a truncated sequence and
                # must go through the normal UTF-8 error handling instead of
                # being dropped silently.  Empty input carries no
                # information, so the decision stays open.
                if undecided and (pending == 0 or not final):
                    return (u"", 0)
                self.first = 0
            else:
                self.first = 0
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    # Report the skipped BOM bytes as consumed too.
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Return to the initial state: a leading BOM is skipped again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return ``(buffered_bytes, first)`` so the BOM flag is preserved."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # The base class contributes only the buffer; the integer slot
        # carries the BOM-pending flag.
        return (state[0], self.first)

    def setstate(self, state):
        """Restore the buffer and BOM flag from a getstate() value."""
        # The base class restores the buffer and ignores state[1].
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = state[1]
71
class StreamWriter(codecs.StreamWriter):
    """Stream writer that emits a UTF-8 BOM before the first data written.

    The first ``encode`` call goes through the BOM-prefixing module-level
    ``encode``; it also installs the plain UTF-8 encoder as an instance
    attribute, so later calls bypass this method.
    """

    def reset(self):
        """Reset the writer so the next ``encode`` call emits a BOM again."""
        codecs.StreamWriter.reset(self)
        # Drop the per-instance override (if any) so the class-level
        # ``encode`` below is used again.
        vars(self).pop('encode', None)

    def encode(self, input, errors='strict'):
        """Encode the first chunk with a BOM and switch to plain UTF-8."""
        # Install the override first: it stays in place even if encoding
        # this chunk raises.
        self.encode = codecs.utf_8_encode
        result = encode(input, errors)
        return result
83
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a UTF-8 BOM at the start of the data.

    Once it is known whether the data starts with a BOM, the plain UTF-8
    decoder is installed as an instance attribute and later ``decode``
    calls bypass this method.
    """

    def reset(self):
        """Reset the reader so BOM detection runs again on the next decode."""
        codecs.StreamReader.reset(self)
        # Drop the per-instance override (if any) so the class-level
        # ``decode`` below is used again.
        vars(self).pop('decode', None)

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM; return (text, consumed)."""
        if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
            # Too little data to tell whether this is a BOM: consume
            # nothing and look again on the next call.
            return (u"", 0)
        # The decision can be made now, so every later call goes straight
        # to the plain UTF-8 decoder.
        self.decode = codecs.utf_8_decode
        if input[:3] == codecs.BOM_UTF8:
            text, used = codecs.utf_8_decode(input[3:], errors)
            # Report the skipped BOM bytes as consumed too.
            return (text, used + 3)
        # No BOM present.
        return codecs.utf_8_decode(input, errors)
105
106### encodings module API
107
def getregentry():
    """Return the ``codecs.CodecInfo`` record describing the 'utf-8-sig' codec.

    The record bundles this module's stateless ``encode``/``decode``
    functions with its incremental and stream encoder/decoder classes.
    """
    entry = codecs.CodecInfo(
        encode=encode,
        decode=decode,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        name='utf-8-sig',
    )
    return entry
118