""" Python 'utf-8-sig' Codec
This works similarly to UTF-8 with the following changes:

* On encoding/writing, a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading, if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
10ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport codecs
11ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
12ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh### Codec APIs
13ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 and put a UTF-8 BOM in front of the result.

    Returns a ``(data, length)`` tuple: the BOM followed by the UTF-8
    encoding of *input*, and ``len(input)`` as the consumed length.
    *errors* is handed unchanged to ``codecs.utf_8_encode``.
    """
    payload = codecs.utf_8_encode(input, errors)[0]
    signed = codecs.BOM_UTF8 + payload
    return (signed, len(input))
16ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
def decode(input, errors='strict'):
    """Decode UTF-8 *input*, dropping one leading UTF-8 BOM if present.

    Returns a ``(text, consumed)`` tuple.  When a BOM was skipped, its
    three bytes are included in ``consumed``.  The data is always decoded
    as final (third argument to ``codecs.utf_8_decode`` is True).
    """
    has_bom = input[:3] == codecs.BOM_UTF8
    skipped = 3 if has_bom else 0
    body = input[3:] if has_bom else input
    text, used = codecs.utf_8_decode(body, errors, True)
    return (text, used + skipped)
24ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-8 encoder that emits a BOM before the first chunk.

    ``first`` is truthy until the first call to ``encode``; that call
    prefixes its output with ``codecs.BOM_UTF8``.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = 1

    def encode(self, input, final=False):
        """Return the UTF-8 bytes for *input*, BOM-prefixed on first use."""
        lead = b""
        if self.first:
            # Clear the flag before encoding, so it stays cleared even if
            # the encode call below raises.
            self.first = 0
            lead = codecs.BOM_UTF8
        return lead + codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Reset the encoder so that the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the "BOM still pending" flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore the "BOM still pending" flag from *state*."""
        self.first = state
46ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-8 decoder that skips one leading UTF-8 BOM.

    ``first`` is True until enough bytes have been seen to decide whether
    the data starts with a BOM.  The flag is part of the decoder state
    returned by ``getstate`` and restored by ``setstate``, so saving and
    restoring the state does not lose the BOM decision.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # True while the start of the data has not been checked for a BOM.
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, returning ``(text, consumed)``.

        While the BOM decision is pending and *input* is a proper prefix
        of the BOM, nothing is consumed so the caller buffers the bytes
        and retries with more data.
        """
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                else:
                    self.first = False
            else:
                self.first = False
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    # account for the three skipped BOM bytes
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so that a leading BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_bytes, flag)``; flag is 1 while the BOM
        check is still pending and 0 afterwards."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # The base class only reports the buffer; add our own flag as an
        # integer, as the decoder state protocol expects.
        return (state[0], 1 if self.first else 0)

    def setstate(self, state):
        """Restore a state previously returned by ``getstate``."""
        # The base class restores the buffer and ignores state[1].
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = bool(state[1])
71ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class StreamWriter(codecs.StreamWriter):
    """Stream writer whose first encode call emits a UTF-8 BOM.

    The first call to ``encode`` installs ``codecs.utf_8_encode`` as an
    instance attribute, so later calls bypass this method and write plain
    UTF-8.  ``reset`` removes that attribute again.
    """

    def encode(self, input, errors='strict'):
        """Encode the first chunk with the BOM-prepending module encoder."""
        # Shadow this method on the instance before encoding, so every
        # subsequent call goes straight to the plain UTF-8 encoder.
        self.encode = codecs.utf_8_encode
        return encode(input, errors)

    def reset(self):
        """Reset the writer and re-arm the BOM for the next write."""
        codecs.StreamWriter.reset(self)
        # Drop the instance-level override if one was installed; it is
        # fine if no write has happened yet.
        self.__dict__.pop('encode', None)
83ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a UTF-8 BOM at the start of the data.

    Once it is known whether the data starts with a BOM, ``decode`` is
    replaced on the instance by ``codecs.utf_8_decode``.  ``reset``
    removes that replacement so the BOM check runs again.
    """

    def decode(self, input, errors='strict'):
        """Decode *input*, returning ``(text, consumed)``.

        A proper prefix of the BOM consumes nothing, so the caller can
        retry once more bytes are available.
        """
        bom = codecs.BOM_UTF8
        if len(input) < 3 and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return (u"", 0)
        # The BOM question is settled from here on: later calls go
        # directly to the plain UTF-8 decoder.
        self.decode = codecs.utf_8_decode
        if input[:3] == bom:
            text, used = codecs.utf_8_decode(input[3:], errors)
            return (text, used + 3)
        # no BOM present
        return codecs.utf_8_decode(input, errors)

    def reset(self):
        """Reset the reader and re-enable BOM detection."""
        codecs.StreamReader.reset(self)
        # Remove the instance-level override if one was installed.
        self.__dict__.pop('decode', None)
105ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
106ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh### encodings module API
107ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing 'utf-8-sig'.

    The entry bundles this module's stateless functions with its
    incremental and stream classes.
    """
    entry = codecs.CodecInfo(
        name='utf-8-sig',
        # stateless API
        encode=encode,
        decode=decode,
        # stream API
        streamwriter=StreamWriter,
        streamreader=StreamReader,
        # incremental API
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
    )
    return entry
118