#
#   Cython -- encoding related tools
#
4
import re
import sys
7
# Select the text and byte-string types of the running interpreter once, so
# the rest of the module can use the same names on Python 2 and Python 3.
if sys.version_info[0] >= 3:
    _unicode, _str, _bytes = str, str, bytes
    IS_PYTHON3 = True
else:
    _unicode, _str, _bytes = unicode, str, str
    IS_PYTHON3 = False

# Empty instances of the byte-string and text types selected above.
empty_bytes = _bytes()
empty_unicode = _unicode()

# Bound ``join`` of the empty byte string: concatenates an iterable of
# byte strings without a separator.
join_bytes = empty_bytes.join
19
class UnicodeLiteralBuilder(object):
    """Assemble a unicode string.

    Fragments are collected in ``self.chars`` in the order they are
    appended and joined into one string by ``getstring()``.
    """
    def __init__(self):
        # unicode fragments appended so far
        self.chars = []

    def append(self, characters):
        """Append a piece of text; byte strings are decoded as ASCII first."""
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    # NOTE(review): ``unichr`` is a builtin on Python 2 only; presumably the
    # module is translated by 2to3 when run on Python 3 -- confirm.
    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            if char_number > 65535:
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                char_number -= 0x10000
                self.chars.append( unichr((char_number // 1024) + 0xD800) )
                self.chars.append( unichr((char_number  % 1024) + 0xDC00) )
            else:
                self.chars.append( unichr(char_number) )
    else:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            self.chars.append( unichr(char_number) )

    def append_uescape(self, char_number, escape_string):
        """Append the character of a unicode escape.

        Only ``char_number`` is used here; ``escape_string`` is ignored.
        """
        self.append_charval(char_number)

    def getstring(self):
        """Return everything appended so far as one ``EncodedString``."""
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        """Return the pair ``(None, unicode_string)``."""
        return (None, self.getstring())
55
56
class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.

    Fragments are collected as byte strings in ``self.chars``; text passed
    to ``append()`` is encoded with ``target_encoding``.
    """
    def __init__(self, target_encoding):
        # byte-string fragments appended so far
        self.chars = []
        # encoding used for unicode input and recorded on the result
        self.target_encoding = target_encoding

    def append(self, characters):
        """Append a fragment, encoding unicode input to the target encoding."""
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        """Append the single byte whose value is ``char_number``.

        ISO-8859-1 maps code points 0..255 onto the identical byte values;
        larger values cannot be encoded and raise ``UnicodeEncodeError``.
        """
        # NOTE(review): ``unichr`` is a builtin on Python 2 only; presumably
        # the module is translated by 2to3 when run on Python 3 -- confirm.
        self.chars.append( unichr(char_number).encode('ISO-8859-1') )

    def append_uescape(self, char_number, escape_string):
        """Append the escape sequence text itself; ``char_number`` is unused."""
        self.append(escape_string)

    def getstring(self):
        """Return the collected bytes as a ``BytesLiteral`` tagged with
        the target encoding."""
        # this *must* return a byte string!
        s = BytesLiteral(join_bytes(self.chars))
        s.encoding = self.target_encoding
        return s

    def getchar(self):
        """Return the same value as ``getstring()``."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return the pair ``(byte_string, None)``."""
        return (self.getstring(), None)
88
class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.

    Every operation is forwarded to an internal ``BytesLiteralBuilder``
    and an internal ``UnicodeLiteralBuilder``, bytes builder first.
    """
    def __init__(self, target_encoding):
        self._bytes   = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append ``characters`` to both representations."""
        self._bytes.append(characters)
        self._unicode.append(characters)

    def append_charval(self, char_number):
        """Append the character/byte value ``char_number`` to both
        representations."""
        self._bytes.append_charval(char_number)
        self._unicode.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape.

        The bytes representation receives the escape text, the unicode
        representation receives the character ``char_number``.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the pair ``(byte_string, unicode_string)``."""
        return (self._bytes.getstring(), self._unicode.getstring())
110
111
class EncodedString(_unicode):
    """Unicode string subclass that keeps track of the original encoding.

    ``encoding`` is None for unicode strings and the source encoding
    otherwise.
    """
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies share this very object instead of duplicating it
        return self

    def byteencode(self):
        """Encode the string with its recorded ``encoding``.

        Asserts that an encoding has been set.
        """
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        """Encode the string as UTF-8.

        Asserts that no source encoding is set, i.e. that this is a
        unicode string.
        """
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None

    def contains_surrogates(self):
        """Return the result of ``string_contains_surrogates()`` for this
        string."""
        return string_contains_surrogates(self)
135
136
def string_contains_surrogates(ustring):
    """
    Check if the unicode string contains surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform.

    Returns True as soon as a character above U+FFFF or inside the
    surrogate range U+D800..U+DFFF is found, False otherwise (also for
    the empty string).
    """
    for code in map(ord, ustring):
        # code > 0xFFFF can only happen on wide platforms; the surrogate
        # range covers lone or paired surrogates on either platform
        if code > 0xFFFF or 0xD800 <= code <= 0xDFFF:
            return True
    return False
150
151
class BytesLiteral(_bytes):
    """Bytes subclass that is compatible with ``EncodedString``."""
    encoding = None

    # a byte string is never a unicode string
    is_unicode = False

    def __deepcopy__(self, memo):
        # deep copies share this very object instead of duplicating it
        return self

    def byteencode(self):
        """Return the content as a plain byte string (not a subclass)."""
        if IS_PYTHON3:
            return _bytes(self)
        else:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')

    def utf8encode(self):
        """Always fail: a byte string cannot be UTF-8 encoded.

        Raises:
            AssertionError: unconditionally.  Raised explicitly so that
                the failure is not stripped when running with ``-O``.
        """
        raise AssertionError("this is not a unicode string: %r" % self)

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')
176
177
# Lookup function (a bound ``dict.get``) that maps the source spelling of a
# simple backslash escape, e.g. the two characters ``\n``, to the character
# it denotes.  Unknown sequences give None (or the supplied default).
char_from_escape_sequence = {
    r'\a' : u'\a',
    r'\b' : u'\b',
    r'\f' : u'\f',
    r'\n' : u'\n',
    r'\r' : u'\r',
    r'\t' : u'\t',
    r'\v' : u'\v',
    }.get

# Sequences that get special treatment when escaping strings: backslash,
# the two-character sequence '??', the double quote, and the characters
# with codes 0..31.
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
189
190
def _to_escape_sequence(s):
    """Return the escaped spelling of the special sequence ``s``.

    Newline, carriage return and tab use their symbolic escapes, the
    double quote and the backslash are prefixed with a backslash, and
    anything else is spelled as one three-digit octal escape per
    character.
    """
    if s in '\n\r\t':
        # repr() yields e.g. '\n' including quotes; strip the quotes
        return repr(s)[1:-1]
    elif s == '"':
        return r'\"'
    elif s == '\\':
        return r'\\'
    else:
        # within a character sequence, oct passes much better than hex
        return ''.join(['\\%03o' % ord(c) for c in s])
201
202
def _build_specials_replacer():
    """Build a function that escapes all special sequences in a byte string.

    For every entry of ``_c_special`` a regular expression alternative and
    its replacement (from ``_to_escape_sequence()``) are prepared; pattern
    and replacements are ASCII-encoded so that they operate on byte
    strings.

    Returns:
        A function ``replace(s)`` that returns ``s`` with every special
        sequence substituted by its escaped spelling.
    """
    subexps = []
    replacements = {}
    for special in _c_special:
        # Wrap each character in its own character class so that regex
        # metacharacters such as '?' match literally; a backslash is
        # doubled to stay a literal backslash inside the class.
        regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
        subexps.append(regexp)
        replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
    # one capturing group holding all alternatives, in _c_special order
    sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
    def replace_specials(m):
        # look up the replacement for whatever the group matched
        return replacements[m.group(1)]
    def replace(s):
        return sub(replace_specials, s)
    return replace

# module-level replacer instance, built once
_replace_specials = _build_specials_replacer()
218
219
def escape_char(c):
    """Return the escaped spelling of a single character.

    On Python 3 the argument is first decoded as ISO-8859-1, so it has to
    be a bytes object there.  Presumably the result is meant for a C
    character literal, since the single quote is escaped -- confirm
    against the callers.

    Newline, carriage return, tab and backslash use their symbolic
    escapes, the single quote is prefixed with a backslash, codes below
    32 or above 127 become a two-digit hex escape, and everything else
    is returned unchanged.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        # repr() yields the escape including quotes; strip the quotes
        return repr(c)[1:-1]
    elif c == "'":
        return "\\'"
    n = ord(c)
    if n < 32 or n > 127:
        # hex works well for characters
        return "\\x%02X" % n
    else:
        return c
233
def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.

    Special sequences are replaced first via ``_replace_specials()``.
    If the result is pure ASCII it is returned directly; otherwise every
    byte with a value of 128 or more is spelled as an octal escape.
    """
    s = _replace_specials(s)
    try:
        return s.decode("ASCII") # trial decoding: plain ASCII => done
    except UnicodeDecodeError:
        pass
    # Values >= 128 always take three octal digits (200..377), so the
    # '%3o' format below never pads with a space.
    if IS_PYTHON3:
        # iterating bytes yields integers on Python 3
        s_new = bytearray()
        append, extend = s_new.append, s_new.extend
        for b in s:
            if b >= 128:
                extend(('\\%3o' % b).encode('ASCII'))
            else:
                append(b)
        return s_new.decode('ISO-8859-1')
    else:
        # iterating a Python 2 str yields one-character strings
        l = []
        append = l.append
        for c in s:
            o = ord(c)
            if o >= 128:
                append('\\%3o' % o)
            else:
                append(c)
        return join_bytes(l).decode('ISO-8859-1')
264
def split_string_literal(s, limit=2000):
    """Split the content of a long string literal into shorter chunks.

    MSVC can't handle long string literals.  Strings shorter than
    ``limit`` are returned unchanged; longer ones are cut into chunks of
    at most ``limit`` characters that are joined with ``""``.

    A cut is moved in front of a backslash found in the last four
    characters of a chunk, so an escape sequence is not torn apart.

    NOTE(review): the boundary handling looks at the last four characters
    of each chunk; very small ``limit`` values are not handled specially.
    """
    if len(s) < limit:
        return s

    start = 0
    chunks = []
    while start < len(s):
        end = start + limit
        if len(s) > end-4 and '\\' in s[end-4:end]:
            end -= 4 - s[end-4:end].find('\\') # just before the backslash
            # step back over any run of backslashes in front of the cut
            while s[end-1] == '\\':
                end -= 1
                if end == start:
                    # must have been a long line of backslashes;
                    # cut at an even offset instead
                    end = start + limit - (limit % 2) - 4
                    break
        chunks.append(s[start:end])
        start = end
    return '""'.join(chunks)
285
def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns:
        A pair ``(utf16, utf32)`` of comma-separated decimal code values,
        each including a terminating 0.  ``utf16`` spells characters
        outside the BMP as surrogate pairs, ``utf32`` as single code
        points.  When both spellings are identical, ``utf16`` is the
        empty string.
    """
    # Build a real list: on Python 3 ``map()`` returns an iterator that
    # cannot be concatenated with a list.
    code_values = [ord(c) for c in s]
    code_values.append(0)  # terminator

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        # the input holds code points; derive the UTF-16 code units
        utf16, utf32 = [], code_values
        for code_point in code_values:
            if code_point >= 0x10000:  # outside of BMP
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        # the input holds UTF-16 code units; merge surrogate pairs
        utf16, utf32 = code_values, []
        for code_unit in code_values:
            if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    # u"%d" gives a text string on Python 2 and 3 alike, without relying
    # on the Python-2-only ``unicode`` builtin.
    return (u",".join([u"%d" % value for value in utf16]),
            u",".join([u"%d" % value for value in utf32]))
312