1#! /usr/bin/env python3
2
3"""Base16, Base32, Base64 (RFC 3548), Base85 and Ascii85 data encodings"""
4
5# Modified 04-Oct-1995 by Jack Jansen to use binascii module
6# Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support
7# Modified 22-May-2007 by Guido van Rossum to use bytes everywhere
8
9import re
10import struct
11import binascii
12
13
# Public names exported by a star-import of this module.  The
# encodestring() and decodestring() aliases defined further down are not
# listed here.
__all__ = [
    # Legacy interface exports traditional RFC 2045 Base64 encodings
    'encode', 'decode', 'encodebytes', 'decodebytes',
    # Generalized interface for other encodings
    'b64encode', 'b64decode', 'b32encode', 'b32decode',
    'b16encode', 'b16decode',
    # Base85 and Ascii85 encodings
    'b85encode', 'b85decode', 'a85encode', 'a85decode',
    # Standard Base64 encoding
    'standard_b64encode', 'standard_b64decode',
    # Some common Base64 alternatives.  As referenced by RFC 3548, see thread
    # starting at:
    #
    # http://zgp.org/pipermail/p2p-hackers/2001-September/000316.html
    'urlsafe_b64encode', 'urlsafe_b64decode',
    ]
30
31
bytes_types = (bytes, bytearray)  # Types acceptable as binary data

def _bytes_from_decode_data(s):
    """Normalize the input of a decoding function to binary data.

    bytes and bytearray objects are returned unchanged.  A str is encoded
    as ASCII; a ValueError is raised if it contains non-ASCII characters.
    Any other object is copied into a bytes object through the buffer
    protocol; a TypeError is raised if it does not support that protocol.
    """
    if isinstance(s, bytes_types):
        return s
    if isinstance(s, str):
        try:
            ascii_data = s.encode('ascii')
        except UnicodeEncodeError:
            raise ValueError('string argument should contain only ASCII characters')
        return ascii_data
    # Last resort: anything else must expose the buffer protocol.
    try:
        view = memoryview(s)
        return view.tobytes()
    except TypeError:
        type_name = s.__class__.__name__
        raise TypeError("argument should be a bytes-like object or ASCII "
                        "string, not %r" % type_name) from None
47
48
49# Base64 encoding/decoding uses binascii
50
def b64encode(s, altchars=None):
    """Return the Base64 encoding of the bytes-like object s as bytes.

    altchars, when given, should be a byte string of length 2 holding the
    characters to emit in place of '+' and '/'.  This lets an application
    produce, for instance, URL or filesystem safe Base64 strings.
    """
    standard = binascii.b2a_base64(s, newline=False)
    if altchars is None:
        return standard
    assert len(altchars) == 2, repr(altchars)
    # Swap the two standard punctuation characters for the requested ones.
    substitution = bytes.maketrans(b'+/', altchars)
    return standard.translate(substitution)
63
64
def b64decode(s, altchars=None, validate=False):
    """Decode the Base64 encoded bytes-like object or ASCII string s.

    Optional altchars must be a bytes-like object or ASCII string of length 2
    which specifies the alternative alphabet used instead of the '+' and '/'
    characters.

    The result is returned as a bytes object.  A binascii.Error is raised if
    s is incorrectly padded.

    If validate is False (the default), characters that are neither in the
    normal base-64 alphabet nor the alternative alphabet are discarded prior
    to the padding check.  If validate is True, these non-alphabet characters
    in the input result in a binascii.Error; this includes a trailing
    newline.
    """
    s = _bytes_from_decode_data(s)
    if altchars is not None:
        altchars = _bytes_from_decode_data(altchars)
        assert len(altchars) == 2, repr(altchars)
        # Map the alternative characters back onto the standard alphabet.
        s = s.translate(bytes.maketrans(altchars, b'+/'))
    # Use fullmatch() rather than match() with a '$' anchor: '$' also matches
    # just before a trailing newline, which would let b'....\n' slip through
    # the strict validation.
    if validate and not re.fullmatch(b'[A-Za-z0-9+/]*={0,2}', s):
        raise binascii.Error('Non-base64 digit found')
    return binascii.a2b_base64(s)
88
89
def standard_b64encode(s):
    """Encode the bytes-like object s with the standard Base64 alphabet.

    This delegates to b64encode() without an alternative alphabet and
    returns the encoded data as a bytes object.
    """
    return b64encode(s, altchars=None)
96
def standard_b64decode(s):
    """Decode data that was encoded with the standard Base64 alphabet.

    s is the bytes-like object or ASCII string to decode; the decoded data
    is returned as a bytes object.  This delegates to b64decode() without an
    alternative alphabet and without strict validation, so characters
    outside the standard alphabet are discarded before the padding check,
    and a binascii.Error is raised if the input is incorrectly padded.
    """
    return b64decode(s, altchars=None, validate=False)
106
107
# Byte translation tables between the standard Base64 characters '+' and '/'
# and their URL-safe replacements '-' and '_' (one table per direction).
_urlsafe_encode_translation = bytes.maketrans(b'+/', b'-_')
_urlsafe_decode_translation = bytes.maketrans(b'-_', b'+/')
110
def urlsafe_b64encode(s):
    """Encode the bytes-like object s with the URL-safe Base64 alphabet.

    The data is first encoded with the standard alphabet, then every '+' is
    replaced by '-' and every '/' by '_'.  The result is returned as a bytes
    object that is safe to use in URLs and file names.
    """
    standard = b64encode(s)
    return standard.translate(_urlsafe_encode_translation)
119
def urlsafe_b64decode(s):
    """Decode data encoded with the URL- and filesystem-safe Base64 alphabet.

    s is the bytes-like object or ASCII string to decode.  In that alphabet
    '-' stands for '+' and '_' stands for '/'; both are mapped back before
    the data is handed to b64decode().

    The result is returned as a bytes object.  A binascii.Error is raised if
    the input is incorrectly padded.  Characters that belong neither to the
    URL-safe alphabet nor to the set '+' and '/' are discarded before the
    padding check.
    """
    raw = _bytes_from_decode_data(s)
    standard = raw.translate(_urlsafe_decode_translation)
    return b64decode(standard)
134
135
136
137# Base32 encoding/decoding must be done in Python
# The 32 characters of the Base32 alphabet, in value order.
_b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'
# Built on the first call of b32encode(): all two-character combinations of
# the alphabet, so that a 10-bit value indexes its two output characters.
_b32tab2 = None
# Built on the first call of b32decode(): maps the byte value of each
# alphabet character to its 5-bit value.
_b32rev = None
141
def b32encode(s):
    """Return the Base32 encoding of the bytes-like object s as bytes.

    The input is processed in quanta of 5 bytes (40 bits), each of which
    yields 8 output characters.  A final partial quantum is zero-padded for
    encoding and the characters that carry no data are replaced by '='.
    """
    global _b32tab2
    # The pair table is only built on first use so that it costs no memory
    # when the function is never called.
    if _b32tab2 is None:
        singles = [bytes((code,)) for code in _b32alphabet]
        _b32tab2 = [first + second for first in singles for second in singles]

    if not isinstance(s, bytes_types):
        s = memoryview(s).tobytes()
    leftover = len(s) % 5
    if leftover:
        # Zero-fill the last quantum.  Build a new object instead of
        # extending in place so that a caller's bytearray is left untouched.
        s = s + bytes(5 - leftover)

    pairs = _b32tab2
    to_int = int.from_bytes
    encoded = bytearray()
    for start in range(0, len(s), 5):
        quantum = to_int(s[start:start + 5], 'big')
        # Each table lookup emits the two characters of one 10-bit group.
        encoded += pairs[quantum >> 30]              # bits 1 - 10
        encoded += pairs[(quantum >> 20) & 0x3ff]    # bits 11 - 20
        encoded += pairs[(quantum >> 10) & 0x3ff]    # bits 21 - 30
        encoded += pairs[quantum & 0x3ff]            # bits 31 - 40

    # Number of trailing characters to overwrite with '=' for each possible
    # size of the final partial quantum.
    pad_count = {1: 6, 2: 4, 3: 3, 4: 1}.get(leftover, 0)
    if pad_count:
        encoded[-pad_count:] = b'=' * pad_count
    return bytes(encoded)
179
def b32decode(s, casefold=False, map01=None):
    """Decode the Base32 encoded bytes-like object or ASCII string s.

    Optional casefold is a flag specifying whether a lowercase alphabet is
    acceptable as input.  For security purposes, the default is False.

    RFC 3548 allows for optional mapping of the digit 0 (zero) to the
    letter O (oh), and for optional mapping of the digit 1 (one) to
    either the letter I (eye) or letter L (el).  The optional argument
    map01 when not None, specifies which letter the digit 1 should be
    mapped to (when map01 is not None, the digit 0 is always mapped to
    the letter O).  For security purposes the default is None, so that
    0 and 1 are not allowed in the input.

    The result is returned as a bytes object.  A binascii.Error is raised if
    the input is incorrectly padded (including input that consists of
    padding only, or that carries an impossible number of pad characters)
    or if there are non-alphabet characters present in the input.
    """
    global _b32rev
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    if _b32rev is None:
        _b32rev = {v: k for k, v in enumerate(_b32alphabet)}
    s = _bytes_from_decode_data(s)
    if len(s) % 8:
        raise binascii.Error('Incorrect padding')
    # Handle section 2.4 zero and one mapping.  The argument map01 is either
    # None, or the character to map the digit 1 (one) to.  It should be
    # either L (el) or I (eye).
    if map01 is not None:
        map01 = _bytes_from_decode_data(map01)
        assert len(map01) == 1, repr(map01)
        s = s.translate(bytes.maketrans(b'01', b'O' + map01))
    if casefold:
        s = s.upper()
    # Strip off pad characters from the right.  We need to count the pad
    # characters because this will tell us how many null bytes to remove from
    # the end of the decoded string.
    padded_len = len(s)
    s = s.rstrip(b'=')
    padchars = padded_len - len(s)
    # Now decode the full quanta
    decoded = bytearray()
    b32rev = _b32rev
    for i in range(0, len(s), 8):
        quanta = s[i: i + 8]
        acc = 0
        try:
            for c in quanta:
                acc = (acc << 5) + b32rev[c]
        except KeyError:
            raise binascii.Error('Non-base32 digit found') from None
        decoded += acc.to_bytes(5, 'big')
    # Process the last, partial quanta
    if padchars:
        # Number of data bytes carried by a final quantum that ends with the
        # given number of pad characters.  Any other count is invalid.  It
        # must be rejected before 'acc' is touched: with 8 or more pad
        # characters 'acc' is either unset (no data at all) or would overflow
        # the 5-byte quantum once shifted.
        keep = {1: 4, 3: 3, 4: 2, 6: 1}.get(padchars)
        if keep is None:
            raise binascii.Error('Incorrect padding')
        # The input length is a multiple of 8, so the last quantum holds
        # exactly 8 - padchars characters; shifting restores its 40 bits.
        acc <<= 5 * padchars
        last = acc.to_bytes(5, 'big')
        decoded[-5:] = last[:keep]
    return bytes(decoded)
248
249
250
251# RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
252# lowercase.  The RFC also recommends against accepting input case
253# insensitively.
def b16encode(s):
    """Return the Base16 encoding of the bytes-like object s as bytes.

    The hexadecimal digits of the result are uppercase.
    """
    lowercase_hex = binascii.hexlify(s)
    return lowercase_hex.upper()
258
259
def b16decode(s, casefold=False):
    """Decode the Base16 encoded bytes-like object or ASCII string s.

    When the optional casefold flag is true, lowercase hexadecimal digits
    are accepted as well.  For security purposes, the default is False.

    The decoded data is returned as a bytes object.  A binascii.Error is
    raised if s is incorrectly padded or if it contains characters outside
    the alphabet.
    """
    data = _bytes_from_decode_data(s)
    if casefold:
        data = data.upper()
    # Only the digits and the uppercase letters A-F are valid at this point.
    if re.search(b'[^0-9A-F]', data) is not None:
        raise binascii.Error('Non-base16 digit found')
    return binascii.unhexlify(data)
276
277#
278# Ascii85 encoding/decoding
279#
280
# Encoding tables, built on the first call of a85encode(): the single
# Ascii85 characters and all of their two-character combinations.
_a85chars = None
_a85chars2 = None
# Markers framing the data in the Adobe flavour of Ascii85.
_A85START = b"<~"
_A85END = b"~>"
285
def _85encode(b, chars, chars2, pad=False, foldnuls=False, foldspaces=False):
    """Shared implementation of a85encode() and b85encode().

    chars is the list of the 85 single-character byte strings of the
    alphabet and chars2 the list of all their two-character combinations.
    The input is zero-padded to a multiple of 4 bytes; unless pad is true,
    the output characters produced by that padding are removed again.
    With foldnuls a zero word is written as b'z', with foldspaces a word of
    four spaces is written as b'y'.
    """
    if not isinstance(b, bytes_types):
        b = memoryview(b).tobytes()

    padding = (-len(b)) % 4
    if padding:
        b = b + b'\0' * padding
    # Interpret the data as a sequence of big-endian 32-bit words.
    words = struct.Struct('!%dI' % (len(b) // 4)).unpack(b)

    chunks = []
    for word in words:
        if foldnuls and not word:
            chunks.append(b'z')
        elif foldspaces and word == 0x20202020:
            chunks.append(b'y')
        else:
            # Five base-85 digits: two table lookups of two digits each,
            # followed by the last single digit.
            upper, last_digit = divmod(word, 85)
            leading, middle = divmod(upper, 7225)
            chunks.append(chars2[leading] + chars2[middle] + chars[last_digit])

    if padding and not pad:
        tail = chunks[-1]
        if tail == b'z':
            # A folded zero word has to be expanded before it can be cut.
            tail = chars[0] * 5
        chunks[-1] = tail[:-padding]

    return b''.join(chunks)
309
def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
    """Encode bytes-like object b using Ascii85 and return a bytes object.

    foldspaces is an optional flag that uses the special short sequence 'y'
    instead of 4 consecutive spaces (ASCII 0x20) as supported by 'btoa'. This
    feature is not supported by the "standard" Adobe encoding.

    wrapcol controls whether the output should have newline (b'\\n') characters
    added to it. If this is non-zero, each output line will be at most this
    many characters long.

    pad controls whether the input is padded to a multiple of 4 before
    encoding. Note that the btoa implementation always pads.

    adobe controls whether the encoded byte sequence is framed with <~ and ~>,
    which is used by the Adobe implementation.
    """
    global _a85chars, _a85chars2
    # Delay the initialization of tables to not waste memory
    # if the function is never called.  The guard tests _a85chars2, the
    # table that is assigned last: a thread that finds it set can rely on
    # both tables being complete, whereas testing _a85chars would let a
    # concurrent caller run ahead with _a85chars2 still None.
    if _a85chars2 is None:
        _a85chars = [bytes((i,)) for i in range(33, 118)]
        _a85chars2 = [(a + b) for a in _a85chars for b in _a85chars]

    # Zero words are always folded to b'z' in Ascii85.
    result = _85encode(b, _a85chars, _a85chars2, pad, True, foldspaces)

    if adobe:
        result = _A85START + result
    if wrapcol:
        # A line must at least be able to hold the two-character end marker.
        wrapcol = max(2 if adobe else 1, wrapcol)
        chunks = [result[i: i + wrapcol]
                  for i in range(0, len(result), wrapcol)]
        if adobe:
            # Put the end marker on a line of its own if appending it would
            # make the last line too long.
            if len(chunks[-1]) + 2 > wrapcol:
                chunks.append(b'')
        result = b'\n'.join(chunks)
    if adobe:
        result += _A85END

    return result
350
def a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \t\n\r\v'):
    """Decode the Ascii85 encoded bytes-like object or ASCII string b.

    foldspaces tells whether the short sequence 'y' is accepted as a
    shorthand for 4 consecutive spaces (ASCII 0x20).  The "standard" Adobe
    encoding does not support this feature.

    adobe tells whether the input is in Adobe Ascii85 format, i.e. framed
    with <~ and ~>.

    ignorechars should be a byte string holding the characters to skip in
    the input.  It should only contain whitespace characters; the default
    contains all whitespace characters in ASCII.

    The decoded data is returned as a bytes object.  A ValueError is raised
    for malformed input.
    """
    data = _bytes_from_decode_data(b)
    if adobe:
        if not data.endswith(_A85END):
            raise ValueError(
                "Ascii85 encoded byte sequences must end "
                "with {!r}".format(_A85END)
                )
        # The end marker is mandatory, the start marker is optional.
        skip = 2 if data.startswith(_A85START) else 0
        data = data[skip:-2]

    # The input is walked one character at a time so that ignored
    # characters and the special short sequences can be handled.
    pack_word = struct.Struct('!I').pack
    first_digit = b'!'[0]
    last_digit = b'u'[0]
    zero_marker = b'z'[0]
    space_marker = b'y'[0]
    pieces = []
    group = []
    # Four extra 'u' digits complete a trailing partial group.
    for code in data + b'u' * 4:
        if first_digit <= code <= last_digit:
            group.append(code)
            if len(group) == 5:
                value = 0
                for digit in group:
                    value = 85 * value + (digit - 33)
                try:
                    pieces.append(pack_word(value))
                except struct.error:
                    raise ValueError('Ascii85 overflow') from None
                group.clear()
        elif code == zero_marker:
            if group:
                raise ValueError('z inside Ascii85 5-tuple')
            pieces.append(b'\0\0\0\0')
        elif foldspaces and code == space_marker:
            if group:
                raise ValueError('y inside Ascii85 5-tuple')
            pieces.append(b'\x20\x20\x20\x20')
        elif code in ignorechars:
            # Skip whitespace
            continue
        else:
            raise ValueError('Non-Ascii85 digit found: %c' % code)

    result = b''.join(pieces)
    # Whatever is left in the group tells how many of the extra digits were
    # consumed, and thus how many decoded bytes are only padding.
    padding = 4 - len(group)
    if padding:
        result = result[:-padding]
    return result
420
421# The following code is originally taken (with permission) from Mercurial
422
# The 85 characters of the base85 alphabet, in value order.
_b85alphabet = (b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                b"abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~")
# Encoding tables, built on the first call of b85encode(): the single
# characters of the alphabet and all of their two-character combinations.
_b85chars = None
_b85chars2 = None
# Decoding table, built on the first call of b85decode(): indexed by byte
# value, it holds the digit value or None for bytes outside the alphabet.
_b85dec = None
428
def b85encode(b, pad=False):
    """Encode bytes-like object b in base85 format and return a bytes object.

    If pad is true, the input is padded with b'\\0' so its length is a multiple of
    4 bytes before encoding.
    """
    global _b85chars, _b85chars2
    # Delay the initialization of tables to not waste memory
    # if the function is never called.  The guard tests _b85chars2, the
    # table that is assigned last: a thread that finds it set can rely on
    # both tables being complete, whereas testing _b85chars would let a
    # concurrent caller run ahead with _b85chars2 still None.
    if _b85chars2 is None:
        _b85chars = [bytes((i,)) for i in _b85alphabet]
        _b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
    return _85encode(b, _b85chars, _b85chars2, pad)
442
def b85decode(b):
    """Decode the base85-encoded bytes-like object or ASCII string b

    The result is returned as a bytes object.  A ValueError is raised if the
    input contains a character outside the base85 alphabet or if a group of
    5 characters encodes a value that does not fit in 32 bits.
    """
    global _b85dec
    # Delay the initialization of tables to not waste memory
    # if the function is never called
    if _b85dec is None:
        # Fill a local table and publish it only once it is complete, so
        # that a thread calling this function at the same time never
        # decodes with a partially filled table.
        table = [None] * 256
        for i, c in enumerate(_b85alphabet):
            table[c] = i
        _b85dec = table
    b85dec = _b85dec

    b = _bytes_from_decode_data(b)
    # Complete the last group with the highest digit; the bytes this adds to
    # the output are removed again at the end.
    padding = (-len(b)) % 5
    b = b + b'~' * padding
    out = []
    packI = struct.Struct('!I').pack
    for i in range(0, len(b), 5):
        chunk = b[i:i + 5]
        acc = 0
        try:
            for c in chunk:
                acc = acc * 85 + b85dec[c]
        except TypeError:
            # A None entry of the table was hit: find the offending
            # character to report its position.
            for j, c in enumerate(chunk):
                if b85dec[c] is None:
                    raise ValueError('bad base85 character at position %d'
                                    % (i + j)) from None
            raise
        try:
            out.append(packI(acc))
        except struct.error:
            raise ValueError('base85 overflow in hunk starting at byte %d'
                             % i) from None

    result = b''.join(out)
    if padding:
        result = result[:-padding]
    return result
483
484# Legacy interface.  This code could be cleaned up since I don't believe
485# binascii has any line length limitations.  It just doesn't seem worth it
486# though.  The files should be opened in binary mode.
487
MAXLINESIZE = 76 # Excluding the CRLF
MAXBINSIZE = (MAXLINESIZE//4)*3

def encode(input, output):
    """Encode a file; input and output are binary files.

    The input is consumed in pieces of MAXBINSIZE bytes and every piece is
    written to output as one newline-terminated line of Base64 data.
    """
    chunk = input.read(MAXBINSIZE)
    while chunk:
        # A read may return less than requested; keep reading until the
        # piece is complete or the input is exhausted.
        while len(chunk) < MAXBINSIZE:
            more = input.read(MAXBINSIZE - len(chunk))
            if not more:
                break
            chunk += more
        output.write(binascii.b2a_base64(chunk))
        chunk = input.read(MAXBINSIZE)
504
505
def decode(input, output):
    """Decode a file; input and output are binary files.

    The input is read line by line and the decoded data of every line is
    written to output.
    """
    line = input.readline()
    while line:
        output.write(binascii.a2b_base64(line))
        line = input.readline()
514
def _input_type_check(s):
    """Check that s is one-dimensional, single-byte binary data.

    A TypeError is raised if s does not support the buffer protocol, if its
    elements are not single bytes (format 'c', 'b' or 'B') or if it has
    more than one dimension.  Nothing is returned on success.
    """
    try:
        view = memoryview(s)
    except TypeError as err:
        raise TypeError("expected bytes-like object, not %s"
                        % s.__class__.__name__) from err
    if view.format not in ('c', 'b', 'B'):
        raise TypeError("expected single byte elements, not %r from %s"
                        % (view.format, s.__class__.__name__))
    if view.ndim != 1:
        raise TypeError("expected 1-D data, not %d-D data from %s"
                        % (view.ndim, s.__class__.__name__))
529
530
def encodebytes(s):
    """Encode a bytestring into a bytes object containing multiple lines
    of base-64 data.

    The input is cut into pieces of MAXBINSIZE bytes and every piece becomes
    one newline-terminated line of the result.
    """
    _input_type_check(s)
    lines = [binascii.b2a_base64(s[start:start + MAXBINSIZE])
             for start in range(0, len(s), MAXBINSIZE)]
    return b"".join(lines)
540
def encodestring(s):
    """Legacy alias of encodebytes().

    Emits a DeprecationWarning attributed to the caller, then delegates to
    encodebytes().
    """
    import warnings
    message = ("encodestring() is a deprecated alias since 3.1, "
               "use encodebytes()")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return encodebytes(s)
548
549
def decodebytes(s):
    """Decode a bytestring of base-64 data into a bytes object.

    The input must be one-dimensional, single-byte binary data; a TypeError
    is raised otherwise.
    """
    _input_type_check(s)
    decoded = binascii.a2b_base64(s)
    return decoded
554
def decodestring(s):
    """Legacy alias of decodebytes().

    Emits a DeprecationWarning attributed to the caller, then delegates to
    decodebytes().
    """
    import warnings
    message = ("decodestring() is a deprecated alias since Python 3.1, "
               "use decodebytes()")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return decodebytes(s)
562
563
564# Usable as a script...
def main():
    """Command line entry point.

    Parses the options -d, -e, -u and -t, then encodes (the default) or
    decodes the file named on the command line, or standard input when no
    file or '-' is given, to standard output.  With -t the self-test is run
    instead.  Invalid options print a usage message to standard error and
    exit with status 2.
    """
    import sys
    import getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'deut')
    except getopt.error as msg:
        # From here on everything that is printed goes to standard error.
        sys.stdout = sys.stderr
        print(msg)
        print("""usage: %s [-d|-e|-u|-t] [file|-]
        -d, -u: decode
        -e: encode (default)
        -t: encode and decode string 'Aladdin:open sesame'"""%sys.argv[0])
        sys.exit(2)

    actions = {'-e': encode, '-d': decode, '-u': decode}
    func = encode
    # Options are honoured in order: the last of -e/-d/-u wins, while -t
    # runs the self-test as soon as it is met.
    for option, _value in opts:
        if option == '-t':
            test()
            return
        func = actions.get(option, func)

    read_stdin = not args or args[0] == '-'
    if read_stdin:
        func(sys.stdin.buffer, sys.stdout.buffer)
    else:
        with open(args[0], 'rb') as f:
            func(f, sys.stdout.buffer)
589
590
def test():
    """Round-trip a sample string through encodebytes() and decodebytes().

    The sample, its encoded form and the decoded result are printed; an
    assertion checks that the result equals the sample.
    """
    sample = b"Aladdin:open sesame"
    print(repr(sample))
    encoded = encodebytes(sample)
    print(repr(encoded))
    restored = decodebytes(encoded)
    print(repr(restored))
    assert sample == restored
599
600
# Run the command line interface only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    main()
603