1b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik# (c) 2005 Ian Bicking and contributors; written for Paste (http://pythonpaste.org)
2b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
3b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik
"""
Creates a human-readable identifier, using letters and digits,
avoiding ambiguous numbers and letters.  hash_identifier can be used
to create compact representations that are unique for a certain string
(or concatenation of strings).
"""
10b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik
11b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craiktry:
12b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik    from hashlib import md5
13b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craikexcept ImportError:
14b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik    from md5 import md5
15b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik
16b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craikimport six
17b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik
# Alphabet used as the digits of an identifier: digits and lowercase
# letters, leaving out 0, 1, i, l, o and s.
good_characters = "23456789abcdefghjkmnpqrtuvwxyz"

# Identifiers are numbers written in this base (one digit per character).
base = len(good_characters)
21b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik
def make_identifier(number):
    """
    Encodes a non-negative integer as an identifier.

    The number is written in base ``len(good_characters)`` using
    ``good_characters`` as the digits, least-significant digit first.
    Zero encodes to the empty string.

    Raises ``ValueError`` if ``number`` is not an integer, or if it is
    negative.
    """
    if not isinstance(number, six.integer_types):
        # Wrap in a 1-tuple so that a tuple argument is reported with
        # this ValueError instead of breaking the ``%`` formatting.
        raise ValueError(
            "You can only make identifiers out of integers (not %r)"
            % (number,))
    if number < 0:
        raise ValueError(
            "You cannot make identifiers out of negative numbers: %r"
            % (number,))
    result = []
    while number:
        # Peel off the lowest digit; floor division drops it from number.
        number, digit = divmod(number, base)
        result.append(good_characters[digit])
    return ''.join(result)
41b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik
def hash_identifier(s, length, pad=True, hasher=md5, prefix='',
                    group=None, upper=False):
    """
    Hashes the string (with the given hashing module), then turns that
    hash into an identifier of the given length (using modulo to
    reduce the length of the identifier).  If ``pad`` is False, then
    the minimum-length identifier will be used; otherwise the
    identifier will be padded on the left with the first character of
    ``good_characters`` as necessary.

    ``s`` may be text (hashed as UTF-8), bytes, or any other object
    (hashed via its ``str()`` form).  ``hasher`` is either a callable
    that takes the bytes and returns an object with a ``digest()``
    method, or a module that provides such a callable as ``new``.

    ``prefix`` will be added last, and does not count towards the
    target length.  ``group`` will group the characters with ``-`` in
    the given lengths, and also does not count towards the target
    length.  Groups are counted from the right, so only the first
    group can be shorter: ``group=4`` on a ten-character identifier
    gives something like ``cy-2dr6-rg46``.  Grouping occurs before the
    prefix.  If ``upper`` is true the identifier (but not the prefix)
    is upper-cased.

    Raises ``ValueError`` if ``length`` is more than 26 with the md5
    hasher, or if ``group`` is negative.
    """
    if not callable(hasher):
        # Accept sha/md5 modules as well as callables
        hasher = hasher.new
    if length > 26 and hasher is md5:
        raise ValueError(
            "md5 cannot create hashes longer than 26 characters in "
            "length (you gave %s)" % (length,))
    if group and group < 0:
        # A negative group size would make the grouping loop below spin
        # forever, because ``ident[:-group]`` would never shrink to ''.
        raise ValueError(
            "group cannot be negative (you gave %r)" % (group,))
    if not isinstance(s, (six.text_type, six.binary_type)):
        s = str(s)
    if isinstance(s, six.text_type):
        # On Python 2 str() already gives bytes; only text needs encoding.
        s = s.encode('utf-8')
    modulo = base ** length
    number = 0
    # Iterating a bytearray yields integers on both Python 2 and 3.
    # Reducing at every step keeps the running value below ``modulo``.
    for byte in bytearray(hasher(s).digest()):
        number = (number * 256 + byte) % modulo
    ident = make_identifier(number)
    if pad:
        ident = good_characters[0] * (length - len(ident)) + ident
    if group:
        # Cut groups off the right-hand end so that any short group
        # ends up at the front.
        parts = []
        while ident:
            parts.insert(0, ident[-group:])
            ident = ident[:-group]
        ident = '-'.join(parts)
    if upper:
        ident = ident.upper()
    return prefix + ident
88b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik
# doctest tests (doctest picks these up through the module-level
# ``__test__`` mapping of name -> docstring):
__test__ = {
    'make_identifier': """
    >>> make_identifier(0)
    ''
    >>> make_identifier(1000)
    'c53'
    >>> make_identifier(-100)
    Traceback (most recent call last):
        ...
    ValueError: You cannot make identifiers out of negative numbers: -100
    >>> make_identifier('test')
    Traceback (most recent call last):
        ...
    ValueError: You can only make identifiers out of integers (not 'test')
    >>> make_identifier(1000000000000)
    'c53x9rqh3'
    """,
    'hash_identifier': """
    >>> hash_identifier(0, 5)
    'cy2dr'
    >>> hash_identifier(0, 10)
    'cy2dr6rg46'
    >>> hash_identifier('this is a test of a long string', 5)
    'awatu'
    >>> hash_identifier(0, 26)
    'cy2dr6rg46cx8t4w2f3nfexzk4'
    >>> hash_identifier(0, 30)
    Traceback (most recent call last):
        ...
    ValueError: md5 cannot create hashes longer than 26 characters in length (you gave 30)
    >>> hash_identifier(0, 10, group=4)
    'cy-2dr6-rg46'
    >>> hash_identifier(0, 10, group=4, upper=True, prefix='M-')
    'M-CY-2DR6-RG46'
    """}

# Running the module directly executes the doctests above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
129b2cbf1594f8d6e4ba32d384cf379f62a74ed7654Chris Craik
130