""" Standard "encodings" Package

    Standard Python encoding modules are stored in this package
    directory.

    Codec modules must have names corresponding to normalized encoding
    names as defined in the normalize_encoding() function below, e.g.
    'utf-8' must be implemented by the module 'utf_8.py'.

    Each codec module must export the following interface:

    * getregentry() -> codecs.CodecInfo object
    The getregentry() API must return a CodecInfo object with encoder,
    decoder, incrementalencoder, incrementaldecoder, streamwriter and
    streamreader attributes which adhere to the Python Codec Interface
    Standard.

    In addition, a module may optionally also define the following
    APIs which are then used by the package's codec search function:

    * getaliases() -> sequence of encoding name strings to use as aliases

    Alias names returned by getaliases() must be normalized encoding
    names as defined by normalize_encoding().

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import codecs
from encodings import aliases
import __builtin__

# Results of search_function(), keyed by the encoding name exactly as it
# was passed in.  Failed lookups are stored as None.
_cache = {}

# Sentinel used to tell "not cached yet" apart from a cached None.
_unknown = '--unknown--'

# Non-empty fromlist handed to __import__() so that the submodule itself
# is returned instead of the top-level package.
_import_tail = ['*']

# Characters that normalize_encoding() leaves untouched: ASCII letters,
# ASCII digits and the dot used in Python package names.
_norm_keep_chars = ('.'
                    '0123456789'
                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                    'abcdefghijklmnopqrstuvwxyz')

# 256-entry translation table for 8-bit strings: kept characters map to
# themselves, every other byte value maps to a space.
_norm_encoding_map = ''.join(
    chr(_code) if chr(_code) in _norm_keep_chars else ' '
    for _code in range(256))

# Shared alias table; search_function() adds entries to it at runtime.
_aliases = aliases.aliases

class CodecRegistryError(LookupError, SystemError):
    """Error raised when a codec module hands back an unusable entry.

    search_function() raises this when getregentry() returns something
    that is not a codecs.CodecInfo and is either the wrong length or
    contains members that are not callable.  It derives from both
    LookupError and SystemError, so handlers for either base catch it.
    """

def normalize_encoding(encoding):
    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. '  -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only; if they do use
        non-ASCII characters, these must be Latin-1 compatible.

        Returns the normalized name as an 8-bit string.

    """
    # Make sure we have an 8-bit string, because .translate() works
    # differently for Unicode strings.  The hasattr() test keeps this
    # working on builds without a unicode type.
    if hasattr(__builtin__, "unicode") and isinstance(encoding, unicode):
        # Note that .encode('latin-1') does *not* use the codec
        # registry, so this call doesn't recurse. (See unicodeobject.c
        # PyUnicode_AsEncodedString() for details)
        encoding = encoding.encode('latin-1')

    # The table turns every separator character into a space; split()
    # then collapses runs of them and drops leading/trailing ones.
    words = encoding.translate(_norm_encoding_map).split()
    return '_'.join(words)

def search_function(encoding):
    """Codec search function for the "encodings" package.

    The module-level cache is consulted first.  On a miss, the name is
    normalized with normalize_encoding(), resolved through the alias
    table, and the matching 'encodings.<name>' submodule is imported
    and asked for its registry entry via getregentry().

    Returns a codecs.CodecInfo object, or None when no usable codec
    module is found.  Either result is cached under the encoding name
    exactly as it was passed in.

    Raises CodecRegistryError when getregentry() returns a value that
    is not a CodecInfo and is either not 4 to 7 items long or contains
    members that are not callable.
    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the standard import scheme, i.e. first
    # try in the encodings package, then at top-level.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        # Empty names and dotted names are never imported.
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            # Deliberate: fall through to the next candidate name.
            pass
        else:
            break
    else:
        mod = None

    # A module without getregentry() is not a codec module; mod may also
    # be None here, in which case getattr() yields None as well.
    getregentry = getattr(mod, 'getregentry', None)
    if getregentry is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError(
                'module "%s" (%s) failed to register' %
                (mod.__name__, mod.__file__))

        # Slots 0 and 1 (encode/decode) are mandatory callables; slots
        # 2 to 5 may each be None or absent.
        compatible = (hasattr(entry[0], '__call__') and
                      hasattr(entry[1], '__call__'))
        if compatible:
            for factory in entry[2:6]:
                if factory is not None and not hasattr(factory, '__call__'):
                    compatible = False
                    break
        if not compatible:
            raise CodecRegistryError(
                'incompatible codecs in module "%s" (%s)' %
                (mod.__name__, mod.__file__))

        if len(entry) < 7 or entry[6] is None:
            # Pad the optional slots with None and put the module's
            # short name in slot 6.  Slicing to six items first means a
            # 7-item entry whose name is None gets that slot replaced
            # instead of growing to eight items.
            codec_name = mod.__name__.split(".", 1)[1]
            entry = (tuple(entry[:6]) + (None,) * (6 - len(entry)) +
                     (codec_name,))
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry

# Register the search_function in the Python codec registry
codecs.register(search_function)