""" Standard "encodings" Package

    Standard Python encoding modules are stored in this package
    directory.

    Codec modules must have names corresponding to normalized encoding
    names as defined in the normalize_encoding() function below, e.g.
    'utf-8' must be implemented by the module 'utf_8.py'.

    Each codec module must export the following interface:

    * getregentry() -> codecs.CodecInfo object
    The getregentry() API must return a CodecInfo object with encoder,
    decoder, incrementalencoder, incrementaldecoder, streamwriter and
    streamreader attributes which adhere to the Python Codec Interface
    Standard.

    In addition, a module may optionally also define the following
    APIs which are then used by the package's codec search function:

    * getaliases() -> sequence of encoding name strings to use as aliases

    Alias names returned by getaliases() must be normalized encoding
    names as defined by normalize_encoding().

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import codecs
from encodings import aliases
try:
    import __builtin__
except ImportError:
    # Interpreters without the '__builtin__' module expose the same
    # namespace as 'builtins'; the hasattr() probe in
    # normalize_encoding() works unchanged against either one.
    import builtins as __builtin__

# Maps the raw encoding string passed to search_function() to the
# CodecInfo entry found for it, or to None when the lookup failed.
_cache = {}

# Sentinel that distinguishes "not cached yet" from a cached None (miss).
_unknown = '--unknown--'

# Non-empty fromlist so that __import__() returns the submodule itself
# rather than the top-level 'encodings' package.
_import_tail = ['*']

# 256-entry translation table used by normalize_encoding(): ASCII letters,
# ASCII digits and the dot map to themselves, every other character maps
# to a space.
_norm_encoding_keep = ('.'
                       '0123456789'
                       'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                       'abcdefghijklmnopqrstuvwxyz')
_norm_encoding_map = ''.join(
    chr(i) if chr(i) in _norm_encoding_keep else ' '
    for i in range(256))

# Shared (mutable) alias table; search_function() adds codec-provided
# aliases to it.
_aliases = aliases.aliases


class CodecRegistryError(LookupError, SystemError):
    """Raised when a codec module returns an unusable registry entry."""
    pass


def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. '  -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only; if they do use
        non-ASCII characters, these must be Latin-1 compatible.

    """
    # Make sure we have an 8-bit string, because .translate() works
    # differently for Unicode strings.
    if hasattr(__builtin__, "unicode") and isinstance(encoding, unicode):
        # Note that .encode('latin-1') does *not* use the codec
        # registry, so this call doesn't recurse. (See unicodeobject.c
        # PyUnicode_AsEncodedString() for details)
        encoding = encoding.encode('latin-1')
    # Every separator became a space in the table, so split()/join()
    # collapses runs of them and strips leading/trailing ones.
    return '_'.join(encoding.translate(_norm_encoding_map).split())


def search_function(encoding):

    """ Codec search function for the "encodings" package.

        Looks up the codec module for `encoding` inside the 'encodings'
        package and returns its codecs.CodecInfo entry, or None if no
        suitable module exists.  Results (including misses) are cached
        under the encoding name exactly as it was passed in.

        Raises CodecRegistryError if the module's getregentry() returns
        a legacy tuple of the wrong length or with non-callable codec
        entries.

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the normalized name itself.  Only modules
    # inside the 'encodings' package are considered.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        # Empty names and dotted names (which would address a nested
        # package) are never imported.
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            pass
        else:
            break
    else:
        mod = None

    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module (also taken when mod is None)
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        # Legacy interface: a 4- to 7-element sequence laid out like the
        # positional arguments of codecs.CodecInfo.
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError(
                'module "%s" (%s) failed to register' %
                (mod.__name__, mod.__file__))
        if not hasattr(entry[0], '__call__') or \
           not hasattr(entry[1], '__call__') or \
           (entry[2] is not None and not hasattr(entry[2], '__call__')) or \
           (entry[3] is not None and not hasattr(entry[3], '__call__')) or \
           (len(entry) > 4 and entry[4] is not None and
            not hasattr(entry[4], '__call__')) or \
           (len(entry) > 5 and entry[5] is not None and
            not hasattr(entry[5], '__call__')):
            raise CodecRegistryError(
                'incompatible codecs in module "%s" (%s)' %
                (mod.__name__, mod.__file__))
        if len(entry) < 7 or entry[6] is None:
            # Pad the codec slots to six and put the module's short name
            # in the seventh (name) slot.  Slicing first makes a 7-tuple
            # whose name is None get its name *replaced*; appending would
            # produce 8 items and make CodecInfo(*entry) fail.
            head = tuple(entry[:6])
            entry = head + (None,) * (6 - len(head)) + \
                    (mod.__name__.split(".", 1)[1],)
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry

# Register the search_function in the Python codec registry
codecs.register(search_function)