# Copyright (C) 2001-2006 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org

"""Character set registry and the Charset class for email messages.

The module keeps three global tables -- CHARSETS, ALIASES and CODEC_MAP --
plus helper functions to extend them, and a Charset class that looks up the
header encoding, body encoding and output conversion for a character set.

NOTE: this module targets Python 2; Charset.__init__(), convert(),
to_splittable() and from_splittable() use the `unicode' built-in.
"""

__all__ = [
    'Charset',
    'add_alias',
    'add_charset',
    'add_codec',
    ]

import codecs
import email.base64mime
import email.quoprimime

from email import errors
from email.encoders import encode_7or8bit



# Flags for types of header encodings
QP = 1          # Quoted-Printable
BASE64 = 2      # Base64
SHORTEST = 3    # the shorter of QP and base64, but only for headers

# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
MISC_LEN = 7

DEFAULT_CHARSET = 'us-ascii'



# Defaults
CHARSETS = {
    # input         header enc  body enc  output conv
    'iso-8859-1':   (QP,        QP,       None),
    'iso-8859-2':   (QP,        QP,       None),
    'iso-8859-3':   (QP,        QP,       None),
    'iso-8859-4':   (QP,        QP,       None),
    # iso-8859-5 is Cyrillic, and not especially used
    # iso-8859-6 is Arabic, also not particularly used
    # iso-8859-7 is Greek, QP will not make it readable
    # iso-8859-8 is Hebrew, QP will not make it readable
    'iso-8859-9':   (QP,        QP,       None),
    'iso-8859-10':  (QP,        QP,       None),
    # iso-8859-11 is Thai, QP will not make it readable
    'iso-8859-13':  (QP,        QP,       None),
    'iso-8859-14':  (QP,        QP,       None),
    'iso-8859-15':  (QP,        QP,       None),
    'iso-8859-16':  (QP,        QP,       None),
    'windows-1252': (QP,        QP,       None),
    'viscii':       (QP,        QP,       None),
    'us-ascii':     (None,      None,     None),
    'big5':         (BASE64,    BASE64,   None),
    'gb2312':       (BASE64,    BASE64,   None),
    'euc-jp':       (BASE64,    None,     'iso-2022-jp'),
    'shift_jis':    (BASE64,    None,     'iso-2022-jp'),
    'iso-2022-jp':  (BASE64,    None,     None),
    'koi8-r':       (BASE64,    BASE64,   None),
    'utf-8':        (SHORTEST,  BASE64,   'utf-8'),
    # We're making this one up to represent raw unencoded 8-bit
    '8bit':         (None,      BASE64,   'utf-8'),
    }

# Aliases for other commonly-used names for character sets.  Map
# them to the real ones used in email.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'latin_10': 'iso-8859-16',
    'latin-10': 'iso-8859-16',
    'cp949': 'ks_c_5601-1987',
    'euc_jp': 'euc-jp',
    'euc_kr': 'euc-kr',
    'ascii': 'us-ascii',
    }


# Map charsets to their Unicode codec strings.
CODEC_MAP = {
    'gb2312': 'eucgb2312_cn',
    'big5': 'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii': None,
    }



# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Add character set properties to the global registry.

    charset is the input character set, and must be the canonical name of a
    character set.

    Optional header_enc and body_enc are each either Charset.QP for
    quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
    the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
    is only valid for header_enc.  They describe how message headers and
    message bodies in the input charset are to be encoded.  Default is no
    encoding.

    Optional output_charset is the character set that the output should be
    in.  Conversions will proceed from input charset, to Unicode, to the
    output charset when the method Charset.convert() is called.  The default
    is to output in the same character set as the input.

    Both input_charset and output_charset must have Unicode codec entries in
    the module's charset-to-codec mapping; use add_codec(charset, codecname)
    to add codecs the module does not know about.  See the codecs module's
    documentation for more information.

    Raises ValueError if body_enc is SHORTEST.  An existing entry for charset
    is overwritten.
    """
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    CHARSETS[charset] = (header_enc, body_enc, output_charset)


def add_alias(alias, canonical):
    """Add a character set alias.

    alias is the alias name, e.g. latin-1
    canonical is the character set's canonical name, e.g. iso-8859-1

    An existing entry for alias is overwritten.
    """
    ALIASES[alias] = canonical


def add_codec(charset, codecname):
    """Add a codec that maps characters in the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, as appropriate for the second argument to the unicode()
    built-in, or to the encode() method of a Unicode string.

    An existing entry for charset is overwritten.
    """
    CODEC_MAP[charset] = codecname



class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified.  Common aliases
                   are converted to their `official' email names (e.g. latin_1
                   is converted to iso-8859-1).  Defaults to 7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable), Charset.BASE64 (for
                     base64 encoding), or Charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  Charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it will
                    be the same as input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is necessary,
                  this attribute will have the same value as the input_codec.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
        # unicode because its .lower() is locale insensitive.  If the argument
        # is already a unicode, we leave it at that, but ensure that the
        # charset is ASCII, as the standard (RFC XXX) requires.
        try:
            if isinstance(input_charset, unicode):
                # Encoding is done only for its side effect: it raises
                # UnicodeError when the name contains non-ASCII characters.
                input_charset.encode('ascii')
            else:
                input_charset = unicode(input_charset, 'ascii')
        except UnicodeError:
            raise errors.CharsetError(input_charset)
        input_charset = input_charset.lower().encode('ascii')
        # Set the input charset after filtering through the aliases and/or
        # codecs.  Names unknown to both tables are normalized to the codec
        # registry's name when the codec exists; otherwise kept as given.
        if not (input_charset in ALIASES or input_charset in CHARSETS):
            try:
                input_charset = codecs.lookup(input_charset).name
            except LookupError:
                pass
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # We can try to guess which encoding and conversion to use by the
        # CHARSETS dictionary.  Charsets missing from it fall back to
        # (SHORTEST, BASE64, None).
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            # No output conversion registered: output in the input charset.
            conv = self.input_charset
        # Set the attributes.
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't defined for input_charset,
        # guess and try a Unicode codec with the same name as input_charset.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __str__(self):
        return self.input_charset.lower()

    __repr__ = __str__

    def __eq__(self, other):
        # Case-insensitive comparison against the string form of `other', so
        # a Charset compares equal to a plain charset-name string as well.
        return str(self) == str(other).lower()

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the encode_7or8bit function otherwise.
        """
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def convert(self, s):
        """Convert a string from the input_codec to the output_codec.

        Returns s unchanged when both codecs are the same.
        """
        if self.input_codec != self.output_codec:
            return unicode(s, self.input_codec).encode(self.output_codec)
        else:
            return s

    def to_splittable(self, s):
        """Convert a possibly multibyte string to a safely splittable format.

        Uses the input_codec to try and convert the string to Unicode, so it
        can be safely split on character boundaries (even for multibyte
        characters).

        Returns the string as-is if it isn't known how to convert it to
        Unicode with the input_charset.

        Characters that could not be converted to Unicode will be replaced
        with the Unicode replacement character U+FFFD.
        """
        if isinstance(s, unicode) or self.input_codec is None:
            return s
        try:
            return unicode(s, self.input_codec, 'replace')
        except LookupError:
            # Input codec not installed on system, so return the original
            # string unchanged.
            return s

    def from_splittable(self, ustr, to_output=True):
        """Convert a splittable string back into an encoded string.

        Uses the proper codec to try and convert the string from Unicode back
        into an encoded format.  Return the string as-is if it is not Unicode,
        or if it could not be converted from Unicode.

        Characters that could not be converted from Unicode will be replaced
        with an appropriate character (usually '?').

        If to_output is True (the default), uses output_codec to convert to an
        encoded format.  If to_output is False, uses input_codec.
        """
        if to_output:
            codec = self.output_codec
        else:
            codec = self.input_codec
        if not isinstance(ustr, unicode) or codec is None:
            return ustr
        try:
            return ustr.encode(codec, 'replace')
        except LookupError:
            # Output codec not installed
            return ustr

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is not None, otherwise it is
        self.input_charset.
        """
        return self.output_charset or self.input_charset

    def encoded_header_len(self, s):
        """Return the length of the encoded header string.

        For QP, BASE64 and SHORTEST header encodings the result includes the
        length of the output charset name plus MISC_LEN for the encoded-word
        delimiters; otherwise it is simply len(s).
        """
        cset = self.get_output_charset()
        # The len(s) of a 7bit encoding is len(s)
        if self.header_encoding == BASE64:
            return email.base64mime.base64_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == QP:
            return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == SHORTEST:
            lenb64 = email.base64mime.base64_len(s)
            lenqp = email.quoprimime.header_quopri_len(s)
            return min(lenb64, lenqp) + len(cset) + MISC_LEN
        else:
            return len(s)

    def header_encode(self, s, convert=False):
        """Header-encode a string, optionally converting it to output_charset.

        If convert is True, the string will be converted from the input
        charset to the output charset automatically.  This is not useful for
        multibyte character sets, which have line length issues (multibyte
        characters must be split on a character, not a byte boundary); use the
        high-level Header class to deal with these issues.  convert defaults
        to False.

        The type of encoding (base64 or quoted-printable) will be based on
        self.header_encoding.
        """
        cset = self.get_output_charset()
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        if self.header_encoding == BASE64:
            return email.base64mime.header_encode(s, cset)
        elif self.header_encoding == QP:
            return email.quoprimime.header_encode(s, cset, maxlinelen=None)
        elif self.header_encoding == SHORTEST:
            # Pick whichever encoding yields the shorter result; ties go to
            # quoted-printable.
            lenb64 = email.base64mime.base64_len(s)
            lenqp = email.quoprimime.header_quopri_len(s)
            if lenb64 < lenqp:
                return email.base64mime.header_encode(s, cset)
            else:
                return email.quoprimime.header_encode(s, cset, maxlinelen=None)
        else:
            return s

    def body_encode(self, s, convert=True):
        """Body-encode a string and convert it to output_charset.

        If convert is True (the default), the string will be converted from
        the input charset to output charset automatically.  Unlike
        header_encode(), there are no issues with byte boundaries and
        multibyte charsets in email bodies, so this is usually pretty safe.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.
        """
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        # The flags are plain integers, so compare by value (==), matching
        # get_body_encoding(); identity (`is') only works by accident of
        # small-integer caching.
        if self.body_encoding == BASE64:
            return email.base64mime.body_encode(s)
        elif self.body_encoding == QP:
            return email.quoprimime.body_encode(s)
        else:
            return s