# Copyright (C) 2001-2006 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org

__all__ = [
    'Charset',
    'add_alias',
    'add_charset',
    'add_codec',
    ]

import codecs
import email.base64mime
import email.quoprimime

from email import errors
from email.encoders import encode_7or8bit



# Flags for types of header encodings
QP          = 1 # Quoted-Printable
BASE64      = 2 # Base64
SHORTEST    = 3 # the shorter of QP and base64, but only for headers

# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
MISC_LEN = 7

DEFAULT_CHARSET = 'us-ascii'



# Defaults.  Each value is a 3-tuple:
#     (header encoding, body encoding, output charset)
# where an encoding is QP, BASE64, SHORTEST (headers only) or None, and an
# output charset of None means "same as the input charset".
CHARSETS = {
    # input        header enc  body enc output conv
    'iso-8859-1':  (QP,        QP,      None),
    'iso-8859-2':  (QP,        QP,      None),
    'iso-8859-3':  (QP,        QP,      None),
    'iso-8859-4':  (QP,        QP,      None),
    # iso-8859-5 is Cyrillic, and not especially used
    # iso-8859-6 is Arabic, also not particularly used
    # iso-8859-7 is Greek, QP will not make it readable
    # iso-8859-8 is Hebrew, QP will not make it readable
    'iso-8859-9':  (QP,        QP,      None),
    'iso-8859-10': (QP,        QP,      None),
    # iso-8859-11 is Thai, QP will not make it readable
    'iso-8859-13': (QP,        QP,      None),
    'iso-8859-14': (QP,        QP,      None),
    'iso-8859-15': (QP,        QP,      None),
    'iso-8859-16': (QP,        QP,      None),
    'windows-1252':(QP,        QP,      None),
    'viscii':      (QP,        QP,      None),
    'us-ascii':    (None,      None,    None),
    'big5':        (BASE64,    BASE64,  None),
    'gb2312':      (BASE64,    BASE64,  None),
    'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
    'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
    'iso-2022-jp': (BASE64,    None,    None),
    'koi8-r':      (BASE64,    BASE64,  None),
    'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
    # We're making this one up to represent raw unencoded 8-bit
    '8bit':        (None,      BASE64, 'utf-8'),
    }

# Aliases for other commonly-used names for character sets.  Map
# them to the real ones used in email.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'latin_10':'iso-8859-16',
    'latin-10':'iso-8859-16',
    'cp949':   'ks_c_5601-1987',
    'euc_jp':  'euc-jp',
    'euc_kr':  'euc-kr',
    'ascii':   'us-ascii',
    }


# Map charsets to their Unicode codec strings.
CODEC_MAP = {
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
    }



# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Add character set properties to the global registry.

    charset is the input character set, and must be the canonical name of a
    character set.

    Optional header_enc and body_enc are each either Charset.QP for
    quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
    the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
    is only valid for header_enc.  They describe how message headers and
    message bodies in the input charset are to be encoded.  Default is no
    encoding.

    Optional output_charset is the character set that the output should be
    in.  Conversions will proceed from input charset, to Unicode, to the
    output charset when the method Charset.convert() is called.  The default
    is to output in the same character set as the input.

    Both input_charset and output_charset must have Unicode codec entries in
    the module's charset-to-codec mapping; use add_codec(charset, codecname)
    to add codecs the module does not know about.  See the codecs module's
    documentation for more information.

    Raises ValueError if body_enc is SHORTEST; in that case the registry is
    left untouched.  An existing entry for charset is silently replaced.
    """
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    CHARSETS[charset] = (header_enc, body_enc, output_charset)


def add_alias(alias, canonical):
    """Add a character set alias.

    alias is the alias name, e.g. latin-1
    canonical is the character set's canonical name, e.g. iso-8859-1

    An existing alias of the same name is silently replaced.
    """
    ALIASES[alias] = canonical


def add_codec(charset, codecname):
    """Add a codec that maps characters in the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, as appropriate for the second argument to the unicode()
    built-in, or to the encode() method of a Unicode string.

    An existing mapping for charset is silently replaced.
    """
    CODEC_MAP[charset] = codecname



class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified.  Common aliases
                   are converted to their `official' email names (e.g. latin_1
                   is converted to iso-8859-1).  Defaults to 7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable), Charset.BASE64 (for
                     base64 encoding), or Charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  Charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it will
                    be the same as input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is necessary,
                  this attribute will have the same value as the input_codec.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        """Look up the email properties of input_charset.

        input_charset may be a byte string or a unicode object and must
        consist of ASCII characters only; otherwise errors.CharsetError is
        raised.  The name is lower-cased, then resolved through ALIASES,
        CHARSETS and finally the codecs registry.
        """
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
        # unicode because its .lower() is locale insensitive.  If the argument
        # is already a unicode, we leave it at that, but ensure that the
        # charset name is pure ASCII (RFC 2978 restricts registered charset
        # names to printable US-ASCII).
        try:
            if isinstance(input_charset, unicode):
                input_charset.encode('ascii')
            else:
                input_charset = unicode(input_charset, 'ascii')
        except UnicodeError:
            raise errors.CharsetError(input_charset)
        input_charset = input_charset.lower().encode('ascii')
        # Names this module does not know are normalized through the codecs
        # registry; names no codec recognizes either are kept as given.
        if not (input_charset in ALIASES or input_charset in CHARSETS):
            try:
                input_charset = codecs.lookup(input_charset).name
            except LookupError:
                pass
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Guess which encodings and conversion to use from the CHARSETS
        # table.  Unknown charsets get SHORTEST headers, base64 bodies and no
        # conversion.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            # No conversion requested: output in the input charset.
            conv = self.input_charset
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't defined in CODEC_MAP, guess and
        # try a Unicode codec with the same name as the charset.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __str__(self):
        """Return the lower-cased input charset name."""
        return self.input_charset.lower()

    __repr__ = __str__

    def __eq__(self, other):
        """Compare case-insensitively against str(other)."""
        return str(self) == str(other).lower()

    def __ne__(self, other):
        """Return the negation of __eq__()."""
        return not self.__eq__(other)

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the function email.encoders.encode_7or8bit otherwise.
        """
        # SHORTEST is a header-only flag; add_charset() refuses to register
        # it as a body encoding.
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def convert(self, s):
        """Convert a string from the input_codec to the output_codec.

        s is returned unchanged when both codecs are the same.
        """
        if self.input_codec != self.output_codec:
            return unicode(s, self.input_codec).encode(self.output_codec)
        else:
            return s

    def to_splittable(self, s):
        """Convert a possibly multibyte string to a safely splittable format.

        Uses the input_codec to try and convert the string to Unicode, so it
        can be safely split on character boundaries (even for multibyte
        characters).

        Returns the string as-is if it isn't known how to convert it to
        Unicode with the input_charset.

        Characters that could not be converted to Unicode will be replaced
        with the Unicode replacement character U+FFFD.
        """
        if isinstance(s, unicode) or self.input_codec is None:
            return s
        try:
            return unicode(s, self.input_codec, 'replace')
        except LookupError:
            # Input codec not installed on system, so return the original
            # string unchanged.
            return s

    def from_splittable(self, ustr, to_output=True):
        """Convert a splittable string back into an encoded string.

        Uses the proper codec to try and convert the string from Unicode back
        into an encoded format.  Return the string as-is if it is not Unicode,
        or if it could not be converted from Unicode.

        Characters that could not be converted from Unicode will be replaced
        with an appropriate character (usually '?').

        If to_output is True (the default), uses output_codec to convert to an
        encoded format.  If to_output is False, uses input_codec.
        """
        if to_output:
            codec = self.output_codec
        else:
            codec = self.input_codec
        if not isinstance(ustr, unicode) or codec is None:
            return ustr
        try:
            return ustr.encode(codec, 'replace')
        except LookupError:
            # Output codec not installed
            return ustr

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is not None, otherwise it is
        self.input_charset.
        """
        return self.output_charset or self.input_charset

    def encoded_header_len(self, s):
        """Return the length of the encoded header string.

        For BASE64, QP and SHORTEST header encodings the result includes the
        charset name and the MISC_LEN characters of encoded-word framing.
        For any other header_encoding the result is simply len(s).
        """
        cset = self.get_output_charset()
        # The len(s) of a 7bit encoding is len(s)
        if self.header_encoding == BASE64:
            return email.base64mime.base64_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == QP:
            return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == SHORTEST:
            lenb64 = email.base64mime.base64_len(s)
            lenqp = email.quoprimime.header_quopri_len(s)
            return min(lenb64, lenqp) + len(cset) + MISC_LEN
        else:
            return len(s)

def header_encode(self, s, convert=False):
    """Header-encode a string, optionally converting it to output_charset.

    If convert is True, the string will be converted from the input
    charset to the output charset automatically (via self.convert())
    before it is encoded.  This is not useful for multibyte character
    sets, which have line length issues (multibyte characters must be
    split on a character, not a byte boundary); use the high-level Header
    class to deal with these issues.  convert defaults to False.

    The type of encoding is based on self.header_encoding:

    - BASE64: the string is base64 header-encoded.
    - QP: the string is quoted-printable header-encoded.
    - SHORTEST: base64 is used only when it is strictly shorter than
      quoted-printable; on a tie quoted-printable is used.
    - anything else: the (possibly converted) string is returned
      unchanged.
    """
    cset = self.get_output_charset()
    if convert:
        s = self.convert(s)
    encoding = self.header_encoding
    if encoding == BASE64:
        return email.base64mime.header_encode(s, cset)
    if encoding == QP:
        return email.quoprimime.header_encode(s, cset, maxlinelen=None)
    if encoding == SHORTEST:
        lenb64 = email.base64mime.base64_len(s)
        lenqp = email.quoprimime.header_quopri_len(s)
        if lenb64 < lenqp:
            return email.base64mime.header_encode(s, cset)
        return email.quoprimime.header_encode(s, cset, maxlinelen=None)
    # 7bit/8bit encodings return the string unchanged (modulo conversions)
    return s

def body_encode(self, s, convert=True):
    """Body-encode a string and convert it to output_charset.

    If convert is True (the default), the string will be converted from
    the input charset to output charset automatically (via
    self.convert()) before it is encoded.  Unlike header_encode(), there
    are no issues with byte boundaries and multibyte charsets in email
    bodies, so this is usually pretty safe.

    The type of encoding is based on self.body_encoding: BASE64 and QP
    select base64 and quoted-printable body encoding respectively.  Any
    other value (SHORTEST is not handled here) returns the (possibly
    converted) string unchanged.
    """
    if convert:
        s = self.convert(s)
    # Compare the encoding flags by value, as header_encode() and
    # encoded_header_len() do.  An identity test ("is") on integers only
    # works when both sides happen to be the very same object, which is an
    # interpreter caching detail and fails for an equal-valued long.
    if self.body_encoding == BASE64:
        return email.base64mime.body_encode(s)
    elif self.body_encoding == QP:
        return email.quoprimime.body_encode(s)
    else:
        # 7bit/8bit encodings return the string unchanged (modulo
        # conversions)
        return s