# Copyright (C) 2001-2006 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org

# Public names exported by ``from <this module> import *``.
__all__ = [
    'Charset',
    'add_alias',
    'add_charset',
    'add_codec',
    ]

import codecs
import email.base64mime
import email.quoprimime

from email import errors
from email.encoders import encode_7or8bit



# Flags for types of header encodings
QP          = 1 # Quoted-Printable
BASE64      = 2 # Base64
SHORTEST    = 3 # the shorter of QP and base64, but only for headers

# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
MISC_LEN = 7

# Charset assumed by Charset() when no input charset is given.
DEFAULT_CHARSET = 'us-ascii'



# Defaults
# Maps a canonical input charset name to a 3-tuple of
# (header encoding flag, body encoding flag, output charset to convert to).
# None in an encoding slot means "no encoding"; None in the last slot means
# "no conversion".
CHARSETS = {
    # input        header enc  body enc output conv
    'iso-8859-1':  (QP,        QP,      None),
    'iso-8859-2':  (QP,        QP,      None),
    'iso-8859-3':  (QP,        QP,      None),
    'iso-8859-4':  (QP,        QP,      None),
    # iso-8859-5 is Cyrillic, and not especially used
    # iso-8859-6 is Arabic, also not particularly used
    # iso-8859-7 is Greek, QP will not make it readable
    # iso-8859-8 is Hebrew, QP will not make it readable
    'iso-8859-9':  (QP,        QP,      None),
    'iso-8859-10': (QP,        QP,      None),
    # iso-8859-11 is Thai, QP will not make it readable
    'iso-8859-13': (QP,        QP,      None),
    'iso-8859-14': (QP,        QP,      None),
    'iso-8859-15': (QP,        QP,      None),
    'iso-8859-16': (QP,        QP,      None),
    'windows-1252':(QP,        QP,      None),
    'viscii':      (QP,        QP,      None),
    'us-ascii':    (None,      None,    None),
    'big5':        (BASE64,    BASE64,  None),
    'gb2312':      (BASE64,    BASE64,  None),
    'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
    'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
    'iso-2022-jp': (BASE64,    None,    None),
    'koi8-r':      (BASE64,    BASE64,  None),
    'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
    # We're making this one up to represent raw unencoded 8-bit
    '8bit':        (None,      BASE64, 'utf-8'),
    }

# Aliases for other commonly-used names for character sets.  Map
# them to the real ones used in email.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'latin_10':'iso-8859-16',
    'latin-10':'iso-8859-16',
    'cp949':   'ks_c_5601-1987',
    'euc_jp':  'euc-jp',
    'euc_kr':  'euc-kr',
    'ascii':   'us-ascii',
    }


# Map charsets to their Unicode codec strings.
CODEC_MAP = {
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
    }



# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Add character set properties to the global registry.

    charset is the input character set, and must be the canonical name of a
    character set.

    Optional header_enc and body_enc are each either Charset.QP for
    quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
    the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
    is only valid for header_enc.  They describe how message headers and
    message bodies in the input charset are to be encoded.  Default is no
    encoding.

    Optional output_charset is the character set that the output should be
    in.  Conversions will proceed from input charset, to Unicode, to the
    output charset when the method Charset.convert() is called.  The default
    is to output in the same character set as the input.

    Both charset and output_charset must have Unicode codec entries in
    the module's charset-to-codec mapping; use add_codec(charset, codecname)
    to add codecs the module does not know about.  See the codecs module's
    documentation for more information.

    Raises ValueError if body_enc is SHORTEST.  An existing registry entry
    for charset is replaced.
    """
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    CHARSETS[charset] = (header_enc, body_enc, output_charset)


def add_alias(alias, canonical):
    """Add a character set alias.

    alias is the alias name, e.g. latin-1
    canonical is the character set's canonical name, e.g. iso-8859-1

    The mapping is stored in the module-level ALIASES registry; an existing
    entry for the same alias is replaced.
    """
    ALIASES[alias] = canonical


def add_codec(charset, codecname):
    """Add a codec that maps characters in the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, as appropriate for the second argument to the unicode()
    built-in, or to the encode() method of a Unicode string.

    The mapping is stored in the module-level CODEC_MAP registry; an existing
    entry for the same charset is replaced.
    """
    CODEC_MAP[charset] = codecname



class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified.  Common aliases
                   are converted to their `official' email names (e.g. latin_1
                   is converted to iso-8859-1).  Defaults to 7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable), Charset.BASE64 (for
                     base64 encoding), or Charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  Charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it will
                    be the same as input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is necessary,
                  this attribute will have the same value as the input_codec.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        """Set up the charset properties for input_charset.

        input_charset may be a byte string or a unicode string but must be
        pure ASCII; otherwise errors.CharsetError is raised.  The name is
        lower-cased and resolved through ALIASES (and, for names unknown to
        both ALIASES and CHARSETS, through codecs.lookup()).
        """
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
        # unicode because its .lower() is locale insensitive.  If the argument
        # is already a unicode, we leave it at that, but ensure that the
        # charset is ASCII, as the standard requires.
        try:
            if isinstance(input_charset, unicode):
                input_charset.encode('ascii')
            else:
                input_charset = unicode(input_charset, 'ascii')
        except UnicodeError:
            raise errors.CharsetError(input_charset)
        input_charset = input_charset.lower().encode('ascii')
        # Set the input charset after filtering through the aliases and/or
        # codecs.  Names we have no entry for are normalized to the codec
        # registry's name; names the registry doesn't know are kept as given.
        if input_charset not in ALIASES and input_charset not in CHARSETS:
            try:
                input_charset = codecs.lookup(input_charset).name
            except LookupError:
                pass
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Look up the default encodings and conversion in the CHARSETS
        # registry.  Charsets not registered there get SHORTEST header
        # encoding, BASE64 body encoding, and no conversion.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            # No conversion registered: output in the input charset.
            conv = self.input_charset
        # Set the attributes.
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't defined in CODEC_MAP for a
        # charset, guess and try a Unicode codec with the same name as the
        # charset.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __str__(self):
        """Return the lower-cased input charset name."""
        return self.input_charset.lower()

    __repr__ = __str__

    def __eq__(self, other):
        """Compare by name, case-insensitively, against str(other)."""
        return str(self) == str(other).lower()

    def __ne__(self, other):
        """Return the negation of __eq__."""
        return not self.__eq__(other)

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the encode_7or8bit function otherwise.
        """
        # SHORTEST is only meaningful for headers, never for bodies.
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def convert(self, s):
        """Convert a string from the input_codec to the output_codec.

        Returns s unchanged when both codecs are the same.
        """
        if self.input_codec != self.output_codec:
            return unicode(s, self.input_codec).encode(self.output_codec)
        else:
            return s

    def to_splittable(self, s):
        """Convert a possibly multibyte string to a safely splittable format.

        Uses the input_codec to try and convert the string to Unicode, so it
        can be safely split on character boundaries (even for multibyte
        characters).

        Returns the string as-is if it isn't known how to convert it to
        Unicode with the input_charset.

        Characters that could not be converted to Unicode will be replaced
        with the Unicode replacement character U+FFFD.
        """
        if isinstance(s, unicode) or self.input_codec is None:
            return s
        try:
            return unicode(s, self.input_codec, 'replace')
        except LookupError:
            # Input codec not installed on system, so return the original
            # string unchanged.
            return s

    def from_splittable(self, ustr, to_output=True):
        """Convert a splittable string back into an encoded string.

        Uses the proper codec to try and convert the string from Unicode back
        into an encoded format.  Return the string as-is if it is not Unicode,
        or if it could not be converted from Unicode.

        Characters that could not be converted from Unicode will be replaced
        with an appropriate character (usually '?').

        If to_output is True (the default), uses output_codec to convert to an
        encoded format.  If to_output is False, uses input_codec.
        """
        if to_output:
            codec = self.output_codec
        else:
            codec = self.input_codec
        if not isinstance(ustr, unicode) or codec is None:
            return ustr
        try:
            return ustr.encode(codec, 'replace')
        except LookupError:
            # Output codec not installed
            return ustr

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is set, otherwise it is
        self.input_charset.
        """
        return self.output_charset or self.input_charset

    def encoded_header_len(self, s):
        """Return the length of the encoded header string.

        For an encoded header this is the encoded payload length plus the
        output charset name plus the MISC_LEN characters of encoded-word
        framing; with no header encoding it is simply len(s).
        """
        cset = self.get_output_charset()
        # The len(s) of a 7bit encoding is len(s)
        if self.header_encoding == BASE64:
            return email.base64mime.base64_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == QP:
            return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == SHORTEST:
            lenb64 = email.base64mime.base64_len(s)
            lenqp = email.quoprimime.header_quopri_len(s)
            return min(lenb64, lenqp) + len(cset) + MISC_LEN
        else:
            return len(s)

347edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep    def header_encode(self, s, convert=False):
348edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        """Header-encode a string, optionally converting it to output_charset.
349edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep
350edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        If convert is True, the string will be converted from the input
351edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        charset to the output charset automatically.  This is not useful for
352edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        multibyte character sets, which have line length issues (multibyte
353edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        characters must be split on a character, not a byte boundary); use the
354edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        high-level Header class to deal with these issues.  convert defaults
355edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        to False.
356edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep
357edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        The type of encoding (base64 or quoted-printable) will be based on
358edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        self.header_encoding.
359edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        """
360edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        cset = self.get_output_charset()
361edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        if convert:
362edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            s = self.convert(s)
363edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        # 7bit/8bit encodings return the string unchanged (modulo conversions)
364edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        if self.header_encoding == BASE64:
365edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            return email.base64mime.header_encode(s, cset)
366edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        elif self.header_encoding == QP:
367edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            return email.quoprimime.header_encode(s, cset, maxlinelen=None)
368edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        elif self.header_encoding == SHORTEST:
369edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            lenb64 = email.base64mime.base64_len(s)
370edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            lenqp = email.quoprimime.header_quopri_len(s)
371edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            if lenb64 < lenqp:
372edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep                return email.base64mime.header_encode(s, cset)
373edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            else:
374edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep                return email.quoprimime.header_encode(s, cset, maxlinelen=None)
375edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        else:
376edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            return s
377edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep
378edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep    def body_encode(self, s, convert=True):
379edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        """Body-encode a string and convert it to output_charset.
380edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep
381edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        If convert is True (the default), the string will be converted from
382edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        the input charset to output charset automatically.  Unlike
383edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        header_encode(), there are no issues with byte boundaries and
384edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        multibyte charsets in email bodies, so this is usually pretty safe.
385edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep
386edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        The type of encoding (base64 or quoted-printable) will be based on
387edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        self.body_encoding.
388edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        """
389edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        if convert:
390edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            s = self.convert(s)
391edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        # 7bit/8bit encodings return the string unchanged (module conversions)
392edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        if self.body_encoding is BASE64:
393edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            return email.base64mime.body_encode(s)
394edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        elif self.body_encoding is QP:
395edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            return email.quoprimime.body_encode(s)
396edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep        else:
397edbb763a2b63074cd468a5d33a17908b2cc0654Jeff Vander Stoep            return s
398