1# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py.  This should eventually be rewritten.
7"""
8
# Names exported by a star-import of this module.  The AddrlistClass and
# AddressList classes defined further down in this file are not listed here.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
15
16import time, calendar
17
# String constants.  SPACE and EMPTYSTRING are used as separators when the
# address parser joins the fragments it has collected; COMMASPACE is not
# referenced by the code visible in this file.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
21
# Parse a date field
# Lowercase month names accepted by the date parser.  The first twelve entries
# are the three-letter abbreviations and the next twelve are the full names,
# so the parser maps an index above 12 back to a month by subtracting 12.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

# Lowercase day-of-week abbreviations; a leading token matching one of these
# is treated as a day name and skipped by the date parser.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Keys are uppercase zone names.  Values are offsets written as signed HHMM
# integers (-500 means five hours behind UTC), the same form as a numeric
# zone such as "-0500"; the date parser converts them to seconds.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
43
44
def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    The parsing itself is delegated to _parsedate_tz().  The final item of
    the result is the timezone offset in seconds; when _parsedate_tz()
    reports that offset as None it is replaced with 0 here.

    Returns None when _parsedate_tz() yields no result for `data'.
    """
    fields = _parsedate_tz(data)
    if fields:
        # Callers of this function always get a numeric offset.
        if fields[9] is None:
            fields[9] = 0
        return tuple(fields)
    return None
56
def _parsedate_tz(data):
    """Convert date to extended time tuple.

    `data' is a date string such as "Mon, 20 Nov 1995 19:12:08 -0500".  A
    leading day name is optional, day and month may appear in either order,
    the year and time may be swapped, the deprecated RFC 850 form
    ("20-Nov-95 19:12:08 GMT") is accepted, and time elements may be
    separated by '.' instead of ':'.

    Returns a 10-item list
    [year, month, day, hour, minute, second, 0, 1, -1, tzoffset], or None if
    `data' is empty or cannot be parsed.  Items 6 and 7 are fixed
    placeholders and item 8 is the "unknown" Daylight Saving Time flag.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly disclaims knowledge
    of the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.  The last element is also None when the
    zone is missing or is neither a known name nor an integer.

    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input splits to nothing; there is no date to parse.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("19:12:08-0500"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the month came first ("Nov 20 ..."); try the other order.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy the second half of the table.
        mm -= 12
    # endswith() rather than [-1] so that an empty field (possible with a
    # malformed RFC 850 date such as "Jan--70") cannot raise IndexError; the
    # empty value is rejected by the int() conversion below.
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time are swapped ("... 19:12:08 1995").
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy:
        # Nothing left to interpret as a year.
        return None
    if not yy[0].isdigit():
        # Year and zone are swapped ("... GMT 1995").
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        # Neither HH:MM nor HH:MM:SS, whichever separator was used.
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        # "-0000" means UTC with unknown source zone; report it as None.
        if tzoffset == 0 and tz.startswith('-'):
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
172
173
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  Any
    non-tuple result from parsedate_tz() (None for unparseable input) is
    passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
181
182
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data' is a 10-item sequence (tuple or list) whose last item is the
    timezone offset in seconds, or None when the zone is unknown.

    With an offset, the first six fields are interpreted as a time in that
    zone and converted to UTC.  Without one, the fields are interpreted as
    local time via time.mktime().
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT.
        # tuple() keeps this branch working for list input, matching the
        # branch below; the DST flag is forced to -1 ("unknown").
        return time.mktime(tuple(data[:8]) + (-1,))
    # timegm() reads the fields as if they were UTC; removing the offset
    # yields the real UTC timestamp.
    return calendar.timegm(data) - data[9]
191
192
def quote(str):
    """Escape a string for use inside an RFC 2822 quoted string.

    Every backslash and every double quote is preceded by a backslash; no
    other character is touched, and the enclosing double quotes are not
    added.
    """
    # Backslashes go first, otherwise the ones inserted in front of the
    # double quotes would themselves get doubled.
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
201
202
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a hand-written scanner.  `field' holds the raw header text
    and `pos' is the index of the next unread character; each get*() method
    consumes input starting at `pos' and leaves `pos' just past what it
    consumed.  Comments met along the way are collected in `commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Comments are appended to self.commentlist.  Returns the skipped
        spaces and tabs joined into one string; CR and LF are skipped but are
        not part of the returned value.
        """
        skippable = self.LWS + '\n\r'
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in skippable:
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each a
        (realname, address) tuple.  A position at which getaddress() finds
        nothing contributes an ('', '') placeholder.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, address) tuples: one entry for a single
        mailbox, several for a group, and an empty list when nothing usable
        was found.  A trailing comma is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far: getphraselist() may
        # append more, and the addr-spec branch below rewinds and parses the
        # same text again.  Restoring an alias would record those comments
        # twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < len(self.field) and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the caller's loop advances.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the current character is not '<', and the empty
        string if no addr-spec is found before the closing '>'.
        """
        if self.field[self.pos] != '<':
            return None

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Domain of an "@domain" route element; its value is dropped.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns "local@domain", or only the local part when no '@' follows
        it.  Returns the empty string when an '@' is present but no valid
        domain follows, so that a malformed address is never reported as a
        usable one.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Whitespace around a dot is not significant; drop it.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain: return an empty address rather than the local
            # part, so the failure is visible to the caller.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second '@' is met while reading the
        domain.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't accept domains followed by another `@', as in
                # `a@malicious.org@important.com': reporting the first part
                # as a valid address would let a crafted header pass checks
                # made against the wrong domain.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        The delimiters are not part of the returned text, and a backslash is
        dropped while the character after it is kept literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True right after a backslash: the next character is taken as-is.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments found between them are appended
        to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
494
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, address) tuples are kept in `addresslist'.  The
    +, +=, - and -= operators combine two lists as set union and set
    difference while keeping the original order of the entries.
    """
    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        self.addresslist = []
        if field:
            self.addresslist = self.getaddrlist()

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __add__(self, other):
        """Return a new list: own entries, then those only in `other'."""
        # Set union
        mine = self.addresslist
        union = AddressList(None)
        union.addresslist = mine[:] + [addr for addr in other.addresslist
                                       if addr not in mine]
        return union

    def __iadd__(self, other):
        """Append the entries of `other' that are not already present."""
        # Set union, in-place
        current = self.addresslist
        for addr in other.addresslist:
            if addr not in current:
                current.append(addr)
        return self

    def __sub__(self, other):
        """Return a new list of own entries that are absent from `other'."""
        # Set difference
        difference = AddressList(None)
        difference.addresslist = [addr for addr in self.addresslist
                                  if addr not in other.addresslist]
        return difference

    def __isub__(self, other):
        """Remove one matching entry for each entry of `other'."""
        # Set difference, in-place
        current = self.addresslist
        for addr in other.addresslist:
            if addr in current:
                current.remove(addr)
        return self

    def __getitem__(self, index):
        """Return the entry (or slice of entries) at `index'."""
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]
541