1# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py.  This should eventually be rewritten.
7"""
8
9__all__ = [
10    'mktime_tz',
11    'parsedate',
12    'parsedate_tz',
13    'quote',
14    ]
15
16import time, calendar
17
# Separator strings used when joining parsed header fragments.
EMPTYSTRING = ''
SPACE = ' '
COMMASPACE = ',' + SPACE
21
# Parse a date field
# Abbreviated month names first, then full names; an index past 12 is folded
# back onto 1..12 by parsedate_tz().
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are offsets from UTC written as signed HHMM integers.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    The result is (year, month, day, hour, minute, second, 0, 1, -1,
    tzoffset).  The weekday and day-of-year slots are not computed (they are
    always 0 and 1) and the DST flag is always -1.  `tzoffset' is the zone's
    offset from UTC in seconds, or None when no recognizable zone is present.

    Zones are recognized either as a name listed in _timezones or as a
    numeric [+-]HHMM offset; of the RFC 822 military zones only `Z' is
    understood.

    Returns None when `data' is empty or cannot be parsed.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time, e.g. "19:12:08-0500".
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            # Keep the sign with the zone so negative offsets survive.
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    [dd, mm, yy, tm, tz] = data[:5]
    if not (dd and mm and yy):
        # An RFC 850 style date such as "1-Jan-" leaves an empty component.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Nov 20" as well as "20 Nov".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    if dd.endswith(','):
        dd = dd[:-1]
    # Year and time may appear in either order.
    if yy.find(':') > 0:
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # The "year" slot actually holds the zone; swap them.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
141
142
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    Delegates to parsedate_tz() and drops the trailing timezone offset.  Any
    non-tuple result (such as None for unparseable input) is passed through
    unchanged.
    """
    parsed = parsedate_tz(data)
    return parsed[:9] if isinstance(parsed, tuple) else parsed
150
151
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data' may be any sequence of ten items.  When the zone offset
    (`data[9]') is None the first eight fields are interpreted as local time
    via time.mktime() with the DST flag left undetermined (-1).  Otherwise
    the fields are interpreted as UTC via calendar.timegm() and the offset
    (in seconds) is subtracted.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT.
        # tuple() lets list input work here as it already does for timegm().
        return time.mktime(tuple(data[:8]) + (-1,))
    return calendar.timegm(data) - data[9]
160
161
def quote(str):
    """Escape a string for use inside an RFC 2822 quoted string.

    Backslashes and double quotes are each replaced by a quoted pair
    (prefixed with a backslash); no other characters need quoting.  The
    surrounding double quotes are not added.
    """
    # Backslashes must be doubled first so the ones added for the double
    # quotes below are not escaped again.
    backslashes_escaped = str.replace('\\', '\\\\')
    return backslashes_escaped.replace('"', '\\"')
170
171
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a cursor (`self.pos') moving over `self.field'.  Comments
    met while parsing an address are collected in `self.commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip whitespace and comments up to the start of the next token.

        Comments that are skipped are appended to self.commentlist.
        """
        skippable = self.LWS + '\n\r'
        while self.pos < len(self.field):
            if self.field[self.pos] in skippable:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list of (name, address) tuples.  A position where no
        address could be parsed contributes an ('', '') entry.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) tuples: one entry for a single
        mailbox, several for a group, possibly none when only a stray
        special character was consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far, so that rewinding
        # below does not collect the same comment twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist += self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            name = ' '.join(plist)
            if self.commentlist:
                name += ' (' + ' '.join(self.commentlist) + ')'
            returnlist = [(name, routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the caller keeps advancing.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Discard the routing domain that follows an `@'.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec
                # (normally the closing `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns `local-part@domain', or just the local part when no `@'
        follows it.  When an `@' is present but no valid domain follows
        (including a domain that itself contains an `@'), the empty string
        is returned so that a truncated address is never reported.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain: report failure rather than a bare local part.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string when the domain is followed by another `@'
        (e.g. `a@malicious.org@important.com'), leaving the cursor on that
        `@'.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't accept domains with two `@'; see CVE-2019-16056.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        A backslash quotes the following character: the backslash is dropped
        and the character is kept even if it is an end delimiter.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        if atomends is None:
            atomends = self.atomends

        start = self.pos
        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            self.pos += 1

        return self.field[start:self.pos]

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  The words are returned as a list with the
        whitespace between them dropped; comments found between words are
        appended to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
451
class AddressList(AddrlistClass):
    """A parsed list of RFC 2822 addresses with set-like operators.

    The parsed addresses live in `self.addresslist'; a false `field' gives
    an empty list.
    """

    def __init__(self, field):
        AddrlistClass.__init__(self, field)
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union: our entries, then those of `other' we do not have.
        union = AddressList(None)
        extras = [addr for addr in other.addresslist
                  if addr not in self.addresslist]
        union.addresslist = self.addresslist[:] + extras
        return union

    def __iadd__(self, other):
        # Set union, in-place
        for addr in other.addresslist:
            if addr in self.addresslist:
                continue
            self.addresslist.append(addr)
        return self

    def __sub__(self, other):
        # Set difference: our entries that `other' does not have.
        difference = AddressList(None)
        difference.addresslist = [addr for addr in self.addresslist
                                  if addr not in other.addresslist]
        return difference

    def __isub__(self, other):
        # Set difference, in-place
        for addr in other.addresslist:
            if addr not in self.addresslist:
                continue
            self.addresslist.remove(addr)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]
498