1# Copyright (C) 2004-2006 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
3# Contact: email-sig@python.org
4
5"""FeedParser - An email feed parser.
6
7The feed parser implements an interface for incrementally parsing an email
8message, line by line.  This has advantages for certain applications, such as
9those reading email messages off a socket.
10
11FeedParser.feed() is the primary interface for pushing new data into the
12parser.  It returns when there's nothing more it can do with the available
13data.  When you have no more data to push into the parser, call .close().
14This completes the parsing and returns the root message object.
15
16The other advantage of this parser is that it will never raise a parsing
17exception.  Instead, when it finds something unexpected, it adds a 'defect' to
18the current message.  Defects are just instances that live on the message
19object's .defects attribute.
20"""
21
# Public names of this module (what ``import *`` exports).
__all__ = ['FeedParser', 'BytesFeedParser']
23
24import re
25
26from email import errors
27from email._policybase import compat32
28from collections import deque
29from io import StringIO
30
# Line-ending patterns.  All of them accept the three conventions CRLF, bare
# CR and bare LF, trying CRLF first so that it is treated as one line ending.
#
# NLCRE: a line ending, with no capturing group.
NLCRE = re.compile(r'\r\n|\r|\n')
# NLCRE_bol: the same alternatives inside a capturing group.  The pattern has
# no anchor of its own; it is applied with .match() to test the beginning of
# a string.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# NLCRE_eol: a line ending at the very end of the string (\Z).
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# NLCRE_crack: same pattern as NLCRE_bol.  It is not referenced by the code
# visible in this file.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 §3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, any character
# except controls, SP, and ":".
# A line is treated as part of the header block when it starts with 'From '
# (envelope line), with zero or more ftext characters followed by ':' (a
# header field), or with a tab/space (a continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned/yielded when no complete line is available yet and
# the input has not been closed.  It is always compared with ``is``.
NeedMoreData = object()
42
43
44
class BufferedSubFile:
    """Incrementally filled line buffer with a file-like read interface.

    Data is added with push() in arbitrary chunks; complete lines become
    available through readline() or iteration.  A stack of predicates can be
    installed with push_eof_matcher(): when any of them accepts the next line,
    readline() reports a false EOF (the empty string) and leaves that line in
    the buffer.  This lets the parser treat "end of the current subpart" the
    same way as "end of the input".
    """

    def __init__(self):
        # Trailing text that does not yet form a complete line.  A text
        # stream is used rather than a list; see issue 22233.
        self._partial = StringIO(newline='')
        # Complete lines that have been pushed but not yet read.
        self._lines = deque()
        # Stack of false-EOF predicates, innermost last.
        self._eofstack = []
        # Set once close() has been called.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Install *pred* as the innermost false-EOF predicate."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the innermost false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any buffered partial line and mark the buffer as closed."""
        tail = self._partial
        # Whatever is still buffered becomes regular line(s), even without a
        # terminating newline.
        tail.seek(0)
        self.pushlines(tail.readlines())
        tail.seek(0)
        tail.truncate()
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        buffer has not been closed; '' is returned when it has been closed.
        """
        pending = self._lines
        if not pending:
            return '' if self._closed else NeedMoreData
        candidate = pending.popleft()
        # RFC 2046, section 5.1.2 requires outer level boundaries to be
        # recognized at any level of inner nesting, so every predicate on the
        # stack is consulted, from most to least nested.
        if any(matcher(candidate) for matcher in reversed(self._eofstack)):
            # False EOF: the line stays in the buffer for the outer reader.
            pending.appendleft(candidate)
            return ''
        return candidate

    def unreadline(self, line):
        """Put *line* back at the front of the buffer."""
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if not ('\n' in data or '\r' in data):
            # The chunk cannot have completed a line; wait for more.
            return

        # Split everything buffered so far into lines, keeping the line
        # endings, and empty the partial-line stream.
        stream = self._partial
        stream.seek(0)
        pieces = stream.readlines()
        stream.seek(0)
        stream.truncate()

        # A last piece that does not end in '\n' is kept back as a partial
        # line.  Only '\n' is tested because a piece ending in '\r' may be
        # the first half of a '\r\n' split across two pushes (see bugs
        # 1555570 and 1721862).
        if not pieces[-1].endswith('\n'):
            stream.write(pieces.pop())
        self.pushlines(pieces)

    def pushlines(self, lines):
        """Append already-split *lines* to the queue of readable lines."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops only at (false) EOF; NeedMoreData is passed on to
        # the consumer like any other item.
        item = self.readline()
        if item != '':
            return item
        raise StopIteration
133
134
135
136class FeedParser:
137    """A feed-style parser of email."""
138
139    def __init__(self, _factory=None, *, policy=compat32):
140        """_factory is called with no arguments to create a new message obj
141
142        The policy keyword specifies a policy object that controls a number of
143        aspects of the parser's operation.  The default policy maintains
144        backward compatibility.
145
146        """
147        self.policy = policy
148        self._old_style_factory = False
149        if _factory is None:
150            if policy.message_factory is None:
151                from email.message import Message
152                self._factory = Message
153            else:
154                self._factory = policy.message_factory
155        else:
156            self._factory = _factory
157            try:
158                _factory(policy=self.policy)
159            except TypeError:
160                # Assume this is an old-style factory
161                self._old_style_factory = True
162        self._input = BufferedSubFile()
163        self._msgstack = []
164        self._parse = self._parsegen().__next__
165        self._cur = None
166        self._last = None
167        self._headersonly = False
168
169    # Non-public interface for supporting Parser's headersonly flag
170    def _set_headersonly(self):
171        self._headersonly = True
172
173    def feed(self, data):
174        """Push more data into the parser."""
175        self._input.push(data)
176        self._call_parse()
177
178    def _call_parse(self):
179        try:
180            self._parse()
181        except StopIteration:
182            pass
183
184    def close(self):
185        """Parse all remaining data and return the root message object."""
186        self._input.close()
187        self._call_parse()
188        root = self._pop_message()
189        assert not self._msgstack
190        # Look for final set of defects
191        if root.get_content_maintype() == 'multipart' \
192               and not root.is_multipart():
193            defect = errors.MultipartInvariantViolationDefect()
194            self.policy.handle_defect(root, defect)
195        return root
196
197    def _new_message(self):
198        if self._old_style_factory:
199            msg = self._factory()
200        else:
201            msg = self._factory(policy=self.policy)
202        if self._cur and self._cur.get_content_type() == 'multipart/digest':
203            msg.set_default_type('message/rfc822')
204        if self._msgstack:
205            self._msgstack[-1].attach(msg)
206        self._msgstack.append(msg)
207        self._cur = msg
208        self._last = msg
209
210    def _pop_message(self):
211        retval = self._msgstack.pop()
212        if self._msgstack:
213            self._cur = self._msgstack[-1]
214        else:
215            self._cur = None
216        return retval
217
    def _parsegen(self):
        """Generator that parses one message, recursing for nested parts.

        Each invocation creates a new message (pushed on the message stack),
        collects and parses its header lines, and then consumes the body
        according to the message's content type: headers-only mode,
        message/delivery-status, other message/* types, multipart/*, or
        anything else (the remaining lines become the payload).

        Whenever the input buffer hands back the NeedMoreData sentinel, the
        generator yields NeedMoreData and retries the read once it is
        resumed.  Nested invocations are driven with a ``for`` loop that
        re-yields their NeedMoreData requests.  This generator never pops the
        message it created itself; that is done by whoever started it.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                # A line starting with a line ending (i.e. a blank line) acts
                # as a false EOF for the nested header block.
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            # State for the scan below: whether we are still before the first
            # boundary, the preamble lines seen so far, the line ending of
            # the closing boundary (if any), and whether a closing boundary
            # was seen at all.
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the same boundary line on the next
                        # iteration, now outside the preamble.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # Nothing is ever appended to this list: the loop below only
                # reads and drops whatever input is left, so the epilogue is
                # set to the empty string.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))
470
471    def _parse_headers(self, lines):
472        # Passed a list of lines that make up the headers for the current msg
473        lastheader = ''
474        lastvalue = []
475        for lineno, line in enumerate(lines):
476            # Check for continuation
477            if line[0] in ' \t':
478                if not lastheader:
479                    # The first line of the headers was a continuation.  This
480                    # is illegal, so let's note the defect, store the illegal
481                    # line, and ignore it for purposes of headers.
482                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
483                    self.policy.handle_defect(self._cur, defect)
484                    continue
485                lastvalue.append(line)
486                continue
487            if lastheader:
488                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
489                lastheader, lastvalue = '', []
490            # Check for envelope header, i.e. unix-from
491            if line.startswith('From '):
492                if lineno == 0:
493                    # Strip off the trailing newline
494                    mo = NLCRE_eol.search(line)
495                    if mo:
496                        line = line[:-len(mo.group(0))]
497                    self._cur.set_unixfrom(line)
498                    continue
499                elif lineno == len(lines) - 1:
500                    # Something looking like a unix-from at the end - it's
501                    # probably the first line of the body, so push back the
502                    # line and stop.
503                    self._input.unreadline(line)
504                    return
505                else:
506                    # Weirdly placed unix-from line.  Note this as a defect
507                    # and ignore it.
508                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
509                    self._cur.defects.append(defect)
510                    continue
511            # Split the line on the colon separating field name from value.
512            # There will always be a colon, because if there wasn't the part of
513            # the parser that calls us would have started parsing the body.
514            i = line.find(':')
515
516            # If the colon is on the start of the line the header is clearly
517            # malformed, but we might be able to salvage the rest of the
518            # message. Track the error but keep going.
519            if i == 0:
520                defect = errors.InvalidHeaderDefect("Missing header name.")
521                self._cur.defects.append(defect)
522                continue
523
524            assert i>0, "_parse_headers fed line with no : and no leading WS"
525            lastheader = line[:i]
526            lastvalue = [line]
527        # Done with all the lines, so handle the last header.
528        if lastheader:
529            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
530
531
class BytesFeedParser(FeedParser):
    """FeedParser variant whose feed() takes bytes instead of str."""

    def feed(self, data):
        """Decode *data* and push it into the parser.

        The bytes are decoded as ASCII with the ``surrogateescape`` error
        handler, so non-ASCII bytes are carried through as lone surrogates
        rather than raising an error.
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)
537