1# Copyright (C) 2004-2006 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
3# Contact: email-sig@python.org
4
5"""FeedParser - An email feed parser.
6
7The feed parser implements an interface for incrementally parsing an email
8message, line by line.  This has advantages for certain applications, such as
9those reading email messages off a socket.
10
11FeedParser.feed() is the primary interface for pushing new data into the
12parser.  It returns when there's nothing more it can do with the available
13data.  When you have no more data to push into the parser, call .close().
14This completes the parsing and returns the root message object.
15
16The other advantage of this parser is that it will never raise a parsing
17exception.  Instead, when it finds something unexpected, it adds a 'defect' to
18the current message.  Defects are just instances that live on the message
19object's .defects attribute.
20"""
21
22__all__ = ['FeedParser']
23
24import re
25
26from email import errors
27from email import message
28
# Line-ending patterns.  All three conventions (CRLF, bare CR, bare LF) are
# recognized, with CRLF listed first so it is consumed as a single unit.
# Raw strings are used so that regex escapes such as \Z are passed to the
# regex engine untouched instead of going through string-literal escape
# processing.
#
# NLCRE       -- a line ending, no capturing group.
# NLCRE_bol   -- a line ending captured as group 1 (used with .match(), i.e.
#                anchored at the beginning of the string).
# NLCRE_eol   -- a line ending captured as group 1 that must be the very
#                last thing in the string (\Z).
# NLCRE_crack -- a line ending captured as group 1; the capturing group makes
#                re.split() keep the separators in its result.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, i.e.
# any character except controls, SP, and ":".  The pattern accepts a line
# that starts with "From " (a candidate envelope header), a field name
# followed by a colon, or leading whitespace (a continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel compared by identity (``is NeedMoreData``); it signals that
# no complete line is available yet and more data has to be fed.
NeedMoreData = object()
40
41
42
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # Chunks of the last partial line pushed into this object.
        self._partial = []
        # The list of full, pushed lines, in reverse order
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a false-EOF predicate; it is called with a line of text."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any buffered partial line and mark the file as closed."""
        # Don't forget any trailing partial line.
        self.pushlines(''.join(self._partial).splitlines(True))
        self._partial = []
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        file has not been closed yet.  The empty string is returned either
        when the buffer is empty and the file is closed, or when the next
        line matches one of the pushed EOF predicates; in the latter case the
        line stays in the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next one readline() sees."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Crack into lines, but preserve the linesep characters on the end of each
        parts = data.splitlines(True)

        if not parts or not parts[0].endswith(('\n', '\r')):
            # No new complete lines, so just accumulate partials
            self._partial += parts
            return

        if self._partial:
            # If there are previous leftovers, complete them now
            self._partial.append(parts[0])
            parts[0:1] = ''.join(self._partial).splitlines(True)
            del self._partial[:]

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial = [parts.pop()]
        self.pushlines(parts)

    # NOTE: an earlier, second definition of pushlines() used to sit here.  It
    # was a stale copy of the old regex-based push() body (it referenced an
    # undefined name `data` and called itself) and was always overridden by
    # the definition below, so it has been removed.
    def pushlines(self, lines):
        """Queue complete `lines` behind the lines that are already buffered."""
        # self._lines is kept in reverse order (readline() pops from the end),
        # so reverse the new lines and insert them at the front.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator protocol: return the next line, stopping on ''.

        NeedMoreData is passed through to the caller as an ordinary value;
        only the empty string ends the iteration.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
155
156
157
class FeedParser:
    """A feed-style parser of email.

    Data is pushed in with feed(); close() finishes parsing and returns the
    root message object.  Parsing is driven by the generator returned from
    _parsegen(), which yields NeedMoreData whenever the input buffer has no
    complete line available.  Problems found while parsing are recorded as
    defect objects on the affected message's .defects list.
    """

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages under construction; the last element is the
        # innermost one.
        self._msgstack = []
        # Bound next() of the parsing generator.  The generator body does not
        # start running until this is called for the first time.
        self._parse = self._parsegen().next
        # The message currently being parsed (top of self._msgstack).
        self._cur = None
        # The most recently created message, or (after a subpart has been
        # popped) the enclosing multipart; used for trailing-newline cleanup.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        """Make the parser store everything after the headers as payload."""
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parsing generator one step.

        StopIteration (the generator has finished) is deliberately ignored.
        """
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message with the factory and make it the current one.

        The new message is attached to the message on top of the stack (if
        there is one) and then pushed onto the stack itself.
        """
        msg = self._factory()
        # Parts created while the current message is a multipart/digest
        # default to message/rfc822.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the top message; its parent becomes current.

        self._cur is set to None when the stack becomes empty.
        """
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (headers, then body).

        Lines are read from self._input.  NeedMoreData is yielded every time
        the input has no complete line available; the caller resumes the
        generator after feeding more data.  Nested messages and multipart
        subparts are parsed by recursive _parsegen() generators.  The message
        created here is left on self._msgstack for the caller to pop.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                # A blank line acts as a false EOF for the nested parse.
                self._input.push_eof_matcher(NLCRE.match)
                # Drive the nested generator to completion, relaying its
                # requests for more data.  Every yield visible in this method
                # yields NeedMoreData, so the trailing `break` is only a
                # safeguard.
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                # The popped message is not used further here; it has already
                # been attached to its parent by _new_message().
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            # Becomes the line ending matched after the end boundary (None if
            # there was none); stays False if no end boundary is seen.
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Put the boundary line back so the next iteration
                        # handles it as a normal inter-part boundary.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        # Only string payloads are trimmed.
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): this loop drains the remaining input without
                # appending anything to `epilogue`, so the epilogue assigned
                # below is always the empty string -- confirm this is
                # intentional before changing it.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header `lines` on the current message.

        `lines` is the list of raw header lines (line endings included)
        collected by _parsegen().  Continuation lines are folded into the
        preceding header's value, a leading "From " line is stored as the
        unix-from envelope header, and malformed or misplaced lines are
        recorded as defects on the current message.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A new (non-continuation) line ends the header collected so far.
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Drop the final character of the joined value, then strip
                # any CR/LF characters still left at the end.
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
506