# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""

__all__ = ['FeedParser']

import re

from email import errors
from email import message

# Raw strings: the regex engine itself understands \r, \n and \Z, and a raw
# literal avoids the invalid string escape '\Z'.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned/yielded when no complete line is available yet and the
# input has not been closed.
NeedMoreData = object()

# The type used to recognise a string payload: `basestring` where that builtin
# exists, plain `str` otherwise.
try:
    _string_types = basestring
except NameError:
    _string_types = str


class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # Chunks of the last partial line pushed into this object.
        self._partial = []
        # The list of full, pushed lines, in reverse order
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a false-EOF predicate; it is called with each line read."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any buffered partial line and mark the input as closed."""
        # Don't forget any trailing partial line.
        self.pushlines(''.join(self._partial).splitlines(True))
        self._partial = []
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed.  When any pushed predicate matches the
        line, the line is put back and '' is returned instead.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Put `line` back so that the next readline() returns it."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Crack into lines, but preserve the linesep characters on the end of
        # each
        parts = data.splitlines(True)

        if not parts or not parts[0].endswith(('\n', '\r')):
            # No new complete lines, so just accumulate partials
            self._partial += parts
            return

        if self._partial:
            # If there are previous leftovers, complete them now
            self._partial.append(parts[0])
            parts[0:1] = ''.join(self._partial).splitlines(True)
            del self._partial[:]

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial = [parts.pop()]
        self.pushlines(parts)

    # NOTE: a stale second copy of the pre-splitlines push() body used to sit
    # here under the name `pushlines`.  It read an undefined `data`, was
    # shadowed by the definition below, and has been removed.
    def pushlines(self, lines):
        """Queue complete `lines` (in order) behind the lines already held."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop iterating at (false) EOF.

        NeedMoreData is passed through as an ordinary item, so consumers must
        check for it.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Python 3 spelling of the iterator protocol.
    __next__ = next


class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        self._msgstack = []
        # The generator is driven one step at a time through self._parse();
        # the builtin next() is used so this works with either iterator
        # protocol spelling.
        parsegen = self._parsegen()
        self._parse = lambda: next(parsegen)
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parse generator; a finished generator is not an error."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to the current top of the stack (if
        any), push it, and make it both the current and the last message."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the top message; the new top (or None) becomes
        the current message."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        Yields NeedMoreData whenever the input buffer has no complete line
        available; it is resumed by the next feed() or close().
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, _string_types):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # The remaining input is drained but not collected, so the
                # epilogue set below is always the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header `lines` on the current message.

        Continuation lines are joined to the preceding header.  Malformed or
        misplaced lines are recorded as defects on the current message rather
        than raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')