1"""Header value parser implementing various email-related RFC parsing rules. 2 3The parsing methods defined in this module implement various email related 4parsing rules. Principal among them is RFC 5322, which is the followon 5to RFC 2822 and primarily a clarification of the former. It also implements 6RFC 2047 encoded word decoding. 7 8RFC 5322 goes to considerable trouble to maintain backward compatibility with 9RFC 822 in the parse phase, while cleaning up the structure on the generation 10phase. This parser supports correct RFC 5322 generation by tagging white space 11as folding white space only when folding is allowed in the non-obsolete rule 12sets. Actually, the parser is even more generous when accepting input than RFC 135322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages. 14Where possible deviations from the standard are annotated on the 'defects' 15attribute of tokens that deviate. 16 17The general structure of the parser follows RFC 5322, and uses its terminology 18where there is a direct correspondence. Where the implementation requires a 19somewhat different structure than that used by the formal grammar, new terms 20that mimic the closest existing terms are used. Thus, it really helps to have 21a copy of RFC 5322 handy when studying this code. 22 23Input to the parser is a string that has already been unfolded according to 24RFC 5322 rules. According to the RFC this unfolding is the very first step, and 25this parser leaves the unfolding step to a higher level message parser, which 26will have already detected the line breaks that need unfolding while 27determining the beginning and end of each header. 28 29The output of the parser is a TokenList object, which is a list subclass. A 30TokenList is a recursive data structure. The terminal nodes of the structure 31are Terminal objects, which are subclasses of str. 
These do not correspond 32directly to terminal objects in the formal grammar, but are instead more 33practical higher level combinations of true terminals. 34 35All TokenList and Terminal objects have a 'value' attribute, which produces the 36semantically meaningful value of that part of the parse subtree. The value of 37all whitespace tokens (no matter how many sub-tokens they may contain) is a 38single space, as per the RFC rules. This includes 'CFWS', which is herein 39included in the general class of whitespace tokens. There is one exception to 40the rule that whitespace tokens are collapsed into single spaces in values: in 41the value of a 'bare-quoted-string' (a quoted-string with no leading or 42trailing whitespace), any whitespace that appeared between the quotation marks 43is preserved in the returned value. Note that in all Terminal strings quoted 44pairs are turned into their unquoted values. 45 46All TokenList and Terminal objects also have a string value, which attempts to 47be a "canonical" representation of the RFC-compliant form of the substring that 48produced the parsed subtree, including minimal use of quoted pair quoting. 49Whitespace runs are not collapsed. 50 51Comment tokens also have a 'content' attribute providing the string found 52between the parens (including any nested comments) with whitespace preserved. 53 54All TokenList and Terminal objects have a 'defects' attribute which is a 55possibly empty list all of the defects found while creating the token. Defects 56may appear on any token in the tree, and a composite list of all defects in the 57subtree is available through the 'all_defects' attribute of any node. (For 58Terminal notes x.defects == x.all_defects.) 59 60Each object in a parse tree is called a 'token', and each has a 'token_type' 61attribute that gives the name from the RFC 5322 grammar that it represents. 62Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that 63may be produced: 'ptext'. 
A 'ptext' is a string of printable ascii characters. 64It is returned in place of lists of (ctext/quoted-pair) and 65(qtext/quoted-pair). 66 67XXX: provide complete list of token types. 68""" 69 70import re 71import urllib # For urllib.parse.unquote 72from string import hexdigits 73from collections import OrderedDict 74from operator import itemgetter 75from email import _encoded_words as _ew 76from email import errors 77from email import utils 78 79# 80# Useful constants and functions 81# 82 83WSP = set(' \t') 84CFWS_LEADER = WSP | set('(') 85SPECIALS = set(r'()<>@,:;.\"[]') 86ATOM_ENDS = SPECIALS | WSP 87DOT_ATOM_ENDS = ATOM_ENDS - set('.') 88# '.', '"', and '(' do not end phrases in order to support obs-phrase 89PHRASE_ENDS = SPECIALS - set('."(') 90TSPECIALS = (SPECIALS | set('/?=')) - set('.') 91TOKEN_ENDS = TSPECIALS | WSP 92ASPECIALS = TSPECIALS | set("*'%") 93ATTRIBUTE_ENDS = ASPECIALS | WSP 94EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') 95 96def quote_string(value): 97 return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' 98 99# 100# TokenList and its subclasses 101# 102 103class TokenList(list): 104 105 token_type = None 106 syntactic_break = True 107 ew_combine_allowed = True 108 109 def __init__(self, *args, **kw): 110 super().__init__(*args, **kw) 111 self.defects = [] 112 113 def __str__(self): 114 return ''.join(str(x) for x in self) 115 116 def __repr__(self): 117 return '{}({})'.format(self.__class__.__name__, 118 super().__repr__()) 119 120 @property 121 def value(self): 122 return ''.join(x.value for x in self if x.value) 123 124 @property 125 def all_defects(self): 126 return sum((x.all_defects for x in self), self.defects) 127 128 def startswith_fws(self): 129 return self[0].startswith_fws() 130 131 @property 132 def as_ew_allowed(self): 133 """True if all top level tokens of this part may be RFC2047 encoded.""" 134 return all(part.as_ew_allowed for part in self) 135 136 @property 137 def comments(self): 138 comments = [] 
139 for token in self: 140 comments.extend(token.comments) 141 return comments 142 143 def fold(self, *, policy): 144 return _refold_parse_tree(self, policy=policy) 145 146 def pprint(self, indent=''): 147 print(self.ppstr(indent=indent)) 148 149 def ppstr(self, indent=''): 150 return '\n'.join(self._pp(indent=indent)) 151 152 def _pp(self, indent=''): 153 yield '{}{}/{}('.format( 154 indent, 155 self.__class__.__name__, 156 self.token_type) 157 for token in self: 158 if not hasattr(token, '_pp'): 159 yield (indent + ' !! invalid element in token ' 160 'list: {!r}'.format(token)) 161 else: 162 yield from token._pp(indent+' ') 163 if self.defects: 164 extra = ' Defects: {}'.format(self.defects) 165 else: 166 extra = '' 167 yield '{}){}'.format(indent, extra) 168 169 170class WhiteSpaceTokenList(TokenList): 171 172 @property 173 def value(self): 174 return ' ' 175 176 @property 177 def comments(self): 178 return [x.content for x in self if x.token_type=='comment'] 179 180 181class UnstructuredTokenList(TokenList): 182 183 token_type = 'unstructured' 184 185 186class Phrase(TokenList): 187 188 token_type = 'phrase' 189 190class Word(TokenList): 191 192 token_type = 'word' 193 194 195class CFWSList(WhiteSpaceTokenList): 196 197 token_type = 'cfws' 198 199 200class Atom(TokenList): 201 202 token_type = 'atom' 203 204 205class Token(TokenList): 206 207 token_type = 'token' 208 encode_as_ew = False 209 210 211class EncodedWord(TokenList): 212 213 token_type = 'encoded-word' 214 cte = None 215 charset = None 216 lang = None 217 218 219class QuotedString(TokenList): 220 221 token_type = 'quoted-string' 222 223 @property 224 def content(self): 225 for x in self: 226 if x.token_type == 'bare-quoted-string': 227 return x.value 228 229 @property 230 def quoted_value(self): 231 res = [] 232 for x in self: 233 if x.token_type == 'bare-quoted-string': 234 res.append(str(x)) 235 else: 236 res.append(x.value) 237 return ''.join(res) 238 239 @property 240 def stripped_value(self): 
241 for token in self: 242 if token.token_type == 'bare-quoted-string': 243 return token.value 244 245 246class BareQuotedString(QuotedString): 247 248 token_type = 'bare-quoted-string' 249 250 def __str__(self): 251 return quote_string(''.join(str(x) for x in self)) 252 253 @property 254 def value(self): 255 return ''.join(str(x) for x in self) 256 257 258class Comment(WhiteSpaceTokenList): 259 260 token_type = 'comment' 261 262 def __str__(self): 263 return ''.join(sum([ 264 ["("], 265 [self.quote(x) for x in self], 266 [")"], 267 ], [])) 268 269 def quote(self, value): 270 if value.token_type == 'comment': 271 return str(value) 272 return str(value).replace('\\', '\\\\').replace( 273 '(', r'\(').replace( 274 ')', r'\)') 275 276 @property 277 def content(self): 278 return ''.join(str(x) for x in self) 279 280 @property 281 def comments(self): 282 return [self.content] 283 284class AddressList(TokenList): 285 286 token_type = 'address-list' 287 288 @property 289 def addresses(self): 290 return [x for x in self if x.token_type=='address'] 291 292 @property 293 def mailboxes(self): 294 return sum((x.mailboxes 295 for x in self if x.token_type=='address'), []) 296 297 @property 298 def all_mailboxes(self): 299 return sum((x.all_mailboxes 300 for x in self if x.token_type=='address'), []) 301 302 303class Address(TokenList): 304 305 token_type = 'address' 306 307 @property 308 def display_name(self): 309 if self[0].token_type == 'group': 310 return self[0].display_name 311 312 @property 313 def mailboxes(self): 314 if self[0].token_type == 'mailbox': 315 return [self[0]] 316 elif self[0].token_type == 'invalid-mailbox': 317 return [] 318 return self[0].mailboxes 319 320 @property 321 def all_mailboxes(self): 322 if self[0].token_type == 'mailbox': 323 return [self[0]] 324 elif self[0].token_type == 'invalid-mailbox': 325 return [self[0]] 326 return self[0].all_mailboxes 327 328class MailboxList(TokenList): 329 330 token_type = 'mailbox-list' 331 332 @property 333 def 
mailboxes(self): 334 return [x for x in self if x.token_type=='mailbox'] 335 336 @property 337 def all_mailboxes(self): 338 return [x for x in self 339 if x.token_type in ('mailbox', 'invalid-mailbox')] 340 341 342class GroupList(TokenList): 343 344 token_type = 'group-list' 345 346 @property 347 def mailboxes(self): 348 if not self or self[0].token_type != 'mailbox-list': 349 return [] 350 return self[0].mailboxes 351 352 @property 353 def all_mailboxes(self): 354 if not self or self[0].token_type != 'mailbox-list': 355 return [] 356 return self[0].all_mailboxes 357 358 359class Group(TokenList): 360 361 token_type = "group" 362 363 @property 364 def mailboxes(self): 365 if self[2].token_type != 'group-list': 366 return [] 367 return self[2].mailboxes 368 369 @property 370 def all_mailboxes(self): 371 if self[2].token_type != 'group-list': 372 return [] 373 return self[2].all_mailboxes 374 375 @property 376 def display_name(self): 377 return self[0].display_name 378 379 380class NameAddr(TokenList): 381 382 token_type = 'name-addr' 383 384 @property 385 def display_name(self): 386 if len(self) == 1: 387 return None 388 return self[0].display_name 389 390 @property 391 def local_part(self): 392 return self[-1].local_part 393 394 @property 395 def domain(self): 396 return self[-1].domain 397 398 @property 399 def route(self): 400 return self[-1].route 401 402 @property 403 def addr_spec(self): 404 return self[-1].addr_spec 405 406 407class AngleAddr(TokenList): 408 409 token_type = 'angle-addr' 410 411 @property 412 def local_part(self): 413 for x in self: 414 if x.token_type == 'addr-spec': 415 return x.local_part 416 417 @property 418 def domain(self): 419 for x in self: 420 if x.token_type == 'addr-spec': 421 return x.domain 422 423 @property 424 def route(self): 425 for x in self: 426 if x.token_type == 'obs-route': 427 return x.domains 428 429 @property 430 def addr_spec(self): 431 for x in self: 432 if x.token_type == 'addr-spec': 433 if x.local_part: 434 
return x.addr_spec 435 else: 436 return quote_string(x.local_part) + x.addr_spec 437 else: 438 return '<>' 439 440 441class ObsRoute(TokenList): 442 443 token_type = 'obs-route' 444 445 @property 446 def domains(self): 447 return [x.domain for x in self if x.token_type == 'domain'] 448 449 450class Mailbox(TokenList): 451 452 token_type = 'mailbox' 453 454 @property 455 def display_name(self): 456 if self[0].token_type == 'name-addr': 457 return self[0].display_name 458 459 @property 460 def local_part(self): 461 return self[0].local_part 462 463 @property 464 def domain(self): 465 return self[0].domain 466 467 @property 468 def route(self): 469 if self[0].token_type == 'name-addr': 470 return self[0].route 471 472 @property 473 def addr_spec(self): 474 return self[0].addr_spec 475 476 477class InvalidMailbox(TokenList): 478 479 token_type = 'invalid-mailbox' 480 481 @property 482 def display_name(self): 483 return None 484 485 local_part = domain = route = addr_spec = display_name 486 487 488class Domain(TokenList): 489 490 token_type = 'domain' 491 as_ew_allowed = False 492 493 @property 494 def domain(self): 495 return ''.join(super().value.split()) 496 497 498class DotAtom(TokenList): 499 500 token_type = 'dot-atom' 501 502 503class DotAtomText(TokenList): 504 505 token_type = 'dot-atom-text' 506 as_ew_allowed = True 507 508 509class AddrSpec(TokenList): 510 511 token_type = 'addr-spec' 512 as_ew_allowed = False 513 514 @property 515 def local_part(self): 516 return self[0].local_part 517 518 @property 519 def domain(self): 520 if len(self) < 3: 521 return None 522 return self[-1].domain 523 524 @property 525 def value(self): 526 if len(self) < 3: 527 return self[0].value 528 return self[0].value.rstrip()+self[1].value+self[2].value.lstrip() 529 530 @property 531 def addr_spec(self): 532 nameset = set(self.local_part) 533 if len(nameset) > len(nameset-DOT_ATOM_ENDS): 534 lp = quote_string(self.local_part) 535 else: 536 lp = self.local_part 537 if self.domain is 
not None: 538 return lp + '@' + self.domain 539 return lp 540 541 542class ObsLocalPart(TokenList): 543 544 token_type = 'obs-local-part' 545 as_ew_allowed = False 546 547 548class DisplayName(Phrase): 549 550 token_type = 'display-name' 551 ew_combine_allowed = False 552 553 @property 554 def display_name(self): 555 res = TokenList(self) 556 if res[0].token_type == 'cfws': 557 res.pop(0) 558 else: 559 if res[0][0].token_type == 'cfws': 560 res[0] = TokenList(res[0][1:]) 561 if res[-1].token_type == 'cfws': 562 res.pop() 563 else: 564 if res[-1][-1].token_type == 'cfws': 565 res[-1] = TokenList(res[-1][:-1]) 566 return res.value 567 568 @property 569 def value(self): 570 quote = False 571 if self.defects: 572 quote = True 573 else: 574 for x in self: 575 if x.token_type == 'quoted-string': 576 quote = True 577 if quote: 578 pre = post = '' 579 if self[0].token_type=='cfws' or self[0][0].token_type=='cfws': 580 pre = ' ' 581 if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws': 582 post = ' ' 583 return pre+quote_string(self.display_name)+post 584 else: 585 return super().value 586 587 588class LocalPart(TokenList): 589 590 token_type = 'local-part' 591 as_ew_allowed = False 592 593 @property 594 def value(self): 595 if self[0].token_type == "quoted-string": 596 return self[0].quoted_value 597 else: 598 return self[0].value 599 600 @property 601 def local_part(self): 602 # Strip whitespace from front, back, and around dots. 
603 res = [DOT] 604 last = DOT 605 last_is_tl = False 606 for tok in self[0] + [DOT]: 607 if tok.token_type == 'cfws': 608 continue 609 if (last_is_tl and tok.token_type == 'dot' and 610 last[-1].token_type == 'cfws'): 611 res[-1] = TokenList(last[:-1]) 612 is_tl = isinstance(tok, TokenList) 613 if (is_tl and last.token_type == 'dot' and 614 tok[0].token_type == 'cfws'): 615 res.append(TokenList(tok[1:])) 616 else: 617 res.append(tok) 618 last = res[-1] 619 last_is_tl = is_tl 620 res = TokenList(res[1:-1]) 621 return res.value 622 623 624class DomainLiteral(TokenList): 625 626 token_type = 'domain-literal' 627 as_ew_allowed = False 628 629 @property 630 def domain(self): 631 return ''.join(super().value.split()) 632 633 @property 634 def ip(self): 635 for x in self: 636 if x.token_type == 'ptext': 637 return x.value 638 639 640class MIMEVersion(TokenList): 641 642 token_type = 'mime-version' 643 major = None 644 minor = None 645 646 647class Parameter(TokenList): 648 649 token_type = 'parameter' 650 sectioned = False 651 extended = False 652 charset = 'us-ascii' 653 654 @property 655 def section_number(self): 656 # Because the first token, the attribute (name) eats CFWS, the second 657 # token is always the section if there is one. 658 return self[1].number if self.sectioned else 0 659 660 @property 661 def param_value(self): 662 # This is part of the "handle quoted extended parameters" hack. 
663 for token in self: 664 if token.token_type == 'value': 665 return token.stripped_value 666 if token.token_type == 'quoted-string': 667 for token in token: 668 if token.token_type == 'bare-quoted-string': 669 for token in token: 670 if token.token_type == 'value': 671 return token.stripped_value 672 return '' 673 674 675class InvalidParameter(Parameter): 676 677 token_type = 'invalid-parameter' 678 679 680class Attribute(TokenList): 681 682 token_type = 'attribute' 683 684 @property 685 def stripped_value(self): 686 for token in self: 687 if token.token_type.endswith('attrtext'): 688 return token.value 689 690class Section(TokenList): 691 692 token_type = 'section' 693 number = None 694 695 696class Value(TokenList): 697 698 token_type = 'value' 699 700 @property 701 def stripped_value(self): 702 token = self[0] 703 if token.token_type == 'cfws': 704 token = self[1] 705 if token.token_type.endswith( 706 ('quoted-string', 'attribute', 'extended-attribute')): 707 return token.stripped_value 708 return self.value 709 710 711class MimeParameters(TokenList): 712 713 token_type = 'mime-parameters' 714 syntactic_break = False 715 716 @property 717 def params(self): 718 # The RFC specifically states that the ordering of parameters is not 719 # guaranteed and may be reordered by the transport layer. So we have 720 # to assume the RFC 2231 pieces can come in any order. However, we 721 # output them in the order that we first see a given name, which gives 722 # us a stable __str__. 
723 params = OrderedDict() 724 for token in self: 725 if not token.token_type.endswith('parameter'): 726 continue 727 if token[0].token_type != 'attribute': 728 continue 729 name = token[0].value.strip() 730 if name not in params: 731 params[name] = [] 732 params[name].append((token.section_number, token)) 733 for name, parts in params.items(): 734 parts = sorted(parts, key=itemgetter(0)) 735 first_param = parts[0][1] 736 charset = first_param.charset 737 # Our arbitrary error recovery is to ignore duplicate parameters, 738 # to use appearance order if there are duplicate rfc 2231 parts, 739 # and to ignore gaps. This mimics the error recovery of get_param. 740 if not first_param.extended and len(parts) > 1: 741 if parts[1][0] == 0: 742 parts[1][1].defects.append(errors.InvalidHeaderDefect( 743 'duplicate parameter name; duplicate(s) ignored')) 744 parts = parts[:1] 745 # Else assume the *0* was missing...note that this is different 746 # from get_param, but we registered a defect for this earlier. 747 value_parts = [] 748 i = 0 749 for section_number, param in parts: 750 if section_number != i: 751 # We could get fancier here and look for a complete 752 # duplicate extended parameter and ignore the second one 753 # seen. But we're not doing that. The old code didn't. 754 if not param.extended: 755 param.defects.append(errors.InvalidHeaderDefect( 756 'duplicate parameter name; duplicate ignored')) 757 continue 758 else: 759 param.defects.append(errors.InvalidHeaderDefect( 760 "inconsistent RFC2231 parameter numbering")) 761 i += 1 762 value = param.param_value 763 if param.extended: 764 try: 765 value = urllib.parse.unquote_to_bytes(value) 766 except UnicodeEncodeError: 767 # source had surrogate escaped bytes. What we do now 768 # is a bit of an open question. 
I'm not sure this is 769 # the best choice, but it is what the old algorithm did 770 value = urllib.parse.unquote(value, encoding='latin-1') 771 else: 772 try: 773 value = value.decode(charset, 'surrogateescape') 774 except LookupError: 775 # XXX: there should really be a custom defect for 776 # unknown character set to make it easy to find, 777 # because otherwise unknown charset is a silent 778 # failure. 779 value = value.decode('us-ascii', 'surrogateescape') 780 if utils._has_surrogates(value): 781 param.defects.append(errors.UndecodableBytesDefect()) 782 value_parts.append(value) 783 value = ''.join(value_parts) 784 yield name, value 785 786 def __str__(self): 787 params = [] 788 for name, value in self.params: 789 if value: 790 params.append('{}={}'.format(name, quote_string(value))) 791 else: 792 params.append(name) 793 params = '; '.join(params) 794 return ' ' + params if params else '' 795 796 797class ParameterizedHeaderValue(TokenList): 798 799 # Set this false so that the value doesn't wind up on a new line even 800 # if it and the parameters would fit there but not on the first line. 
801 syntactic_break = False 802 803 @property 804 def params(self): 805 for token in reversed(self): 806 if token.token_type == 'mime-parameters': 807 return token.params 808 return {} 809 810 811class ContentType(ParameterizedHeaderValue): 812 813 token_type = 'content-type' 814 as_ew_allowed = False 815 maintype = 'text' 816 subtype = 'plain' 817 818 819class ContentDisposition(ParameterizedHeaderValue): 820 821 token_type = 'content-disposition' 822 as_ew_allowed = False 823 content_disposition = None 824 825 826class ContentTransferEncoding(TokenList): 827 828 token_type = 'content-transfer-encoding' 829 as_ew_allowed = False 830 cte = '7bit' 831 832 833class HeaderLabel(TokenList): 834 835 token_type = 'header-label' 836 as_ew_allowed = False 837 838 839class Header(TokenList): 840 841 token_type = 'header' 842 843 844# 845# Terminal classes and instances 846# 847 848class Terminal(str): 849 850 as_ew_allowed = True 851 ew_combine_allowed = True 852 syntactic_break = True 853 854 def __new__(cls, value, token_type): 855 self = super().__new__(cls, value) 856 self.token_type = token_type 857 self.defects = [] 858 return self 859 860 def __repr__(self): 861 return "{}({})".format(self.__class__.__name__, super().__repr__()) 862 863 def pprint(self): 864 print(self.__class__.__name__ + '/' + self.token_type) 865 866 @property 867 def all_defects(self): 868 return list(self.defects) 869 870 def _pp(self, indent=''): 871 return ["{}{}/{}({}){}".format( 872 indent, 873 self.__class__.__name__, 874 self.token_type, 875 super().__repr__(), 876 '' if not self.defects else ' {}'.format(self.defects), 877 )] 878 879 def pop_trailing_ws(self): 880 # This terminates the recursion. 
881 return None 882 883 @property 884 def comments(self): 885 return [] 886 887 def __getnewargs__(self): 888 return(str(self), self.token_type) 889 890 891class WhiteSpaceTerminal(Terminal): 892 893 @property 894 def value(self): 895 return ' ' 896 897 def startswith_fws(self): 898 return True 899 900 901class ValueTerminal(Terminal): 902 903 @property 904 def value(self): 905 return self 906 907 def startswith_fws(self): 908 return False 909 910 911class EWWhiteSpaceTerminal(WhiteSpaceTerminal): 912 913 @property 914 def value(self): 915 return '' 916 917 def __str__(self): 918 return '' 919 920 921# XXX these need to become classes and used as instances so 922# that a program can't change them in a parse tree and screw 923# up other parse trees. Maybe should have tests for that, too. 924DOT = ValueTerminal('.', 'dot') 925ListSeparator = ValueTerminal(',', 'list-separator') 926RouteComponentMarker = ValueTerminal('@', 'route-component-marker') 927 928# 929# Parser 930# 931 932# Parse strings according to RFC822/2047/2822/5322 rules. 933# 934# This is a stateless parser. Each get_XXX function accepts a string and 935# returns either a Terminal or a TokenList representing the RFC object named 936# by the method and a string containing the remaining unparsed characters 937# from the input. Thus a parser method consumes the next syntactic construct 938# of a given type and returns a token representing the construct plus the 939# unparsed remainder of the input string. 940# 941# For example, if the first element of a structured header is a 'phrase', 942# then: 943# 944# phrase, value = get_phrase(value) 945# 946# returns the complete phrase from the start of the string value, plus any 947# characters left in the string after the phrase is removed. 
948 949_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split 950_non_atom_end_matcher = re.compile(r"[^{}]+".format( 951 re.escape(''.join(ATOM_ENDS)))).match 952_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall 953_non_token_end_matcher = re.compile(r"[^{}]+".format( 954 re.escape(''.join(TOKEN_ENDS)))).match 955_non_attribute_end_matcher = re.compile(r"[^{}]+".format( 956 re.escape(''.join(ATTRIBUTE_ENDS)))).match 957_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format( 958 re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match 959 960def _validate_xtext(xtext): 961 """If input token contains ASCII non-printables, register a defect.""" 962 963 non_printables = _non_printable_finder(xtext) 964 if non_printables: 965 xtext.defects.append(errors.NonPrintableDefect(non_printables)) 966 if utils._has_surrogates(xtext): 967 xtext.defects.append(errors.UndecodableBytesDefect( 968 "Non-ASCII characters found in header token")) 969 970def _get_ptext_to_endchars(value, endchars): 971 """Scan printables/quoted-pairs until endchars and return unquoted ptext. 972 973 This function turns a run of qcontent, ccontent-without-comments, or 974 dtext-with-quoted-printables into a single string by unquoting any 975 quoted printables. It returns the string, the remaining value, and 976 a flag that is True iff there were any quoted printables decoded. 977 978 """ 979 fragment, *remainder = _wsp_splitter(value, 1) 980 vchars = [] 981 escape = False 982 had_qp = False 983 for pos in range(len(fragment)): 984 if fragment[pos] == '\\': 985 if escape: 986 escape = False 987 had_qp = True 988 else: 989 escape = True 990 continue 991 if escape: 992 escape = False 993 elif fragment[pos] in endchars: 994 break 995 vchars.append(fragment[pos]) 996 else: 997 pos = pos + 1 998 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp 999 1000def get_fws(value): 1001 """FWS = 1*WSP 1002 1003 This isn't the RFC definition. 
We're using fws to represent tokens where 1004 folding can be done, but when we are parsing the *un*folding has already 1005 been done so we don't need to watch out for CRLF. 1006 1007 """ 1008 newvalue = value.lstrip() 1009 fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws') 1010 return fws, newvalue 1011 1012def get_encoded_word(value): 1013 """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" 1014 1015 """ 1016 ew = EncodedWord() 1017 if not value.startswith('=?'): 1018 raise errors.HeaderParseError( 1019 "expected encoded word but found {}".format(value)) 1020 tok, *remainder = value[2:].split('?=', 1) 1021 if tok == value[2:]: 1022 raise errors.HeaderParseError( 1023 "expected encoded word but found {}".format(value)) 1024 remstr = ''.join(remainder) 1025 if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits: 1026 # The ? after the CTE was followed by an encoded word escape (=XX). 1027 rest, *remainder = remstr.split('?=', 1) 1028 tok = tok + '?=' + rest 1029 if len(tok.split()) > 1: 1030 ew.defects.append(errors.InvalidHeaderDefect( 1031 "whitespace inside encoded word")) 1032 ew.cte = value 1033 value = ''.join(remainder) 1034 try: 1035 text, charset, lang, defects = _ew.decode('=?' 
+ tok + '?=') 1036 except ValueError: 1037 raise errors.HeaderParseError( 1038 "encoded word format invalid: '{}'".format(ew.cte)) 1039 ew.charset = charset 1040 ew.lang = lang 1041 ew.defects.extend(defects) 1042 while text: 1043 if text[0] in WSP: 1044 token, text = get_fws(text) 1045 ew.append(token) 1046 continue 1047 chars, *remainder = _wsp_splitter(text, 1) 1048 vtext = ValueTerminal(chars, 'vtext') 1049 _validate_xtext(vtext) 1050 ew.append(vtext) 1051 text = ''.join(remainder) 1052 return ew, value 1053 1054def get_unstructured(value): 1055 """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct 1056 obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS) 1057 obs-utext = %d0 / obs-NO-WS-CTL / LF / CR 1058 1059 obs-NO-WS-CTL is control characters except WSP/CR/LF. 1060 1061 So, basically, we have printable runs, plus control characters or nulls in 1062 the obsolete syntax, separated by whitespace. Since RFC 2047 uses the 1063 obsolete syntax in its specification, but requires whitespace on either 1064 side of the encoded words, I can see no reason to need to separate the 1065 non-printable-non-whitespace from the printable runs if they occur, so we 1066 parse this into xtext tokens separated by WSP tokens. 1067 1068 Because an 'unstructured' value must by definition constitute the entire 1069 value, this 'get' routine does not return a remaining value, only the 1070 parsed TokenList. 1071 1072 """ 1073 # XXX: but what about bare CR and LF? They might signal the start or 1074 # end of an encoded word. YAGNI for now, since our current parsers 1075 # will never send us strings with bare CR or LF. 
1076 1077 unstructured = UnstructuredTokenList() 1078 while value: 1079 if value[0] in WSP: 1080 token, value = get_fws(value) 1081 unstructured.append(token) 1082 continue 1083 if value.startswith('=?'): 1084 try: 1085 token, value = get_encoded_word(value) 1086 except errors.HeaderParseError: 1087 # XXX: Need to figure out how to register defects when 1088 # appropriate here. 1089 pass 1090 else: 1091 have_ws = True 1092 if len(unstructured) > 0: 1093 if unstructured[-1].token_type != 'fws': 1094 unstructured.defects.append(errors.InvalidHeaderDefect( 1095 "missing whitespace before encoded word")) 1096 have_ws = False 1097 if have_ws and len(unstructured) > 1: 1098 if unstructured[-2].token_type == 'encoded-word': 1099 unstructured[-1] = EWWhiteSpaceTerminal( 1100 unstructured[-1], 'fws') 1101 unstructured.append(token) 1102 continue 1103 tok, *remainder = _wsp_splitter(value, 1) 1104 vtext = ValueTerminal(tok, 'vtext') 1105 _validate_xtext(vtext) 1106 unstructured.append(vtext) 1107 value = ''.join(remainder) 1108 return unstructured 1109 1110def get_qp_ctext(value): 1111 r"""ctext = <printable ascii except \ ( )> 1112 1113 This is not the RFC ctext, since we are handling nested comments in comment 1114 and unquoting quoted-pairs here. We allow anything except the '()' 1115 characters, but if we find any ASCII other than the RFC defined printable 1116 ASCII, a NonPrintableDefect is added to the token's defects list. Since 1117 quoted pairs are converted to their unquoted values, what is returned is 1118 a 'ptext' token. In this case it is a WhiteSpaceTerminal, so it's value 1119 is ' '. 
1120 1121 """ 1122 ptext, value, _ = _get_ptext_to_endchars(value, '()') 1123 ptext = WhiteSpaceTerminal(ptext, 'ptext') 1124 _validate_xtext(ptext) 1125 return ptext, value 1126 1127def get_qcontent(value): 1128 """qcontent = qtext / quoted-pair 1129 1130 We allow anything except the DQUOTE character, but if we find any ASCII 1131 other than the RFC defined printable ASCII, a NonPrintableDefect is 1132 added to the token's defects list. Any quoted pairs are converted to their 1133 unquoted values, so what is returned is a 'ptext' token. In this case it 1134 is a ValueTerminal. 1135 1136 """ 1137 ptext, value, _ = _get_ptext_to_endchars(value, '"') 1138 ptext = ValueTerminal(ptext, 'ptext') 1139 _validate_xtext(ptext) 1140 return ptext, value 1141 1142def get_atext(value): 1143 """atext = <matches _atext_matcher> 1144 1145 We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to 1146 the token's defects list if we find non-atext characters. 1147 """ 1148 m = _non_atom_end_matcher(value) 1149 if not m: 1150 raise errors.HeaderParseError( 1151 "expected atext but found '{}'".format(value)) 1152 atext = m.group() 1153 value = value[len(atext):] 1154 atext = ValueTerminal(atext, 'atext') 1155 _validate_xtext(atext) 1156 return atext, value 1157 1158def get_bare_quoted_string(value): 1159 """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE 1160 1161 A quoted-string without the leading or trailing white space. Its 1162 value is the text between the quote marks, with whitespace 1163 preserved and quoted pairs decoded. 
1164 """ 1165 if value[0] != '"': 1166 raise errors.HeaderParseError( 1167 "expected '\"' but found '{}'".format(value)) 1168 bare_quoted_string = BareQuotedString() 1169 value = value[1:] 1170 if value[0] == '"': 1171 token, value = get_qcontent(value) 1172 bare_quoted_string.append(token) 1173 while value and value[0] != '"': 1174 if value[0] in WSP: 1175 token, value = get_fws(value) 1176 elif value[:2] == '=?': 1177 try: 1178 token, value = get_encoded_word(value) 1179 bare_quoted_string.defects.append(errors.InvalidHeaderDefect( 1180 "encoded word inside quoted string")) 1181 except errors.HeaderParseError: 1182 token, value = get_qcontent(value) 1183 else: 1184 token, value = get_qcontent(value) 1185 bare_quoted_string.append(token) 1186 if not value: 1187 bare_quoted_string.defects.append(errors.InvalidHeaderDefect( 1188 "end of header inside quoted string")) 1189 return bare_quoted_string, value 1190 return bare_quoted_string, value[1:] 1191 1192def get_comment(value): 1193 """comment = "(" *([FWS] ccontent) [FWS] ")" 1194 ccontent = ctext / quoted-pair / comment 1195 1196 We handle nested comments here, and quoted-pair in our qp-ctext routine. 
1197 """ 1198 if value and value[0] != '(': 1199 raise errors.HeaderParseError( 1200 "expected '(' but found '{}'".format(value)) 1201 comment = Comment() 1202 value = value[1:] 1203 while value and value[0] != ")": 1204 if value[0] in WSP: 1205 token, value = get_fws(value) 1206 elif value[0] == '(': 1207 token, value = get_comment(value) 1208 else: 1209 token, value = get_qp_ctext(value) 1210 comment.append(token) 1211 if not value: 1212 comment.defects.append(errors.InvalidHeaderDefect( 1213 "end of header inside comment")) 1214 return comment, value 1215 return comment, value[1:] 1216 1217def get_cfws(value): 1218 """CFWS = (1*([FWS] comment) [FWS]) / FWS 1219 1220 """ 1221 cfws = CFWSList() 1222 while value and value[0] in CFWS_LEADER: 1223 if value[0] in WSP: 1224 token, value = get_fws(value) 1225 else: 1226 token, value = get_comment(value) 1227 cfws.append(token) 1228 return cfws, value 1229 1230def get_quoted_string(value): 1231 """quoted-string = [CFWS] <bare-quoted-string> [CFWS] 1232 1233 'bare-quoted-string' is an intermediate class defined by this 1234 parser and not by the RFC grammar. It is the quoted string 1235 without any attached CFWS. 1236 """ 1237 quoted_string = QuotedString() 1238 if value and value[0] in CFWS_LEADER: 1239 token, value = get_cfws(value) 1240 quoted_string.append(token) 1241 token, value = get_bare_quoted_string(value) 1242 quoted_string.append(token) 1243 if value and value[0] in CFWS_LEADER: 1244 token, value = get_cfws(value) 1245 quoted_string.append(token) 1246 return quoted_string, value 1247 1248def get_atom(value): 1249 """atom = [CFWS] 1*atext [CFWS] 1250 1251 An atom could be an rfc2047 encoded word. 
1252 """ 1253 atom = Atom() 1254 if value and value[0] in CFWS_LEADER: 1255 token, value = get_cfws(value) 1256 atom.append(token) 1257 if value and value[0] in ATOM_ENDS: 1258 raise errors.HeaderParseError( 1259 "expected atom but found '{}'".format(value)) 1260 if value.startswith('=?'): 1261 try: 1262 token, value = get_encoded_word(value) 1263 except errors.HeaderParseError: 1264 # XXX: need to figure out how to register defects when 1265 # appropriate here. 1266 token, value = get_atext(value) 1267 else: 1268 token, value = get_atext(value) 1269 atom.append(token) 1270 if value and value[0] in CFWS_LEADER: 1271 token, value = get_cfws(value) 1272 atom.append(token) 1273 return atom, value 1274 1275def get_dot_atom_text(value): 1276 """ dot-text = 1*atext *("." 1*atext) 1277 1278 """ 1279 dot_atom_text = DotAtomText() 1280 if not value or value[0] in ATOM_ENDS: 1281 raise errors.HeaderParseError("expected atom at a start of " 1282 "dot-atom-text but found '{}'".format(value)) 1283 while value and value[0] not in ATOM_ENDS: 1284 token, value = get_atext(value) 1285 dot_atom_text.append(token) 1286 if value and value[0] == '.': 1287 dot_atom_text.append(DOT) 1288 value = value[1:] 1289 if dot_atom_text[-1] is DOT: 1290 raise errors.HeaderParseError("expected atom at end of dot-atom-text " 1291 "but found '{}'".format('.'+value)) 1292 return dot_atom_text, value 1293 1294def get_dot_atom(value): 1295 """ dot-atom = [CFWS] dot-atom-text [CFWS] 1296 1297 Any place we can have a dot atom, we could instead have an rfc2047 encoded 1298 word. 1299 """ 1300 dot_atom = DotAtom() 1301 if value[0] in CFWS_LEADER: 1302 token, value = get_cfws(value) 1303 dot_atom.append(token) 1304 if value.startswith('=?'): 1305 try: 1306 token, value = get_encoded_word(value) 1307 except errors.HeaderParseError: 1308 # XXX: need to figure out how to register defects when 1309 # appropriate here. 
1310 token, value = get_dot_atom_text(value) 1311 else: 1312 token, value = get_dot_atom_text(value) 1313 dot_atom.append(token) 1314 if value and value[0] in CFWS_LEADER: 1315 token, value = get_cfws(value) 1316 dot_atom.append(token) 1317 return dot_atom, value 1318 1319def get_word(value): 1320 """word = atom / quoted-string 1321 1322 Either atom or quoted-string may start with CFWS. We have to peel off this 1323 CFWS first to determine which type of word to parse. Afterward we splice 1324 the leading CFWS, if any, into the parsed sub-token. 1325 1326 If neither an atom or a quoted-string is found before the next special, a 1327 HeaderParseError is raised. 1328 1329 The token returned is either an Atom or a QuotedString, as appropriate. 1330 This means the 'word' level of the formal grammar is not represented in the 1331 parse tree; this is because having that extra layer when manipulating the 1332 parse tree is more confusing than it is helpful. 1333 1334 """ 1335 if value[0] in CFWS_LEADER: 1336 leader, value = get_cfws(value) 1337 else: 1338 leader = None 1339 if value[0]=='"': 1340 token, value = get_quoted_string(value) 1341 elif value[0] in SPECIALS: 1342 raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' " 1343 "but found '{}'".format(value)) 1344 else: 1345 token, value = get_atom(value) 1346 if leader is not None: 1347 token[:0] = [leader] 1348 return token, value 1349 1350def get_phrase(value): 1351 """ phrase = 1*word / obs-phrase 1352 obs-phrase = word *(word / "." / CFWS) 1353 1354 This means a phrase can be a sequence of words, periods, and CFWS in any 1355 order as long as it starts with at least one word. If anything other than 1356 words is detected, an ObsoleteHeaderDefect is added to the token's defect 1357 list. We also accept a phrase that starts with CFWS followed by a dot; 1358 this is registered as an InvalidHeaderDefect, since it is not supported by 1359 even the obsolete grammar. 
1360 1361 """ 1362 phrase = Phrase() 1363 try: 1364 token, value = get_word(value) 1365 phrase.append(token) 1366 except errors.HeaderParseError: 1367 phrase.defects.append(errors.InvalidHeaderDefect( 1368 "phrase does not start with word")) 1369 while value and value[0] not in PHRASE_ENDS: 1370 if value[0]=='.': 1371 phrase.append(DOT) 1372 phrase.defects.append(errors.ObsoleteHeaderDefect( 1373 "period in 'phrase'")) 1374 value = value[1:] 1375 else: 1376 try: 1377 token, value = get_word(value) 1378 except errors.HeaderParseError: 1379 if value[0] in CFWS_LEADER: 1380 token, value = get_cfws(value) 1381 phrase.defects.append(errors.ObsoleteHeaderDefect( 1382 "comment found without atom")) 1383 else: 1384 raise 1385 phrase.append(token) 1386 return phrase, value 1387 1388def get_local_part(value): 1389 """ local-part = dot-atom / quoted-string / obs-local-part 1390 1391 """ 1392 local_part = LocalPart() 1393 leader = None 1394 if value[0] in CFWS_LEADER: 1395 leader, value = get_cfws(value) 1396 if not value: 1397 raise errors.HeaderParseError( 1398 "expected local-part but found '{}'".format(value)) 1399 try: 1400 token, value = get_dot_atom(value) 1401 except errors.HeaderParseError: 1402 try: 1403 token, value = get_word(value) 1404 except errors.HeaderParseError: 1405 if value[0] != '\\' and value[0] in PHRASE_ENDS: 1406 raise 1407 token = TokenList() 1408 if leader is not None: 1409 token[:0] = [leader] 1410 local_part.append(token) 1411 if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): 1412 obs_local_part, value = get_obs_local_part(str(local_part) + value) 1413 if obs_local_part.token_type == 'invalid-obs-local-part': 1414 local_part.defects.append(errors.InvalidHeaderDefect( 1415 "local-part is not dot-atom, quoted-string, or obs-local-part")) 1416 else: 1417 local_part.defects.append(errors.ObsoleteHeaderDefect( 1418 "local-part is not a dot-atom (contains CFWS)")) 1419 local_part[0] = obs_local_part 1420 try: 1421 
local_part.value.encode('ascii') 1422 except UnicodeEncodeError: 1423 local_part.defects.append(errors.NonASCIILocalPartDefect( 1424 "local-part contains non-ASCII characters)")) 1425 return local_part, value 1426 1427def get_obs_local_part(value): 1428 """ obs-local-part = word *("." word) 1429 """ 1430 obs_local_part = ObsLocalPart() 1431 last_non_ws_was_dot = False 1432 while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): 1433 if value[0] == '.': 1434 if last_non_ws_was_dot: 1435 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1436 "invalid repeated '.'")) 1437 obs_local_part.append(DOT) 1438 last_non_ws_was_dot = True 1439 value = value[1:] 1440 continue 1441 elif value[0]=='\\': 1442 obs_local_part.append(ValueTerminal(value[0], 1443 'misplaced-special')) 1444 value = value[1:] 1445 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1446 "'\\' character outside of quoted-string/ccontent")) 1447 last_non_ws_was_dot = False 1448 continue 1449 if obs_local_part and obs_local_part[-1].token_type != 'dot': 1450 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1451 "missing '.' between words")) 1452 try: 1453 token, value = get_word(value) 1454 last_non_ws_was_dot = False 1455 except errors.HeaderParseError: 1456 if value[0] not in CFWS_LEADER: 1457 raise 1458 token, value = get_cfws(value) 1459 obs_local_part.append(token) 1460 if (obs_local_part[0].token_type == 'dot' or 1461 obs_local_part[0].token_type=='cfws' and 1462 obs_local_part[1].token_type=='dot'): 1463 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1464 "Invalid leading '.' in local part")) 1465 if (obs_local_part[-1].token_type == 'dot' or 1466 obs_local_part[-1].token_type=='cfws' and 1467 obs_local_part[-2].token_type=='dot'): 1468 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1469 "Invalid trailing '.' 
in local part")) 1470 if obs_local_part.defects: 1471 obs_local_part.token_type = 'invalid-obs-local-part' 1472 return obs_local_part, value 1473 1474def get_dtext(value): 1475 r""" dtext = <printable ascii except \ [ ]> / obs-dtext 1476 obs-dtext = obs-NO-WS-CTL / quoted-pair 1477 1478 We allow anything except the excluded characters, but if we find any 1479 ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is 1480 added to the token's defects list. Quoted pairs are converted to their 1481 unquoted values, so what is returned is a ptext token, in this case a 1482 ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is 1483 added to the returned token's defect list. 1484 1485 """ 1486 ptext, value, had_qp = _get_ptext_to_endchars(value, '[]') 1487 ptext = ValueTerminal(ptext, 'ptext') 1488 if had_qp: 1489 ptext.defects.append(errors.ObsoleteHeaderDefect( 1490 "quoted printable found in domain-literal")) 1491 _validate_xtext(ptext) 1492 return ptext, value 1493 1494def _check_for_early_dl_end(value, domain_literal): 1495 if value: 1496 return False 1497 domain_literal.append(errors.InvalidHeaderDefect( 1498 "end of input inside domain-literal")) 1499 domain_literal.append(ValueTerminal(']', 'domain-literal-end')) 1500 return True 1501 1502def get_domain_literal(value): 1503 """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS] 1504 1505 """ 1506 domain_literal = DomainLiteral() 1507 if value[0] in CFWS_LEADER: 1508 token, value = get_cfws(value) 1509 domain_literal.append(token) 1510 if not value: 1511 raise errors.HeaderParseError("expected domain-literal") 1512 if value[0] != '[': 1513 raise errors.HeaderParseError("expected '[' at start of domain-literal " 1514 "but found '{}'".format(value)) 1515 value = value[1:] 1516 if _check_for_early_dl_end(value, domain_literal): 1517 return domain_literal, value 1518 domain_literal.append(ValueTerminal('[', 'domain-literal-start')) 1519 if value[0] in WSP: 1520 token, 
value = get_fws(value) 1521 domain_literal.append(token) 1522 token, value = get_dtext(value) 1523 domain_literal.append(token) 1524 if _check_for_early_dl_end(value, domain_literal): 1525 return domain_literal, value 1526 if value[0] in WSP: 1527 token, value = get_fws(value) 1528 domain_literal.append(token) 1529 if _check_for_early_dl_end(value, domain_literal): 1530 return domain_literal, value 1531 if value[0] != ']': 1532 raise errors.HeaderParseError("expected ']' at end of domain-literal " 1533 "but found '{}'".format(value)) 1534 domain_literal.append(ValueTerminal(']', 'domain-literal-end')) 1535 value = value[1:] 1536 if value and value[0] in CFWS_LEADER: 1537 token, value = get_cfws(value) 1538 domain_literal.append(token) 1539 return domain_literal, value 1540 1541def get_domain(value): 1542 """ domain = dot-atom / domain-literal / obs-domain 1543 obs-domain = atom *("." atom)) 1544 1545 """ 1546 domain = Domain() 1547 leader = None 1548 if value[0] in CFWS_LEADER: 1549 leader, value = get_cfws(value) 1550 if not value: 1551 raise errors.HeaderParseError( 1552 "expected domain but found '{}'".format(value)) 1553 if value[0] == '[': 1554 token, value = get_domain_literal(value) 1555 if leader is not None: 1556 token[:0] = [leader] 1557 domain.append(token) 1558 return domain, value 1559 try: 1560 token, value = get_dot_atom(value) 1561 except errors.HeaderParseError: 1562 token, value = get_atom(value) 1563 if leader is not None: 1564 token[:0] = [leader] 1565 domain.append(token) 1566 if value and value[0] == '.': 1567 domain.defects.append(errors.ObsoleteHeaderDefect( 1568 "domain is not a dot-atom (contains CFWS)")) 1569 if domain[0].token_type == 'dot-atom': 1570 domain[:] = domain[0] 1571 while value and value[0] == '.': 1572 domain.append(DOT) 1573 token, value = get_atom(value[1:]) 1574 domain.append(token) 1575 return domain, value 1576 1577def get_addr_spec(value): 1578 """ addr-spec = local-part "@" domain 1579 1580 """ 1581 addr_spec = 
AddrSpec() 1582 token, value = get_local_part(value) 1583 addr_spec.append(token) 1584 if not value or value[0] != '@': 1585 addr_spec.defects.append(errors.InvalidHeaderDefect( 1586 "add-spec local part with no domain")) 1587 return addr_spec, value 1588 addr_spec.append(ValueTerminal('@', 'address-at-symbol')) 1589 token, value = get_domain(value[1:]) 1590 addr_spec.append(token) 1591 return addr_spec, value 1592 1593def get_obs_route(value): 1594 """ obs-route = obs-domain-list ":" 1595 obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain]) 1596 1597 Returns an obs-route token with the appropriate sub-tokens (that is, 1598 there is no obs-domain-list in the parse tree). 1599 """ 1600 obs_route = ObsRoute() 1601 while value and (value[0]==',' or value[0] in CFWS_LEADER): 1602 if value[0] in CFWS_LEADER: 1603 token, value = get_cfws(value) 1604 obs_route.append(token) 1605 elif value[0] == ',': 1606 obs_route.append(ListSeparator) 1607 value = value[1:] 1608 if not value or value[0] != '@': 1609 raise errors.HeaderParseError( 1610 "expected obs-route domain but found '{}'".format(value)) 1611 obs_route.append(RouteComponentMarker) 1612 token, value = get_domain(value[1:]) 1613 obs_route.append(token) 1614 while value and value[0]==',': 1615 obs_route.append(ListSeparator) 1616 value = value[1:] 1617 if not value: 1618 break 1619 if value[0] in CFWS_LEADER: 1620 token, value = get_cfws(value) 1621 obs_route.append(token) 1622 if value[0] == '@': 1623 obs_route.append(RouteComponentMarker) 1624 token, value = get_domain(value[1:]) 1625 obs_route.append(token) 1626 if not value: 1627 raise errors.HeaderParseError("end of header while parsing obs-route") 1628 if value[0] != ':': 1629 raise errors.HeaderParseError( "expected ':' marking end of " 1630 "obs-route but found '{}'".format(value)) 1631 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker')) 1632 return obs_route, value[1:] 1633 1634def get_angle_addr(value): 1635 """ angle-addr = 
[CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr 1636 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS] 1637 1638 """ 1639 angle_addr = AngleAddr() 1640 if value[0] in CFWS_LEADER: 1641 token, value = get_cfws(value) 1642 angle_addr.append(token) 1643 if not value or value[0] != '<': 1644 raise errors.HeaderParseError( 1645 "expected angle-addr but found '{}'".format(value)) 1646 angle_addr.append(ValueTerminal('<', 'angle-addr-start')) 1647 value = value[1:] 1648 # Although it is not legal per RFC5322, SMTP uses '<>' in certain 1649 # circumstances. 1650 if value[0] == '>': 1651 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 1652 angle_addr.defects.append(errors.InvalidHeaderDefect( 1653 "null addr-spec in angle-addr")) 1654 value = value[1:] 1655 return angle_addr, value 1656 try: 1657 token, value = get_addr_spec(value) 1658 except errors.HeaderParseError: 1659 try: 1660 token, value = get_obs_route(value) 1661 angle_addr.defects.append(errors.ObsoleteHeaderDefect( 1662 "obsolete route specification in angle-addr")) 1663 except errors.HeaderParseError: 1664 raise errors.HeaderParseError( 1665 "expected addr-spec or obs-route but found '{}'".format(value)) 1666 angle_addr.append(token) 1667 token, value = get_addr_spec(value) 1668 angle_addr.append(token) 1669 if value and value[0] == '>': 1670 value = value[1:] 1671 else: 1672 angle_addr.defects.append(errors.InvalidHeaderDefect( 1673 "missing trailing '>' on angle-addr")) 1674 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 1675 if value and value[0] in CFWS_LEADER: 1676 token, value = get_cfws(value) 1677 angle_addr.append(token) 1678 return angle_addr, value 1679 1680def get_display_name(value): 1681 """ display-name = phrase 1682 1683 Because this is simply a name-rule, we don't return a display-name 1684 token containing a phrase, but rather a display-name token with 1685 the content of the phrase. 
1686 1687 """ 1688 display_name = DisplayName() 1689 token, value = get_phrase(value) 1690 display_name.extend(token[:]) 1691 display_name.defects = token.defects[:] 1692 return display_name, value 1693 1694 1695def get_name_addr(value): 1696 """ name-addr = [display-name] angle-addr 1697 1698 """ 1699 name_addr = NameAddr() 1700 # Both the optional display name and the angle-addr can start with cfws. 1701 leader = None 1702 if value[0] in CFWS_LEADER: 1703 leader, value = get_cfws(value) 1704 if not value: 1705 raise errors.HeaderParseError( 1706 "expected name-addr but found '{}'".format(leader)) 1707 if value[0] != '<': 1708 if value[0] in PHRASE_ENDS: 1709 raise errors.HeaderParseError( 1710 "expected name-addr but found '{}'".format(value)) 1711 token, value = get_display_name(value) 1712 if not value: 1713 raise errors.HeaderParseError( 1714 "expected name-addr but found '{}'".format(token)) 1715 if leader is not None: 1716 token[0][:0] = [leader] 1717 leader = None 1718 name_addr.append(token) 1719 token, value = get_angle_addr(value) 1720 if leader is not None: 1721 token[:0] = [leader] 1722 name_addr.append(token) 1723 return name_addr, value 1724 1725def get_mailbox(value): 1726 """ mailbox = name-addr / addr-spec 1727 1728 """ 1729 # The only way to figure out if we are dealing with a name-addr or an 1730 # addr-spec is to try parsing each one. 1731 mailbox = Mailbox() 1732 try: 1733 token, value = get_name_addr(value) 1734 except errors.HeaderParseError: 1735 try: 1736 token, value = get_addr_spec(value) 1737 except errors.HeaderParseError: 1738 raise errors.HeaderParseError( 1739 "expected mailbox but found '{}'".format(value)) 1740 if any(isinstance(x, errors.InvalidHeaderDefect) 1741 for x in token.all_defects): 1742 mailbox.token_type = 'invalid-mailbox' 1743 mailbox.append(token) 1744 return mailbox, value 1745 1746def get_invalid_mailbox(value, endchars): 1747 """ Read everything up to one of the chars in endchars. 
1748 1749 This is outside the formal grammar. The InvalidMailbox TokenList that is 1750 returned acts like a Mailbox, but the data attributes are None. 1751 1752 """ 1753 invalid_mailbox = InvalidMailbox() 1754 while value and value[0] not in endchars: 1755 if value[0] in PHRASE_ENDS: 1756 invalid_mailbox.append(ValueTerminal(value[0], 1757 'misplaced-special')) 1758 value = value[1:] 1759 else: 1760 token, value = get_phrase(value) 1761 invalid_mailbox.append(token) 1762 return invalid_mailbox, value 1763 1764def get_mailbox_list(value): 1765 """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list 1766 obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS]) 1767 1768 For this routine we go outside the formal grammar in order to improve error 1769 handling. We recognize the end of the mailbox list only at the end of the 1770 value or at a ';' (the group terminator). This is so that we can turn 1771 invalid mailboxes into InvalidMailbox tokens and continue parsing any 1772 remaining valid mailboxes. We also allow all mailbox entries to be null, 1773 and this condition is handled appropriately at a higher level. 
1774 1775 """ 1776 mailbox_list = MailboxList() 1777 while value and value[0] != ';': 1778 try: 1779 token, value = get_mailbox(value) 1780 mailbox_list.append(token) 1781 except errors.HeaderParseError: 1782 leader = None 1783 if value[0] in CFWS_LEADER: 1784 leader, value = get_cfws(value) 1785 if not value or value[0] in ',;': 1786 mailbox_list.append(leader) 1787 mailbox_list.defects.append(errors.ObsoleteHeaderDefect( 1788 "empty element in mailbox-list")) 1789 else: 1790 token, value = get_invalid_mailbox(value, ',;') 1791 if leader is not None: 1792 token[:0] = [leader] 1793 mailbox_list.append(token) 1794 mailbox_list.defects.append(errors.InvalidHeaderDefect( 1795 "invalid mailbox in mailbox-list")) 1796 elif value[0] == ',': 1797 mailbox_list.defects.append(errors.ObsoleteHeaderDefect( 1798 "empty element in mailbox-list")) 1799 else: 1800 token, value = get_invalid_mailbox(value, ',;') 1801 if leader is not None: 1802 token[:0] = [leader] 1803 mailbox_list.append(token) 1804 mailbox_list.defects.append(errors.InvalidHeaderDefect( 1805 "invalid mailbox in mailbox-list")) 1806 if value and value[0] not in ',;': 1807 # Crap after mailbox; treat it as an invalid mailbox. 1808 # The mailbox info will still be available. 
1809 mailbox = mailbox_list[-1] 1810 mailbox.token_type = 'invalid-mailbox' 1811 token, value = get_invalid_mailbox(value, ',;') 1812 mailbox.extend(token) 1813 mailbox_list.defects.append(errors.InvalidHeaderDefect( 1814 "invalid mailbox in mailbox-list")) 1815 if value and value[0] == ',': 1816 mailbox_list.append(ListSeparator) 1817 value = value[1:] 1818 return mailbox_list, value 1819 1820 1821def get_group_list(value): 1822 """ group-list = mailbox-list / CFWS / obs-group-list 1823 obs-group-list = 1*([CFWS] ",") [CFWS] 1824 1825 """ 1826 group_list = GroupList() 1827 if not value: 1828 group_list.defects.append(errors.InvalidHeaderDefect( 1829 "end of header before group-list")) 1830 return group_list, value 1831 leader = None 1832 if value and value[0] in CFWS_LEADER: 1833 leader, value = get_cfws(value) 1834 if not value: 1835 # This should never happen in email parsing, since CFWS-only is a 1836 # legal alternative to group-list in a group, which is the only 1837 # place group-list appears. 
1838 group_list.defects.append(errors.InvalidHeaderDefect( 1839 "end of header in group-list")) 1840 group_list.append(leader) 1841 return group_list, value 1842 if value[0] == ';': 1843 group_list.append(leader) 1844 return group_list, value 1845 token, value = get_mailbox_list(value) 1846 if len(token.all_mailboxes)==0: 1847 if leader is not None: 1848 group_list.append(leader) 1849 group_list.extend(token) 1850 group_list.defects.append(errors.ObsoleteHeaderDefect( 1851 "group-list with empty entries")) 1852 return group_list, value 1853 if leader is not None: 1854 token[:0] = [leader] 1855 group_list.append(token) 1856 return group_list, value 1857 1858def get_group(value): 1859 """ group = display-name ":" [group-list] ";" [CFWS] 1860 1861 """ 1862 group = Group() 1863 token, value = get_display_name(value) 1864 if not value or value[0] != ':': 1865 raise errors.HeaderParseError("expected ':' at end of group " 1866 "display name but found '{}'".format(value)) 1867 group.append(token) 1868 group.append(ValueTerminal(':', 'group-display-name-terminator')) 1869 value = value[1:] 1870 if value and value[0] == ';': 1871 group.append(ValueTerminal(';', 'group-terminator')) 1872 return group, value[1:] 1873 token, value = get_group_list(value) 1874 group.append(token) 1875 if not value: 1876 group.defects.append(errors.InvalidHeaderDefect( 1877 "end of header in group")) 1878 elif value[0] != ';': 1879 raise errors.HeaderParseError( 1880 "expected ';' at end of group but found {}".format(value)) 1881 group.append(ValueTerminal(';', 'group-terminator')) 1882 value = value[1:] 1883 if value and value[0] in CFWS_LEADER: 1884 token, value = get_cfws(value) 1885 group.append(token) 1886 return group, value 1887 1888def get_address(value): 1889 """ address = mailbox / group 1890 1891 Note that counter-intuitively, an address can be either a single address or 1892 a list of addresses (a group). 
This is why the returned Address object has 1893 a 'mailboxes' attribute which treats a single address as a list of length 1894 one. When you need to differentiate between to two cases, extract the single 1895 element, which is either a mailbox or a group token. 1896 1897 """ 1898 # The formal grammar isn't very helpful when parsing an address. mailbox 1899 # and group, especially when allowing for obsolete forms, start off very 1900 # similarly. It is only when you reach one of @, <, or : that you know 1901 # what you've got. So, we try each one in turn, starting with the more 1902 # likely of the two. We could perhaps make this more efficient by looking 1903 # for a phrase and then branching based on the next character, but that 1904 # would be a premature optimization. 1905 address = Address() 1906 try: 1907 token, value = get_group(value) 1908 except errors.HeaderParseError: 1909 try: 1910 token, value = get_mailbox(value) 1911 except errors.HeaderParseError: 1912 raise errors.HeaderParseError( 1913 "expected address but found '{}'".format(value)) 1914 address.append(token) 1915 return address, value 1916 1917def get_address_list(value): 1918 """ address_list = (address *("," address)) / obs-addr-list 1919 obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]) 1920 1921 We depart from the formal grammar here by continuing to parse until the end 1922 of the input, assuming the input to be entirely composed of an 1923 address-list. This is always true in email parsing, and allows us 1924 to skip invalid addresses to parse additional valid ones. 
1925 1926 """ 1927 address_list = AddressList() 1928 while value: 1929 try: 1930 token, value = get_address(value) 1931 address_list.append(token) 1932 except errors.HeaderParseError as err: 1933 leader = None 1934 if value[0] in CFWS_LEADER: 1935 leader, value = get_cfws(value) 1936 if not value or value[0] == ',': 1937 address_list.append(leader) 1938 address_list.defects.append(errors.ObsoleteHeaderDefect( 1939 "address-list entry with no content")) 1940 else: 1941 token, value = get_invalid_mailbox(value, ',') 1942 if leader is not None: 1943 token[:0] = [leader] 1944 address_list.append(Address([token])) 1945 address_list.defects.append(errors.InvalidHeaderDefect( 1946 "invalid address in address-list")) 1947 elif value[0] == ',': 1948 address_list.defects.append(errors.ObsoleteHeaderDefect( 1949 "empty element in address-list")) 1950 else: 1951 token, value = get_invalid_mailbox(value, ',') 1952 if leader is not None: 1953 token[:0] = [leader] 1954 address_list.append(Address([token])) 1955 address_list.defects.append(errors.InvalidHeaderDefect( 1956 "invalid address in address-list")) 1957 if value and value[0] != ',': 1958 # Crap after address; treat it as an invalid mailbox. 1959 # The mailbox info will still be available. 1960 mailbox = address_list[-1][0] 1961 mailbox.token_type = 'invalid-mailbox' 1962 token, value = get_invalid_mailbox(value, ',') 1963 mailbox.extend(token) 1964 address_list.defects.append(errors.InvalidHeaderDefect( 1965 "invalid address in address-list")) 1966 if value: # Must be a , at this point. 1967 address_list.append(ValueTerminal(',', 'list-separator')) 1968 value = value[1:] 1969 return address_list, value 1970 1971# 1972# XXX: As I begin to add additional header parsers, I'm realizing we probably 1973# have two level of parser routines: the get_XXX methods that get a token in 1974# the grammar, and parse_XXX methods that parse an entire field value. 
# get_address_list above should really be a parse_ method, as probably should
# be get_unstructured.
#

def parse_mime_version(value):
    """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]

    Returns a MIMEVersion token list; parse errors are recorded on its
    'defects' list rather than raised.
    """
    # The [CFWS] is implicit in the RFC 2045 BNF.
    # XXX: This routine is a bit verbose, should factor out a get_int method.
    mime_version = MIMEVersion()
    if not value:
        mime_version.defects.append(errors.HeaderMissingRequiredValue(
            "Missing MIME version number (eg: 1.0)"))
        return mime_version
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
        if not value:
            mime_version.defects.append(errors.HeaderMissingRequiredValue(
                "Expected MIME version number but found only CFWS"))
    digits = ''
    while value and value[0] != '.' and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME major version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.major = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value or value[0] != '.':
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        if value:
            mime_version.append(ValueTerminal(value, 'xtext'))
        return mime_version
    mime_version.append(ValueTerminal('.', 'version-separator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value:
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        return mime_version
    digits = ''
    while value and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME minor version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.minor = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if value:
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Excess non-CFWS text after MIME version"))
        mime_version.append(ValueTerminal(value, 'xtext'))
    return mime_version

def get_invalid_parameter(value):
    """ Read everything up to the next ';'.

    This is outside the formal grammar.  The InvalidParameter TokenList that is
    returned acts like a Parameter, but the data attributes are None.

    """
    invalid_parameter = InvalidParameter()
    while value and value[0] != ';':
        if value[0] in PHRASE_ENDS:
            invalid_parameter.append(ValueTerminal(value[0],
                                                   'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_parameter.append(token)
    return invalid_parameter, value

def get_ttext(value):
    """ttext = <matches _ttext_matcher>

    We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
    defects list if we find non-ttext characters.  We also register defects for
    *any* non-printables even though the RFC doesn't exclude all of them,
    because we follow the spirit of RFC 5322.

    """
    m = _non_token_end_matcher(value)
    if not m:
        raise errors.HeaderParseError(
            "expected ttext but found '{}'".format(value))
    ttext = m.group()
    value = value[len(ttext):]
    ttext = ValueTerminal(ttext, 'ttext')
    _validate_xtext(ttext)
    return ttext, value

def get_token(value):
    """token = [CFWS] 1*ttext [CFWS]

    The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
    tspecials.  We also exclude tabs even though the RFC doesn't.

    The RFC implies the CFWS but is not explicit about it in the BNF.

    """
    mtoken = Token()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mtoken.append(token)
    if value and value[0] in TOKEN_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    token, value = get_ttext(value)
    mtoken.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mtoken.append(token)
    return mtoken, value

def get_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character)

    We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
    token's defects list if we find non-attrtext characters.  We also register
    defects for *any* non-printables even though the RFC doesn't exclude all of
    them, because we follow the spirit of RFC 5322.

    """
    m = _non_attribute_end_matcher(value)
    if not m:
        raise errors.HeaderParseError(
            "expected attrtext but found {!r}".format(value))
    attrtext = m.group()
    value = value[len(attrtext):]
    attrtext = ValueTerminal(attrtext, 'attrtext')
    _validate_xtext(attrtext)
    return attrtext, value

def get_attribute(value):
    """ [CFWS] 1*attrtext [CFWS]

    This version of the BNF makes the CFWS explicit, and according to the RFC
    we use a value terminal for the actual run of characters.  The RFC
    equivalent of attrtext is the token characters, with the subtraction of
    '*', "'", and '%'.  We include tab in the excluded set just as we do for
    token.

    """
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        attribute.append(token)
    if value and value[0] in ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    token, value = get_attrtext(value)
    attribute.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        attribute.append(token)
    return attribute, value

def get_extended_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')

    This is a special parsing routine so that we get a value that
    includes % escapes as a single string (which we decode as a single
    string later).

    """
    m = _non_extended_attribute_end_matcher(value)
    if not m:
        raise errors.HeaderParseError(
            "expected extended attrtext but found {!r}".format(value))
    attrtext = m.group()
    value = value[len(attrtext):]
    attrtext = ValueTerminal(attrtext, 'extended-attrtext')
    _validate_xtext(attrtext)
    return attrtext, value

def get_extended_attribute(value):
    """ [CFWS] 1*extended_attrtext [CFWS]

    This is like the non-extended version except we allow % characters, so that
    we can pick up an encoded value as a single string.

    """
    # XXX: should we have an ExtendedAttribute TokenList?
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        attribute.append(token)
    if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    token, value = get_extended_attrtext(value)
    attribute.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        attribute.append(token)
    return attribute, value

def get_section(value):
    """ '*' digits

    The formal BNF is more complicated because leading 0s are not allowed.  We
    check for that and add a defect.  We also assume no CFWS is allowed between
    the '*' and the digits, though the RFC is not crystal clear on that.
    The caller should already have dealt with leading CFWS.

    """
    section = Section()
    if not value or value[0] != '*':
        raise errors.HeaderParseError("Expected section but found {}".format(
                                        value))
    section.append(ValueTerminal('*', 'section-marker'))
    value = value[1:]
    if not value or not value[0].isdigit():
        raise errors.HeaderParseError("Expected section number but "
                                      "found {}".format(value))
    digits = ''
    while value and value[0].isdigit():
        digits += value[0]
        value = value[1:]
    if digits[0] == '0' and digits != '0':
        # Bug fix: email.errors has no InvalidHeaderError class, so the
        # original code raised AttributeError here instead of recording a
        # defect.  InvalidHeaderDefect is the class used everywhere else in
        # this module for this purpose.
        section.defects.append(errors.InvalidHeaderDefect(
                "section number has an invalid leading 0"))
    section.number = int(digits)
    section.append(ValueTerminal(digits, 'digits'))
    return section, value


def get_value(value):
    """ quoted-string / attribute

    """
    v = Value()
    if not value:
        raise errors.HeaderParseError("Expected value but found end of string")
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError("Expected value but found "
                                      "only {}".format(leader))
    if value[0] == '"':
        token, value = get_quoted_string(value)
    else:
        token, value = get_extended_attribute(value)
    if leader is not None:
        token[:0] = [leader]
    v.append(token)
    return v, value

def get_parameter(value):
    """ attribute [section] ["*"] [CFWS] "=" value

    The CFWS is implied by the RFC but not made explicit in the BNF.  This
    simplified form of the BNF from the RFC is made to conform with the RFC BNF
    through some extra checks.  We do it this way because it makes both error
    recovery and working with the resulting parse tree easier.
    """
    # It is possible CFWS would also be implicitly allowed between the section
    # and the 'extended-attribute' marker (the '*') , but we've never seen that
    # in the wild and we will therefore ignore the possibility.
    param = Parameter()
    token, value = get_attribute(value)
    param.append(token)
    if not value or value[0] == ';':
        param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
            "name ({}) but no value".format(token)))
        return param, value
    if value[0] == '*':
        try:
            token, value = get_section(value)
            param.sectioned = True
            param.append(token)
        except errors.HeaderParseError:
            pass
        if not value:
            raise errors.HeaderParseError("Incomplete parameter")
        if value[0] == '*':
            param.append(ValueTerminal('*', 'extended-parameter-marker'))
            value = value[1:]
            param.extended = True
    if value[0] != '=':
        raise errors.HeaderParseError("Parameter not followed by '='")
    param.append(ValueTerminal('=', 'parameter-separator'))
    value = value[1:]
    leader = None
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        param.append(token)
    remainder = None
    appendto = param
    if param.extended and value and value[0] == '"':
        # Now for some serious hackery to handle the common invalid case of
        # double quotes around an extended value.  We also accept (with defect)
        # a value marked as encoded that isn't really.
        qstring, remainder = get_quoted_string(value)
        inner_value = qstring.stripped_value
        semi_valid = False
        if param.section_number == 0:
            if inner_value and inner_value[0] == "'":
                semi_valid = True
            else:
                token, rest = get_attrtext(inner_value)
                if rest and rest[0] == "'":
                    semi_valid = True
        else:
            try:
                token, rest = get_extended_attrtext(inner_value)
            except:
                pass
            else:
                if not rest:
                    semi_valid = True
        if semi_valid:
            param.defects.append(errors.InvalidHeaderDefect(
                "Quoted string value for extended parameter is invalid"))
            param.append(qstring)
            for t in qstring:
                if t.token_type == 'bare-quoted-string':
                    t[:] = []
                    appendto = t
                    break
            value = inner_value
        else:
            remainder = None
            param.defects.append(errors.InvalidHeaderDefect(
                "Parameter marked as extended but appears to have a "
                "quoted string value that is non-encoded"))
    if value and value[0] == "'":
        token = None
    else:
        token, value = get_value(value)
    if not param.extended or param.section_number > 0:
        if not value or value[0] != "'":
            appendto.append(token)
            if remainder is not None:
                assert not value, value
                value = remainder
            return param, value
        param.defects.append(errors.InvalidHeaderDefect(
            "Apparent initial-extended-value but attribute "
            "was not marked as extended or was not initial section"))
    if not value:
        # Assume the charset/lang is missing and the token is the value.
        param.defects.append(errors.InvalidHeaderDefect(
            "Missing required charset/lang delimiters"))
        appendto.append(token)
        if remainder is None:
            return param, value
    else:
        if token is not None:
            for t in token:
                if t.token_type == 'extended-attrtext':
                    break
            # Bug fix: this line was 't.token_type == "attrtext"', a no-op
            # comparison.  The intent is to relabel the extended-attrtext
            # terminal as plain attrtext now that we know it is the charset.
            t.token_type = 'attrtext'
            appendto.append(t)
            param.charset = t.value
        if value[0] != "'":
            raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                          "delimiter, but found {!r}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
        value = value[1:]
        if value and value[0] != "'":
            token, value = get_attrtext(value)
            appendto.append(token)
            param.lang = token.value
            if not value or value[0] != "'":
                raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                  "delimiter, but found {}".format(value))
        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
        value = value[1:]
    if remainder is not None:
        # Treat the rest of value as bare quoted string content.
        v = Value()
        while value:
            if value[0] in WSP:
                token, value = get_fws(value)
            else:
                token, value = get_qcontent(value)
            v.append(token)
        token = v
    else:
        token, value = get_value(value)
    appendto.append(token)
    if remainder is not None:
        assert not value, value
        value = remainder
    return param, value

def parse_mime_parameters(value):
    """ parameter *( ";" parameter )

    That BNF is meant to indicate this routine should only be called after
    finding and handling the leading ';'.  There is no corresponding rule in
    the formal RFC grammar, but it is more convenient for us for the set of
    parameters to be treated as its own TokenList.

    This is 'parse' routine because it consumes the remaining value, but it
    would never be called to parse a full header.  Instead it is called to
    parse everything after the non-parameter value of a specific MIME header.

    """
    mime_parameters = MimeParameters()
    while value:
        try:
            token, value = get_parameter(value)
            mime_parameters.append(token)
        except errors.HeaderParseError:
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
            if not value:
                mime_parameters.append(leader)
                return mime_parameters
            if value[0] == ';':
                if leader is not None:
                    mime_parameters.append(leader)
                mime_parameters.defects.append(errors.InvalidHeaderDefect(
                    "parameter entry with no content"))
            else:
                token, value = get_invalid_parameter(value)
                if leader:
                    token[:0] = [leader]
                mime_parameters.append(token)
                mime_parameters.defects.append(errors.InvalidHeaderDefect(
                    "invalid parameter {!r}".format(token)))
        if value and value[0] != ';':
            # Junk after the otherwise valid parameter.  Mark it as
            # invalid, but it will have a value.
            param = mime_parameters[-1]
            param.token_type = 'invalid-parameter'
            token, value = get_invalid_parameter(value)
            param.extend(token)
            mime_parameters.defects.append(errors.InvalidHeaderDefect(
                "parameter with invalid trailing text {!r}".format(token)))
        if value:
            # Must be a ';' at this point.
            mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
            value = value[1:]
    return mime_parameters

def _find_mime_parameters(tokenlist, value):
    """Do our best to find the parameters in an invalid MIME header

    Used for error recovery: skips forward to the first ';' (collecting the
    skipped text into tokenlist) and hands the rest to parse_mime_parameters.
    """
    while value and value[0] != ';':
        if value[0] in PHRASE_ENDS:
            tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            tokenlist.append(token)
    if not value:
        return
    tokenlist.append(ValueTerminal(';', 'parameter-separator'))
    tokenlist.append(parse_mime_parameters(value[1:]))

def parse_content_type_header(value):
    """ maintype "/" subtype *( ";" parameter )

    The maintype and subtype are tokens.  Theoretically they could
    be checked against the official IANA list + x-token, but we
    don't do that.
    """
    ctype = ContentType()
    if not value:
        ctype.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content type specification"))
        return ctype
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content maintype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    # XXX: If we really want to follow the formal grammar we should make
    # mantype and subtype specialized TokenLists here.  Probably not worth it.
    if not value or value[0] != '/':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Invalid content type"))
        if value:
            _find_mime_parameters(ctype, value)
        return ctype
    ctype.maintype = token.value.strip().lower()
    ctype.append(ValueTerminal('/', 'content-type-separator'))
    value = value[1:]
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content subtype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    ctype.subtype = token.value.strip().lower()
    if not value:
        return ctype
    if value[0] != ';':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content type, but "
            "found {!r}".format(value)))
        # The RFC requires that a syntactically invalid content-type be treated
        # as text/plain.  Perhaps we should postel this, but we should probably
        # only do that if we were checking the subtype value against IANA.
        del ctype.maintype, ctype.subtype
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(ValueTerminal(';', 'parameter-separator'))
    ctype.append(parse_mime_parameters(value[1:]))
    return ctype

def parse_content_disposition_header(value):
    """ disposition-type *( ";" parameter )

    """
    disp_header = ContentDisposition()
    if not value:
        disp_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content disposition"))
        return disp_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        disp_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content disposition but found {!r}".format(value)))
        _find_mime_parameters(disp_header, value)
        return disp_header
    disp_header.append(token)
    disp_header.content_disposition = token.value.strip().lower()
    if not value:
        return disp_header
    if value[0] != ';':
        disp_header.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content disposition, but "
            "found {!r}".format(value)))
        _find_mime_parameters(disp_header, value)
        return disp_header
    disp_header.append(ValueTerminal(';', 'parameter-separator'))
    disp_header.append(parse_mime_parameters(value[1:]))
    return disp_header

def parse_content_transfer_encoding_header(value):
    """ mechanism

    """
    # We should probably validate the values, since the list is fixed.
    cte_header = ContentTransferEncoding()
    if not value:
        cte_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content transfer encoding"))
        return cte_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content transfer encoding but found {!r}".format(value)))
    else:
        cte_header.append(token)
        cte_header.cte = token.value.strip().lower()
    if not value:
        return cte_header
    while value:
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Extra text after content transfer encoding"))
        if value[0] in PHRASE_ENDS:
            cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            cte_header.append(token)
    return cte_header


#
# Header folding
#
# Header folding is complex, with lots of rules and corner cases.  The
# following code does its best to obey the rules and handle the corner
# cases, but you can be sure there are few bugs:)
#
# This folder generally canonicalizes as it goes, preferring the stringified
# version of each token.  The tokens contain information that supports the
# folder, including which tokens can be encoded in which ways.
#
# Folded text is accumulated in a simple list of strings ('lines'), each
# one of which should be less than policy.max_line_length ('maxlen').
#

def _steal_trailing_WSP_if_exists(lines):
    # If the last accumulated line ends with whitespace, remove and return
    # that single WSP character so the caller can carry it to the next line.
    wsp = ''
    if lines and lines[-1] and lines[-1][-1] in WSP:
        wsp = lines[-1][-1]
        lines[-1] = lines[-1][:-1]
    return wsp

def _refold_parse_tree(parse_tree, *, policy):
    """Return string of contents of parse_tree folded according to RFC rules.

    """
    # max_line_length 0/None means no limit, ie: infinitely long.
    maxlen = policy.max_line_length or float("+inf")
    encoding = 'utf-8' if policy.utf8 else 'us-ascii'
    lines = ['']
    last_ew = None
    wrap_as_ew_blocked = 0
    want_encoding = False
    end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
    parts = list(parse_tree)
    while parts:
        part = parts.pop(0)
        if part is end_ew_not_allowed:
            wrap_as_ew_blocked -= 1
            continue
        tstr = str(part)
        try:
            tstr.encode(encoding)
            charset = encoding
        except UnicodeEncodeError:
            if any(isinstance(x, errors.UndecodableBytesDefect)
                   for x in part.all_defects):
                charset = 'unknown-8bit'
            else:
                # If policy.utf8 is false this should really be taken from a
                # 'charset' property on the policy.
                charset = 'utf-8'
            want_encoding = True
        if part.token_type == 'mime-parameters':
            # Mime parameter folding (using RFC2231) is extra special.
            _fold_mime_parameters(part, lines, maxlen, encoding)
            continue
        if want_encoding and not wrap_as_ew_blocked:
            if not part.as_ew_allowed:
                want_encoding = False
                last_ew = None
                if part.syntactic_break:
                    encoded_part = part.fold(policy=policy)[:-1] # strip nl
                    if policy.linesep not in encoded_part:
                        # It fits on a single line
                        if len(encoded_part) > maxlen - len(lines[-1]):
                            # But not on this one, so start a new one.
                            newline = _steal_trailing_WSP_if_exists(lines)
                            # XXX what if encoded_part has no leading FWS?
                            lines.append(newline)
                        lines[-1] += encoded_part
                        continue
                # Either this is not a major syntactic break, so we don't
                # want it on a line by itself even if it fits, or it
                # doesn't fit on a line by itself.  Either way, fall through
                # to unpacking the subparts and wrapping them.
            if not hasattr(part, 'encode'):
                # It's not a Terminal, do each piece individually.
                parts = list(part) + parts
            else:
                # It's a terminal, wrap it as an encoded word, possibly
                # combining it with previously encoded words if allowed.
                last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
                                      part.ew_combine_allowed, charset)
            want_encoding = False
            continue
        if len(tstr) <= maxlen - len(lines[-1]):
            lines[-1] += tstr
            continue
        # This part is too long to fit.  The RFC wants us to break at
        # "major syntactic breaks", so unless we don't consider this
        # to be one, check if it will fit on the next line by itself.
        if (part.syntactic_break and
                len(tstr) + 1 <= maxlen):
            newline = _steal_trailing_WSP_if_exists(lines)
            if newline or part.startswith_fws():
                lines.append(newline + tstr)
                continue
        if not hasattr(part, 'encode'):
            # It's not a terminal, try folding the subparts.
            newparts = list(part)
            if not part.as_ew_allowed:
                wrap_as_ew_blocked += 1
                newparts.append(end_ew_not_allowed)
            parts = newparts + parts
            continue
        if part.as_ew_allowed and not wrap_as_ew_blocked:
            # It doesn't need CTE encoding, but encode it anyway so we can
            # wrap it.
            parts.insert(0, part)
            want_encoding = True
            continue
        # We can't figure out how to wrap, it, so give up.
        newline = _steal_trailing_WSP_if_exists(lines)
        if newline or part.startswith_fws():
            lines.append(newline + tstr)
        else:
            # We can't fold it onto the next line either...
            lines[-1] += tstr
    return policy.linesep.join(lines) + policy.linesep

def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
    """Fold string to_encode into lines as encoded word, combining if allowed.
    Return the new value for last_ew, or None if ew_combine_allowed is False.

    If there is already an encoded word in the last line of lines (indicated by
    a non-None value for last_ew) and ew_combine_allowed is true, decode the
    existing ew, combine it with to_encode, and re-encode.  Otherwise, encode
    to_encode.  In either case, split to_encode as necessary so that the
    encoded segments fit within maxlen.

    """
    if last_ew is not None and ew_combine_allowed:
        to_encode = str(
            get_unstructured(lines[-1][last_ew:] + to_encode))
        lines[-1] = lines[-1][:last_ew]
    if to_encode[0] in WSP:
        # We're joining this to non-encoded text, so don't encode
        # the leading blank.
        leading_wsp = to_encode[0]
        to_encode = to_encode[1:]
        if (len(lines[-1]) == maxlen):
            lines.append(_steal_trailing_WSP_if_exists(lines))
        lines[-1] += leading_wsp
    trailing_wsp = ''
    if to_encode[-1] in WSP:
        # Likewise for the trailing space.
        trailing_wsp = to_encode[-1]
        to_encode = to_encode[:-1]
    new_last_ew = len(lines[-1]) if last_ew is None else last_ew
    while to_encode:
        remaining_space = maxlen - len(lines[-1])
        # The RFC2047 chrome takes up 7 characters plus the length
        # of the charset name.
        encode_as = 'utf-8' if charset == 'us-ascii' else charset
        text_space = remaining_space - len(encode_as) - 7
        if text_space <= 0:
            lines.append(' ')
            # XXX We'll get an infinite loop here if maxlen is <= 7
            continue
        first_part = to_encode[:text_space]
        ew = _ew.encode(first_part, charset=encode_as)
        excess = len(ew) - remaining_space
        if excess > 0:
            # encode always chooses the shortest encoding, so this
            # is guaranteed to fit at this point.
            # Bug fix: the re-encode previously omitted charset= and so fell
            # back to _ew.encode's utf-8 default even when encode_as was
            # something else (e.g. 'unknown-8bit').
            first_part = first_part[:-excess]
            ew = _ew.encode(first_part, charset=encode_as)
        lines[-1] += ew
        to_encode = to_encode[len(first_part):]
        if to_encode:
            lines.append(' ')
            new_last_ew = len(lines[-1])
    lines[-1] += trailing_wsp
    return new_last_ew if ew_combine_allowed else None

def _fold_mime_parameters(part, lines, maxlen, encoding):
    """Fold TokenList 'part' into the 'lines' list as mime parameters.

    Using the decoded list of parameters and values, format them according to
    the RFC rules, including using RFC2231 encoding if the value cannot be
    expressed in 'encoding' and/or the parameter+value is too long to fit
    within 'maxlen'.

    """
    # Special case for RFC2231 encoding: start from decoded values and use
    # RFC2231 encoding iff needed.
    #
    # Note that the 1 and 2s being added to the length calculations are
    # accounting for the possibly-needed spaces and semicolons we'll be adding.
    #
    for name, value in part.params:
        # XXX What if this ';' puts us over maxlen the first time through the
        # loop?  We should split the header value onto a newline in that case,
        # but to do that we need to recognize the need earlier or reparse the
        # header, so I'm going to ignore that bug for now.  It'll only put us
        # one character over.
        if not lines[-1].rstrip().endswith(';'):
            lines[-1] += ';'
        charset = encoding
        error_handler = 'strict'
        try:
            value.encode(encoding)
            encoding_required = False
        except UnicodeEncodeError:
            encoding_required = True
            if utils._has_surrogates(value):
                charset = 'unknown-8bit'
                error_handler = 'surrogateescape'
            else:
                charset = 'utf-8'
        if encoding_required:
            encoded_value = urllib.parse.quote(
                value, safe='', errors=error_handler)
            tstr = "{}*={}''{}".format(name, charset, encoded_value)
        else:
            tstr = '{}={}'.format(name, quote_string(value))
        if len(lines[-1]) + len(tstr) + 1 < maxlen:
            lines[-1] = lines[-1] + ' ' + tstr
            continue
        elif len(tstr) + 2 <= maxlen:
            lines.append(' ' + tstr)
            continue
        # We need multiple sections.  We are allowed to mix encoded and
        # non-encoded sections, but we aren't going to.  We'll encode them all.
        section = 0
        extra_chrome = charset + "''"
        while value:
            chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
            if maxlen <= chrome_len + 3:
                # We need room for the leading blank, the trailing semicolon,
                # and at least one character of the value.  If we don't
                # have that, we'd be stuck, so in that case fall back to
                # the RFC standard width.
                maxlen = 78
            splitpoint = maxchars = maxlen - chrome_len - 2
            while True:
                partial = value[:splitpoint]
                encoded_value = urllib.parse.quote(
                    partial, safe='', errors=error_handler)
                if len(encoded_value) <= maxchars:
                    break
                splitpoint -= 1
            lines.append(" {}*{}*={}{}".format(
                name, section, extra_chrome, encoded_value))
            extra_chrome = ''
            section += 1
            value = value[splitpoint:]
            if value:
                lines[-1] += ';'