1"""Header value parser implementing various email-related RFC parsing rules.
2
3The parsing methods defined in this module implement various email related
4parsing rules.  Principal among them is RFC 5322, which is the followon
5to RFC 2822 and primarily a clarification of the former.  It also implements
6RFC 2047 encoded word decoding.
7
8RFC 5322 goes to considerable trouble to maintain backward compatibility with
9RFC 822 in the parse phase, while cleaning up the structure on the generation
10phase.  This parser supports correct RFC 5322 generation by tagging white space
11as folding white space only when folding is allowed in the non-obsolete rule
12sets.  Actually, the parser is even more generous when accepting input than RFC
135322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
14Where possible deviations from the standard are annotated on the 'defects'
15attribute of tokens that deviate.
16
17The general structure of the parser follows RFC 5322, and uses its terminology
18where there is a direct correspondence.  Where the implementation requires a
19somewhat different structure than that used by the formal grammar, new terms
20that mimic the closest existing terms are used.  Thus, it really helps to have
21a copy of RFC 5322 handy when studying this code.
22
23Input to the parser is a string that has already been unfolded according to
24RFC 5322 rules.  According to the RFC this unfolding is the very first step, and
25this parser leaves the unfolding step to a higher level message parser, which
26will have already detected the line breaks that need unfolding while
27determining the beginning and end of each header.
28
29The output of the parser is a TokenList object, which is a list subclass.  A
30TokenList is a recursive data structure.  The terminal nodes of the structure
31are Terminal objects, which are subclasses of str.  These do not correspond
32directly to terminal objects in the formal grammar, but are instead more
33practical higher level combinations of true terminals.
34
35All TokenList and Terminal objects have a 'value' attribute, which produces the
36semantically meaningful value of that part of the parse subtree.  The value of
37all whitespace tokens (no matter how many sub-tokens they may contain) is a
38single space, as per the RFC rules.  This includes 'CFWS', which is herein
39included in the general class of whitespace tokens.  There is one exception to
40the rule that whitespace tokens are collapsed into single spaces in values: in
41the value of a 'bare-quoted-string' (a quoted-string with no leading or
42trailing whitespace), any whitespace that appeared between the quotation marks
43is preserved in the returned value.  Note that in all Terminal strings quoted
44pairs are turned into their unquoted values.
45
46All TokenList and Terminal objects also have a string value, which attempts to
47be a "canonical" representation of the RFC-compliant form of the substring that
48produced the parsed subtree, including minimal use of quoted pair quoting.
49Whitespace runs are not collapsed.
50
51Comment tokens also have a 'content' attribute providing the string found
52between the parens (including any nested comments) with whitespace preserved.
53
54All TokenList and Terminal objects have a 'defects' attribute which is a
55possibly empty list all of the defects found while creating the token.  Defects
56may appear on any token in the tree, and a composite list of all defects in the
57subtree is available through the 'all_defects' attribute of any node.  (For
58Terminal notes x.defects == x.all_defects.)
59
60Each object in a parse tree is called a 'token', and each has a 'token_type'
61attribute that gives the name from the RFC 5322 grammar that it represents.
62Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
63may be produced: 'ptext'.  A 'ptext' is a string of printable ascii characters.
64It is returned in place of lists of (ctext/quoted-pair) and
65(qtext/quoted-pair).
66
67XXX: provide complete list of token types.
68"""

import re
import urllib   # For urllib.parse.unquote
from string import hexdigits
from collections import OrderedDict
from operator import itemgetter
from email import _encoded_words as _ew
from email import errors
from email import utils

#
# Useful constants and functions
#

WSP = set(' \t')
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
DOT_ATOM_ENDS = ATOM_ENDS - set('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS - set('."(')
TSPECIALS = (SPECIALS | set('/?=')) - set('.')
TOKEN_ENDS = TSPECIALS | WSP
ASPECIALS = TSPECIALS | set("*'%")
ATTRIBUTE_ENDS = ASPECIALS | WSP
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')

def quote_string(value):
    return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'

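# A rough illustration of quote_string (the input is made up, shown only as a
# sketch of the behavior): the value is wrapped in DQUOTEs and any '\' or '"'
# already present becomes a quoted pair.
#
#     >>> quote_string('Foo "Bar" Baz')
#     '"Foo \\"Bar\\" Baz"'
#
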
#
# TokenList and its subclasses
#

class TokenList(list):

    token_type = None
    syntactic_break = True
    ew_combine_allowed = True

    def __init__(self, *args, **kw):
        super().__init__(*args, **kw)
        self.defects = []

    def __str__(self):
        return ''.join(str(x) for x in self)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__,
                             super().__repr__())

    @property
    def value(self):
        return ''.join(x.value for x in self if x.value)

    @property
    def all_defects(self):
        return sum((x.all_defects for x in self), self.defects)

    def startswith_fws(self):
        return self[0].startswith_fws()

    @property
    def as_ew_allowed(self):
        """True if all top level tokens of this part may be RFC2047 encoded."""
        return all(part.as_ew_allowed for part in self)

    @property
    def comments(self):
        comments = []
        for token in self:
            comments.extend(token.comments)
        return comments

    def fold(self, *, policy):
        return _refold_parse_tree(self, policy=policy)

    def pprint(self, indent=''):
        print(self.ppstr(indent=indent))

    def ppstr(self, indent=''):
        return '\n'.join(self._pp(indent=indent))

    def _pp(self, indent=''):
        yield '{}{}/{}('.format(
            indent,
            self.__class__.__name__,
            self.token_type)
        for token in self:
            if not hasattr(token, '_pp'):
                yield (indent + '    !! invalid element in token '
                                        'list: {!r}'.format(token))
            else:
                yield from token._pp(indent+'    ')
        if self.defects:
            extra = ' Defects: {}'.format(self.defects)
        else:
            extra = ''
        yield '{}){}'.format(indent, extra)

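# A minimal sketch of the TokenList API described in the module docstring
# (the values are made up; ValueTerminal and WhiteSpaceTerminal are defined in
# the Terminal section further down):
#
#     >>> tl = TokenList([ValueTerminal('Hello', 'vtext'),
#     ...                 WhiteSpaceTerminal(' ', 'fws'),
#     ...                 ValueTerminal('World', 'vtext')])
#     >>> str(tl)
#     'Hello World'
#     >>> tl.value
#     'Hello World'
#     >>> tl.all_defects
#     []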

class WhiteSpaceTokenList(TokenList):

    @property
    def value(self):
        return ' '

    @property
    def comments(self):
        return [x.content for x in self if x.token_type=='comment']


class UnstructuredTokenList(TokenList):

    token_type = 'unstructured'


class Phrase(TokenList):

    token_type = 'phrase'

class Word(TokenList):

    token_type = 'word'


class CFWSList(WhiteSpaceTokenList):

    token_type = 'cfws'


class Atom(TokenList):

    token_type = 'atom'


class Token(TokenList):

    token_type = 'token'
    encode_as_ew = False


class EncodedWord(TokenList):

    token_type = 'encoded-word'
    cte = None
    charset = None
    lang = None


class QuotedString(TokenList):

    token_type = 'quoted-string'

    @property
    def content(self):
        for x in self:
            if x.token_type == 'bare-quoted-string':
                return x.value

    @property
    def quoted_value(self):
        res = []
        for x in self:
            if x.token_type == 'bare-quoted-string':
                res.append(str(x))
            else:
                res.append(x.value)
        return ''.join(res)

    @property
    def stripped_value(self):
        for token in self:
            if token.token_type == 'bare-quoted-string':
                return token.value


class BareQuotedString(QuotedString):

    token_type = 'bare-quoted-string'

    def __str__(self):
        return quote_string(''.join(str(x) for x in self))

    @property
    def value(self):
        return ''.join(str(x) for x in self)


class Comment(WhiteSpaceTokenList):

    token_type = 'comment'

    def __str__(self):
        return ''.join(sum([
                            ["("],
                            [self.quote(x) for x in self],
                            [")"],
                            ], []))

    def quote(self, value):
        if value.token_type == 'comment':
            return str(value)
        return str(value).replace('\\', '\\\\').replace(
                                  '(', r'\(').replace(
                                  ')', r'\)')

    @property
    def content(self):
        return ''.join(str(x) for x in self)

    @property
    def comments(self):
        return [self.content]

class AddressList(TokenList):

    token_type = 'address-list'

    @property
    def addresses(self):
        return [x for x in self if x.token_type=='address']

    @property
    def mailboxes(self):
        return sum((x.mailboxes
                    for x in self if x.token_type=='address'), [])

    @property
    def all_mailboxes(self):
        return sum((x.all_mailboxes
                    for x in self if x.token_type=='address'), [])


class Address(TokenList):

    token_type = 'address'

    @property
    def display_name(self):
        if self[0].token_type == 'group':
            return self[0].display_name

    @property
    def mailboxes(self):
        if self[0].token_type == 'mailbox':
            return [self[0]]
        elif self[0].token_type == 'invalid-mailbox':
            return []
        return self[0].mailboxes

    @property
    def all_mailboxes(self):
        if self[0].token_type == 'mailbox':
            return [self[0]]
        elif self[0].token_type == 'invalid-mailbox':
            return [self[0]]
        return self[0].all_mailboxes

class MailboxList(TokenList):

    token_type = 'mailbox-list'

    @property
    def mailboxes(self):
        return [x for x in self if x.token_type=='mailbox']

    @property
    def all_mailboxes(self):
        return [x for x in self
            if x.token_type in ('mailbox', 'invalid-mailbox')]


class GroupList(TokenList):

    token_type = 'group-list'

    @property
    def mailboxes(self):
        if not self or self[0].token_type != 'mailbox-list':
            return []
        return self[0].mailboxes

    @property
    def all_mailboxes(self):
        if not self or self[0].token_type != 'mailbox-list':
            return []
        return self[0].all_mailboxes


class Group(TokenList):

    token_type = "group"

    @property
    def mailboxes(self):
        if self[2].token_type != 'group-list':
            return []
        return self[2].mailboxes

    @property
    def all_mailboxes(self):
        if self[2].token_type != 'group-list':
            return []
        return self[2].all_mailboxes

    @property
    def display_name(self):
        return self[0].display_name


class NameAddr(TokenList):

    token_type = 'name-addr'

    @property
    def display_name(self):
        if len(self) == 1:
            return None
        return self[0].display_name

    @property
    def local_part(self):
        return self[-1].local_part

    @property
    def domain(self):
        return self[-1].domain

    @property
    def route(self):
        return self[-1].route

    @property
    def addr_spec(self):
        return self[-1].addr_spec


class AngleAddr(TokenList):

    token_type = 'angle-addr'

    @property
    def local_part(self):
        for x in self:
            if x.token_type == 'addr-spec':
                return x.local_part

    @property
    def domain(self):
        for x in self:
            if x.token_type == 'addr-spec':
                return x.domain

    @property
    def route(self):
        for x in self:
            if x.token_type == 'obs-route':
                return x.domains

    @property
    def addr_spec(self):
        for x in self:
            if x.token_type == 'addr-spec':
                if x.local_part:
                    return x.addr_spec
                else:
                    return quote_string(x.local_part) + x.addr_spec
        else:
            return '<>'


class ObsRoute(TokenList):

    token_type = 'obs-route'

    @property
    def domains(self):
        return [x.domain for x in self if x.token_type == 'domain']


class Mailbox(TokenList):

    token_type = 'mailbox'

    @property
    def display_name(self):
        if self[0].token_type == 'name-addr':
            return self[0].display_name

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        return self[0].domain

    @property
    def route(self):
        if self[0].token_type == 'name-addr':
            return self[0].route

    @property
    def addr_spec(self):
        return self[0].addr_spec


class InvalidMailbox(TokenList):

    token_type = 'invalid-mailbox'

    @property
    def display_name(self):
        return None

    local_part = domain = route = addr_spec = display_name


class Domain(TokenList):

    token_type = 'domain'
    as_ew_allowed = False

    @property
    def domain(self):
        return ''.join(super().value.split())


class DotAtom(TokenList):

    token_type = 'dot-atom'


class DotAtomText(TokenList):

    token_type = 'dot-atom-text'
    as_ew_allowed = True


class AddrSpec(TokenList):

    token_type = 'addr-spec'
    as_ew_allowed = False

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        if len(self) < 3:
            return None
        return self[-1].domain

    @property
    def value(self):
        if len(self) < 3:
            return self[0].value
        return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()

    @property
    def addr_spec(self):
        nameset = set(self.local_part)
        if len(nameset) > len(nameset-DOT_ATOM_ENDS):
            lp = quote_string(self.local_part)
        else:
            lp = self.local_part
        if self.domain is not None:
            return lp + '@' + self.domain
        return lp

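# The addr_spec property above re-quotes the local part only when it contains
# characters that cannot appear in a dot-atom.  A rough sketch (inputs are made
# up, parsed via the get_addr_spec function defined later in this module):
#
#     >>> get_addr_spec('john.q.public@example.com')[0].addr_spec
#     'john.q.public@example.com'
#     >>> get_addr_spec('"john q public"@example.com')[0].addr_spec
#     '"john q public"@example.com'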

class ObsLocalPart(TokenList):

    token_type = 'obs-local-part'
    as_ew_allowed = False


class DisplayName(Phrase):

    token_type = 'display-name'
    ew_combine_allowed = False

    @property
    def display_name(self):
        res = TokenList(self)
        if res[0].token_type == 'cfws':
            res.pop(0)
        else:
            if res[0][0].token_type == 'cfws':
                res[0] = TokenList(res[0][1:])
        if res[-1].token_type == 'cfws':
            res.pop()
        else:
            if res[-1][-1].token_type == 'cfws':
                res[-1] = TokenList(res[-1][:-1])
        return res.value

    @property
    def value(self):
        quote = False
        if self.defects:
            quote = True
        else:
            for x in self:
                if x.token_type == 'quoted-string':
                    quote = True
        if quote:
            pre = post = ''
            if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
                pre = ' '
            if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
                post = ' '
            return pre+quote_string(self.display_name)+post
        else:
            return super().value


class LocalPart(TokenList):

    token_type = 'local-part'
    as_ew_allowed = False

    @property
    def value(self):
        if self[0].token_type == "quoted-string":
            return self[0].quoted_value
        else:
            return self[0].value

    @property
    def local_part(self):
        # Strip whitespace from front, back, and around dots.
        res = [DOT]
        last = DOT
        last_is_tl = False
        for tok in self[0] + [DOT]:
            if tok.token_type == 'cfws':
                continue
            if (last_is_tl and tok.token_type == 'dot' and
                    last[-1].token_type == 'cfws'):
                res[-1] = TokenList(last[:-1])
            is_tl = isinstance(tok, TokenList)
            if (is_tl and last.token_type == 'dot' and
                    tok[0].token_type == 'cfws'):
                res.append(TokenList(tok[1:]))
            else:
                res.append(tok)
            last = res[-1]
            last_is_tl = is_tl
        res = TokenList(res[1:-1])
        return res.value


class DomainLiteral(TokenList):

    token_type = 'domain-literal'
    as_ew_allowed = False

    @property
    def domain(self):
        return ''.join(super().value.split())

    @property
    def ip(self):
        for x in self:
            if x.token_type == 'ptext':
                return x.value


class MIMEVersion(TokenList):

    token_type = 'mime-version'
    major = None
    minor = None


class Parameter(TokenList):

    token_type = 'parameter'
    sectioned = False
    extended = False
    charset = 'us-ascii'

    @property
    def section_number(self):
        # Because the first token (the attribute name) eats CFWS, the second
        # token is always the section, if there is one.
        return self[1].number if self.sectioned else 0

    @property
    def param_value(self):
        # This is part of the "handle quoted extended parameters" hack.
        for token in self:
            if token.token_type == 'value':
                return token.stripped_value
            if token.token_type == 'quoted-string':
                for token in token:
                    if token.token_type == 'bare-quoted-string':
                        for token in token:
                            if token.token_type == 'value':
                                return token.stripped_value
        return ''


class InvalidParameter(Parameter):

    token_type = 'invalid-parameter'


class Attribute(TokenList):

    token_type = 'attribute'

    @property
    def stripped_value(self):
        for token in self:
            if token.token_type.endswith('attrtext'):
                return token.value

class Section(TokenList):

    token_type = 'section'
    number = None


class Value(TokenList):

    token_type = 'value'

    @property
    def stripped_value(self):
        token = self[0]
        if token.token_type == 'cfws':
            token = self[1]
        if token.token_type.endswith(
                ('quoted-string', 'attribute', 'extended-attribute')):
            return token.stripped_value
        return self.value


class MimeParameters(TokenList):

    token_type = 'mime-parameters'
    syntactic_break = False

    @property
    def params(self):
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer.  So we have
        # to assume the RFC 2231 pieces can come in any order.  However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = OrderedDict()
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            parts = sorted(parts, key=itemgetter(0))
            first_param = parts[0][1]
            charset = first_param.charset
            # Our arbitrary error recovery is to ignore duplicate parameters,
            # to use appearance order if there are duplicate rfc 2231 parts,
            # and to ignore gaps.  This mimics the error recovery of get_param.
            if not first_param.extended and len(parts) > 1:
                if parts[1][0] == 0:
                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
                        'duplicate parameter name; duplicate(s) ignored'))
                    parts = parts[:1]
                # Else assume the *0* was missing...note that this is different
                # from get_param, but we registered a defect for this earlier.
            value_parts = []
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
                    # duplicate extended parameter and ignore the second one
                    # seen.  But we're not doing that.  The old code didn't.
                    if not param.extended:
                        param.defects.append(errors.InvalidHeaderDefect(
                            'duplicate parameter name; duplicate ignored'))
                        continue
                    else:
                        param.defects.append(errors.InvalidHeaderDefect(
                            "inconsistent RFC2231 parameter numbering"))
                i += 1
                value = param.param_value
                if param.extended:
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except LookupError:
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

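    # A sketch of the RFC 2231 reassembly performed by params above (the
    # parameter values are illustrative only): a Content-Type such as
    #
    #     text/plain; title*0*=us-ascii'en'This%20is%20; title*1*=%2A%2Afun%2A%2A
    #
    # is reported as a single ('title', 'This is **fun**') pair: the sections
    # are sorted by their *N numbers, percent-decoded, and decoded using the
    # charset named in the *0* section.
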
    def __str__(self):
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''


class ParameterizedHeaderValue(TokenList):

    # Set this false so that the value doesn't wind up on a new line even
    # if it and the parameters would fit there but not on the first line.
    syntactic_break = False

    @property
    def params(self):
        for token in reversed(self):
            if token.token_type == 'mime-parameters':
                return token.params
        return {}


class ContentType(ParameterizedHeaderValue):

    token_type = 'content-type'
    as_ew_allowed = False
    maintype = 'text'
    subtype = 'plain'


class ContentDisposition(ParameterizedHeaderValue):

    token_type = 'content-disposition'
    as_ew_allowed = False
    content_disposition = None


class ContentTransferEncoding(TokenList):

    token_type = 'content-transfer-encoding'
    as_ew_allowed = False
    cte = '7bit'


class HeaderLabel(TokenList):

    token_type = 'header-label'
    as_ew_allowed = False


class Header(TokenList):

    token_type = 'header'


#
# Terminal classes and instances
#

class Terminal(str):

    as_ew_allowed = True
    ew_combine_allowed = True
    syntactic_break = True

    def __new__(cls, value, token_type):
        self = super().__new__(cls, value)
        self.token_type = token_type
        self.defects = []
        return self

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, super().__repr__())

    def pprint(self):
        print(self.__class__.__name__ + '/' + self.token_type)

    @property
    def all_defects(self):
        return list(self.defects)

    def _pp(self, indent=''):
        return ["{}{}/{}({}){}".format(
            indent,
            self.__class__.__name__,
            self.token_type,
            super().__repr__(),
            '' if not self.defects else ' {}'.format(self.defects),
            )]

    def pop_trailing_ws(self):
        # This terminates the recursion.
        return None

    @property
    def comments(self):
        return []

    def __getnewargs__(self):
        return (str(self), self.token_type)


class WhiteSpaceTerminal(Terminal):

    @property
    def value(self):
        return ' '

    def startswith_fws(self):
        return True


class ValueTerminal(Terminal):

    @property
    def value(self):
        return self

    def startswith_fws(self):
        return False


class EWWhiteSpaceTerminal(WhiteSpaceTerminal):

    @property
    def value(self):
        return ''

    def __str__(self):
        return ''


# XXX these need to become classes and be used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe we should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')

#
# Parser
#

# Parse strings according to RFC822/2047/2822/5322 rules.
#
# This is a stateless parser.  Each get_XXX function accepts a string and
# returns either a Terminal or a TokenList representing the RFC object named
# by the method and a string containing the remaining unparsed characters
# from the input.  Thus a parser method consumes the next syntactic construct
# of a given type and returns a token representing the construct plus the
# unparsed remainder of the input string.
#
# For example, if the first element of a structured header is a 'phrase',
# then:
#
#     phrase, value = get_phrase(value)
#
# returns the complete phrase from the start of the string value, plus any
# characters left in the string after the phrase is removed.

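# As a more concrete (illustrative) sketch of that convention, with a made-up
# input value:
#
#     >>> token, rest = get_phrase('Foo Bar <foo@example.com>')
#     >>> token.token_type, str(token), rest
#     ('phrase', 'Foo Bar ', '<foo@example.com>')
#
# The string form of the token retains the whitespace from the input; only the
# semantic 'value' collapses whitespace runs.
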
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATOM_ENDS)))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(TOKEN_ENDS)))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATTRIBUTE_ENDS)))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match

def _validate_xtext(xtext):
    """If input token contains ASCII non-printables, register a defect."""

    non_printables = _non_printable_finder(xtext)
    if non_printables:
        xtext.defects.append(errors.NonPrintableDefect(non_printables))
    if utils._has_surrogates(xtext):
        xtext.defects.append(errors.UndecodableBytesDefect(
            "Non-ASCII characters found in header token"))

def _get_ptext_to_endchars(value, endchars):
    """Scan printables/quoted-pairs until endchars and return unquoted ptext.

    This function turns a run of qcontent, ccontent (without comments), or
    dtext (with quoted pairs) into a single string by unquoting any
    quoted pairs.  It returns the string, the remaining value, and
    a flag that is True iff there were any quoted pairs decoded.

    """
    fragment, *remainder = _wsp_splitter(value, 1)
    vchars = []
    escape = False
    had_qp = False
    for pos in range(len(fragment)):
        if fragment[pos] == '\\':
            if escape:
                escape = False
                had_qp = True
            else:
                escape = True
                continue
        if escape:
            escape = False
        elif fragment[pos] in endchars:
            break
        vchars.append(fragment[pos])
    else:
        pos = pos + 1
    return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp

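# A rough illustration of _get_ptext_to_endchars (the input is made up): it
# scans one whitespace-delimited fragment, unquoting quoted pairs, and stops
# at the first unescaped endchar:
#
#     >>> _get_ptext_to_endchars('value" <rest>', '"')
#     ('value', '" <rest>', False)
#
# A quoted pair such as r'\"' inside the fragment would be emitted as a bare
# '"' instead of ending the scan.
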
def get_fws(value):
    """FWS = 1*WSP

    This isn't the RFC definition.  We're using fws to represent tokens where
    folding can be done, but when we are parsing the *un*folding has already
    been done so we don't need to watch out for CRLF.

    """
    newvalue = value.lstrip()
    fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
    return fws, newvalue

def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    tok, *remainder = value[2:].split('?=', 1)
    if tok == value[2:]:
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
        # The ? after the CTE was followed by an encoded word escape (=XX).
        rest, *remainder = remstr.split('?=', 1)
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except ValueError:
        raise errors.HeaderParseError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    while text:
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        chars, *remainder = _wsp_splitter(text, 1)
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    return ew, value

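# A rough illustration of get_encoded_word (the input is made up):
#
#     >>> ew, rest = get_encoded_word('=?utf-8?q?caf=C3=A9?= and more')
#     >>> ew.token_type, ew.charset, str(ew), rest
#     ('encoded-word', 'utf-8', 'café', ' and more')
#
# The decoded text becomes vtext/fws sub-tokens of the EncodedWord, while the
# charset and language (if any) are recorded on the token.
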
def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / VCHAR

       obs-NO-WS-CTL is control characters except WSP/CR/LF.

    So, basically, we have printable runs, plus control characters or nulls in
    the obsolete syntax, separated by whitespace.  Since RFC 2047 uses the
    obsolete syntax in its specification, but requires whitespace on either
    side of the encoded words, I can see no reason to need to separate the
    non-printable-non-whitespace from the printable runs if they occur, so we
    parse this into xtext tokens separated by WSP tokens.

    Because an 'unstructured' value must by definition constitute the entire
    value, this 'get' routine does not return a remaining value, only the
    parsed TokenList.

    """
    # XXX: but what about bare CR and LF?  They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            token, value = get_fws(value)
            unstructured.append(token)
            continue
        if value.startswith('=?'):
            try:
                token, value = get_encoded_word(value)
            except errors.HeaderParseError:
                # XXX: Need to figure out how to register defects when
                # appropriate here.
                pass
            else:
                have_ws = True
                if len(unstructured) > 0:
                    if unstructured[-1].token_type != 'fws':
                        unstructured.defects.append(errors.InvalidHeaderDefect(
                            "missing whitespace before encoded word"))
                        have_ws = False
                if have_ws and len(unstructured) > 1:
                    if unstructured[-2].token_type == 'encoded-word':
                        unstructured[-1] = EWWhiteSpaceTerminal(
                            unstructured[-1], 'fws')
                unstructured.append(token)
                continue
        tok, *remainder = _wsp_splitter(value, 1)
        vtext = ValueTerminal(tok, 'vtext')
        _validate_xtext(vtext)
        unstructured.append(vtext)
        value = ''.join(remainder)
    return unstructured

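# A rough illustration of get_unstructured on a Subject-style value (the input
# is made up):
#
#     >>> tl = get_unstructured('Hello =?utf-8?q?W=C3=B6rld?=')
#     >>> tl.token_type, str(tl)
#     ('unstructured', 'Hello Wörld')
#
# Runs of printable text become vtext tokens, whitespace becomes fws, and
# RFC 2047 encoded words are decoded in place.
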
def get_qp_ctext(value):
    r"""ctext = <printable ascii except \ ( )>

    This is not the RFC ctext, since we are handling nested comments in comment
    and unquoting quoted-pairs here.  We allow anything except the '()'
    characters, but if we find any ASCII other than the RFC defined printable
    ASCII, a NonPrintableDefect is added to the token's defects list.  Since
    quoted pairs are converted to their unquoted values, what is returned is
    a 'ptext' token.  In this case it is a WhiteSpaceTerminal, so its value
    is ' '.

    """
    ptext, value, _ = _get_ptext_to_endchars(value, '()')
    ptext = WhiteSpaceTerminal(ptext, 'ptext')
    _validate_xtext(ptext)
    return ptext, value

def get_qcontent(value):
    """qcontent = qtext / quoted-pair

    We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII, a NonPrintableDefect is
    added to the token's defects list.  Any quoted pairs are converted to their
    unquoted values, so what is returned is a 'ptext' token.  In this case it
    is a ValueTerminal.

    """
    ptext, value, _ = _get_ptext_to_endchars(value, '"')
    ptext = ValueTerminal(ptext, 'ptext')
    _validate_xtext(ptext)
    return ptext, value

def get_atext(value):
    """atext = <matches _non_atom_end_matcher>

    We allow any non-ATOM_ENDS characters in atext, but a defect is added to
    the token's defects list if we find non-atext characters.
    """
    m = _non_atom_end_matcher(value)
    if not m:
        raise errors.HeaderParseError(
            "expected atext but found '{}'".format(value))
    atext = m.group()
    value = value[len(atext):]
    atext = ValueTerminal(atext, 'atext')
    _validate_xtext(atext)
    return atext, value

def get_bare_quoted_string(value):
    """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE

    A quoted-string without the leading or trailing white space.  Its
    value is the text between the quote marks, with whitespace
    preserved and quoted pairs decoded.
    """
    if value[0] != '"':
        raise errors.HeaderParseError(
            "expected '\"' but found '{}'".format(value))
    bare_quoted_string = BareQuotedString()
    value = value[1:]
    if value and value[0] == '"':
        token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    while value and value[0] != '"':
        if value[0] in WSP:
            token, value = get_fws(value)
        elif value[:2] == '=?':
            try:
                token, value = get_encoded_word(value)
                bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
                    "encoded word inside quoted string"))
            except errors.HeaderParseError:
                token, value = get_qcontent(value)
        else:
            token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    if not value:
        bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
            "end of header inside quoted string"))
        return bare_quoted_string, value
    return bare_quoted_string, value[1:]

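# A rough illustration of get_bare_quoted_string (the input is made up): note
# that whitespace inside the quotes is preserved in the value, unlike in other
# whitespace tokens, and that quoted pairs are decoded:
#
#     >>> bqs, rest = get_bare_quoted_string('"Giant; \\"Big\\" Box" <id>')
#     >>> bqs.value, rest
#     ('Giant; "Big" Box', ' <id>')
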
def get_comment(value):
    """comment = "(" *([FWS] ccontent) [FWS] ")"
       ccontent = ctext / quoted-pair / comment

    We handle nested comments here, and quoted-pair in our qp-ctext routine.
    """
    if value and value[0] != '(':
        raise errors.HeaderParseError(
            "expected '(' but found '{}'".format(value))
    comment = Comment()
    value = value[1:]
    while value and value[0] != ")":
        if value[0] in WSP:
            token, value = get_fws(value)
        elif value[0] == '(':
            token, value = get_comment(value)
        else:
            token, value = get_qp_ctext(value)
        comment.append(token)
    if not value:
        comment.defects.append(errors.InvalidHeaderDefect(
            "end of header inside comment"))
        return comment, value
    return comment, value[1:]

def get_cfws(value):
    """CFWS = (1*([FWS] comment) [FWS]) / FWS

    """
    cfws = CFWSList()
    while value and value[0] in CFWS_LEADER:
        if value[0] in WSP:
            token, value = get_fws(value)
        else:
            token, value = get_comment(value)
        cfws.append(token)
    return cfws, value

def get_quoted_string(value):
    """quoted-string = [CFWS] <bare-quoted-string> [CFWS]

    'bare-quoted-string' is an intermediate class defined by this
    parser and not by the RFC grammar.  It is the quoted string
    without any attached CFWS.
    """
    quoted_string = QuotedString()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        quoted_string.append(token)
    token, value = get_bare_quoted_string(value)
    quoted_string.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        quoted_string.append(token)
    return quoted_string, value

def get_atom(value):
    """atom = [CFWS] 1*atext [CFWS]

    An atom could be an rfc2047 encoded word.
    """
    atom = Atom()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        atom.append(token)
    if value and value[0] in ATOM_ENDS:
        raise errors.HeaderParseError(
            "expected atom but found '{}'".format(value))
    if value.startswith('=?'):
        try:
            token, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            token, value = get_atext(value)
    else:
        token, value = get_atext(value)
    atom.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        atom.append(token)
    return atom, value

def get_dot_atom_text(value):
    """ dot-atom-text = 1*atext *("." 1*atext)

    """
    dot_atom_text = DotAtomText()
    if not value or value[0] in ATOM_ENDS:
        raise errors.HeaderParseError("expected atom at the start of "
            "dot-atom-text but found '{}'".format(value))
    while value and value[0] not in ATOM_ENDS:
        token, value = get_atext(value)
        dot_atom_text.append(token)
        if value and value[0] == '.':
            dot_atom_text.append(DOT)
            value = value[1:]
    if dot_atom_text[-1] is DOT:
        raise errors.HeaderParseError("expected atom at end of dot-atom-text "
            "but found '{}'".format('.'+value))
    return dot_atom_text, value

def get_dot_atom(value):
    """ dot-atom = [CFWS] dot-atom-text [CFWS]

    Any place we can have a dot atom, we could instead have an rfc2047 encoded
    word.
    """
    dot_atom = DotAtom()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        dot_atom.append(token)
    if value.startswith('=?'):
        try:
            token, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            token, value = get_dot_atom_text(value)
    else:
        token, value = get_dot_atom_text(value)
    dot_atom.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        dot_atom.append(token)
    return dot_atom, value

def get_word(value):
    """word = atom / quoted-string

    Either atom or quoted-string may start with CFWS.  We have to peel off this
    CFWS first to determine which type of word to parse.  Afterward we splice
    the leading CFWS, if any, into the parsed sub-token.

    If neither an atom nor a quoted-string is found before the next special, a
    HeaderParseError is raised.

    The token returned is either an Atom or a QuotedString, as appropriate.
    This means the 'word' level of the formal grammar is not represented in the
    parse tree; this is because having that extra layer when manipulating the
    parse tree is more confusing than it is helpful.

    """
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    else:
        leader = None
    if value[0]=='"':
        token, value = get_quoted_string(value)
    elif value[0] in SPECIALS:
        raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
                                      "but found '{}'".format(value))
    else:
        token, value = get_atom(value)
    if leader is not None:
        token[:0] = [leader]
    return token, value

def get_phrase(value):
    """ phrase = 1*word / obs-phrase
        obs-phrase = word *(word / "." / CFWS)

    This means a phrase can be a sequence of words, periods, and CFWS in any
    order as long as it starts with at least one word.  If anything other than
    words is detected, an ObsoleteHeaderDefect is added to the token's defect
    list.  We also accept a phrase that starts with CFWS followed by a dot;
    this is registered as an InvalidHeaderDefect, since it is not supported by
    even the obsolete grammar.

    """
    phrase = Phrase()
    try:
        token, value = get_word(value)
        phrase.append(token)
    except errors.HeaderParseError:
        phrase.defects.append(errors.InvalidHeaderDefect(
            "phrase does not start with word"))
    while value and value[0] not in PHRASE_ENDS:
        if value[0]=='.':
            phrase.append(DOT)
            phrase.defects.append(errors.ObsoleteHeaderDefect(
                "period in 'phrase'"))
            value = value[1:]
        else:
            try:
                token, value = get_word(value)
            except errors.HeaderParseError:
                if value[0] in CFWS_LEADER:
                    token, value = get_cfws(value)
                    phrase.defects.append(errors.ObsoleteHeaderDefect(
                        "comment found without atom"))
                else:
                    raise
            phrase.append(token)
    return phrase, value

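# A rough illustration of get_phrase on an obsolete-syntax display name (the
# input is made up):
#
#     >>> phrase, rest = get_phrase('Mr. Foo Bar <foo@example.com>')
#     >>> phrase.value, rest
#     ('Mr. Foo Bar ', '<foo@example.com>')
#     >>> [type(d).__name__ for d in phrase.all_defects]
#     ['ObsoleteHeaderDefect']
#
# The period is accepted only because of the obs-phrase rule, so an
# ObsoleteHeaderDefect is recorded on the phrase.
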
def get_local_part(value):
    """ local-part = dot-atom / quoted-string / obs-local-part

    """
    local_part = LocalPart()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected local-part but found '{}'".format(value))
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] != '\\' and value[0] in PHRASE_ENDS:
                raise
            token = TokenList()
    if leader is not None:
        token[:0] = [leader]
    local_part.append(token)
    if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        obs_local_part, value = get_obs_local_part(str(local_part) + value)
        if obs_local_part.token_type == 'invalid-obs-local-part':
            local_part.defects.append(errors.InvalidHeaderDefect(
                "local-part is not dot-atom, quoted-string, or obs-local-part"))
        else:
            local_part.defects.append(errors.ObsoleteHeaderDefect(
                "local-part is not a dot-atom (contains CFWS)"))
        local_part[0] = obs_local_part
    try:
        local_part.value.encode('ascii')
    except UnicodeEncodeError:
        local_part.defects.append(errors.NonASCIILocalPartDefect(
                "local-part contains non-ASCII characters"))
    return local_part, value

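# A rough illustration of get_local_part with obsolete syntax (the input is
# made up): CFWS between the words forces the obs-local-part path, and the
# local_part property strips the whitespace back out:
#
#     >>> lp, rest = get_local_part('foo .bar@example.com')
#     >>> lp.local_part, rest
#     ('foo.bar', '@example.com')
#     >>> [type(d).__name__ for d in lp.all_defects]
#     ['ObsoleteHeaderDefect']
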
def get_obs_local_part(value):
    """ obs-local-part = word *("." word)
    """
    obs_local_part = ObsLocalPart()
    last_non_ws_was_dot = False
    while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        if value[0] == '.':
            if last_non_ws_was_dot:
                obs_local_part.defects.append(errors.InvalidHeaderDefect(
                    "invalid repeated '.'"))
            obs_local_part.append(DOT)
            last_non_ws_was_dot = True
            value = value[1:]
            continue
        elif value[0]=='\\':
            obs_local_part.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "'\\' character outside of quoted-string/ccontent"))
            last_non_ws_was_dot = False
            continue
        if obs_local_part and obs_local_part[-1].token_type != 'dot':
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "missing '.' between words"))
        try:
            token, value = get_word(value)
            last_non_ws_was_dot = False
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            token, value = get_cfws(value)
        obs_local_part.append(token)
    if (obs_local_part[0].token_type == 'dot' or
            obs_local_part[0].token_type=='cfws' and
            obs_local_part[1].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid leading '.' in local part"))
    if (obs_local_part[-1].token_type == 'dot' or
            obs_local_part[-1].token_type=='cfws' and
            obs_local_part[-2].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid trailing '.' in local part"))
    if obs_local_part.defects:
        obs_local_part.token_type = 'invalid-obs-local-part'
    return obs_local_part, value

def get_dtext(value):
    r""" dtext = <printable ascii except \ [ ]> / obs-dtext
        obs-dtext = obs-NO-WS-CTL / quoted-pair

    We allow anything except the excluded characters, but if we find any
    ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is
    added to the token's defects list.  Quoted pairs are converted to their
    unquoted values, so what is returned is a ptext token, in this case a
    ValueTerminal.  If there were quoted pairs, an ObsoleteHeaderDefect is
    added to the returned token's defect list.

    """
    ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
    ptext = ValueTerminal(ptext, 'ptext')
    if had_qp:
        ptext.defects.append(errors.ObsoleteHeaderDefect(
            "quoted printable found in domain-literal"))
    _validate_xtext(ptext)
    return ptext, value

def _check_for_early_dl_end(value, domain_literal):
    if value:
        return False
    domain_literal.append(errors.InvalidHeaderDefect(
        "end of input inside domain-literal"))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    return True

def get_domain_literal(value):
    """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]

    """
    domain_literal = DomainLiteral()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    if not value:
        raise errors.HeaderParseError("expected domain-literal")
    if value[0] != '[':
        raise errors.HeaderParseError("expected '[' at start of domain-literal "
                "but found '{}'".format(value))
    value = value[1:]
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    token, value = get_dtext(value)
    domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] in WSP:
        token, value = get_fws(value)
        domain_literal.append(token)
    if _check_for_early_dl_end(value, domain_literal):
        return domain_literal, value
    if value[0] != ']':
        raise errors.HeaderParseError("expected ']' at end of domain-literal "
                "but found '{}'".format(value))
    domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        domain_literal.append(token)
    return domain_literal, value

def get_domain(value):
    """ domain = dot-atom / domain-literal / obs-domain
        obs-domain = atom *("." atom)

    """
    domain = Domain()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected domain but found '{}'".format(value))
    if value[0] == '[':
        token, value = get_domain_literal(value)
        if leader is not None:
            token[:0] = [leader]
        domain.append(token)
        return domain, value
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        token, value = get_atom(value)
    if leader is not None:
        token[:0] = [leader]
    domain.append(token)
    if value and value[0] == '.':
        domain.defects.append(errors.ObsoleteHeaderDefect(
            "domain is not a dot-atom (contains CFWS)"))
        if domain[0].token_type == 'dot-atom':
            domain[:] = domain[0]
        while value and value[0] == '.':
            domain.append(DOT)
            token, value = get_atom(value[1:])
            domain.append(token)
    return domain, value

def get_addr_spec(value):
    """ addr-spec = local-part "@" domain

    """
    addr_spec = AddrSpec()
    token, value = get_local_part(value)
    addr_spec.append(token)
    if not value or value[0] != '@':
        addr_spec.defects.append(errors.InvalidHeaderDefect(
            "addr-spec local part with no domain"))
        return addr_spec, value
    addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
    token, value = get_domain(value[1:])
    addr_spec.append(token)
    return addr_spec, value

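# A rough illustration of get_addr_spec (the input is made up):
#
#     >>> a, rest = get_addr_spec('dinsdale@python.org (comment)')
#     >>> a.local_part, a.domain, a.addr_spec
#     ('dinsdale', 'python.org', 'dinsdale@python.org')
#
# Trailing CFWS is absorbed into the domain token but stripped from the
# semantic 'domain' and 'addr_spec' values.
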
1593def get_obs_route(value):
1594    """ obs-route = obs-domain-list ":"
1595        obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])
1596
1597        Returns an obs-route token with the appropriate sub-tokens (that is,
1598        there is no obs-domain-list in the parse tree).
1599    """
1600    obs_route = ObsRoute()
1601    while value and (value[0]==',' or value[0] in CFWS_LEADER):
1602        if value[0] in CFWS_LEADER:
1603            token, value = get_cfws(value)
1604            obs_route.append(token)
1605        elif value[0] == ',':
1606            obs_route.append(ListSeparator)
1607            value = value[1:]
1608    if not value or value[0] != '@':
1609        raise errors.HeaderParseError(
1610            "expected obs-route domain but found '{}'".format(value))
1611    obs_route.append(RouteComponentMarker)
1612    token, value = get_domain(value[1:])
1613    obs_route.append(token)
1614    while value and value[0]==',':
1615        obs_route.append(ListSeparator)
1616        value = value[1:]
1617        if not value:
1618            break
1619        if value[0] in CFWS_LEADER:
1620            token, value = get_cfws(value)
1621            obs_route.append(token)
1622        if value[0] == '@':
1623            obs_route.append(RouteComponentMarker)
1624            token, value = get_domain(value[1:])
1625            obs_route.append(token)
1626    if not value:
1627        raise errors.HeaderParseError("end of header while parsing obs-route")
1628    if value[0] != ':':
1629        raise errors.HeaderParseError( "expected ':' marking end of "
1630            "obs-route but found '{}'".format(value))
1631    obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
1632    return obs_route, value[1:]
1633
1634def get_angle_addr(value):
1635    """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
1636        obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
1637
1638    """
1639    angle_addr = AngleAddr()
1640    if value[0] in CFWS_LEADER:
1641        token, value = get_cfws(value)
1642        angle_addr.append(token)
1643    if not value or value[0] != '<':
1644        raise errors.HeaderParseError(
1645            "expected angle-addr but found '{}'".format(value))
1646    angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
1647    value = value[1:]
    # Although it is not legal per RFC 5322, SMTP uses '<>' in certain
1649    # circumstances.
1650    if value[0] == '>':
1651        angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
1652        angle_addr.defects.append(errors.InvalidHeaderDefect(
1653            "null addr-spec in angle-addr"))
1654        value = value[1:]
1655        return angle_addr, value
1656    try:
1657        token, value = get_addr_spec(value)
1658    except errors.HeaderParseError:
1659        try:
1660            token, value = get_obs_route(value)
1661            angle_addr.defects.append(errors.ObsoleteHeaderDefect(
1662                "obsolete route specification in angle-addr"))
1663        except errors.HeaderParseError:
1664            raise errors.HeaderParseError(
1665                "expected addr-spec or obs-route but found '{}'".format(value))
1666        angle_addr.append(token)
1667        token, value = get_addr_spec(value)
1668    angle_addr.append(token)
1669    if value and value[0] == '>':
1670        value = value[1:]
1671    else:
1672        angle_addr.defects.append(errors.InvalidHeaderDefect(
1673            "missing trailing '>' on angle-addr"))
1674    angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
1675    if value and value[0] in CFWS_LEADER:
1676        token, value = get_cfws(value)
1677        angle_addr.append(token)
1678    return angle_addr, value
1679
1680def get_display_name(value):
1681    """ display-name = phrase
1682
1683    Because this is simply a name-rule, we don't return a display-name
1684    token containing a phrase, but rather a display-name token with
1685    the content of the phrase.
1686
1687    """
1688    display_name = DisplayName()
1689    token, value = get_phrase(value)
1690    display_name.extend(token[:])
1691    display_name.defects = token.defects[:]
1692    return display_name, value
1693
1694
1695def get_name_addr(value):
1696    """ name-addr = [display-name] angle-addr
1697
1698    """
1699    name_addr = NameAddr()
1700    # Both the optional display name and the angle-addr can start with cfws.
1701    leader = None
1702    if value[0] in CFWS_LEADER:
1703        leader, value = get_cfws(value)
1704        if not value:
1705            raise errors.HeaderParseError(
1706                "expected name-addr but found '{}'".format(leader))
1707    if value[0] != '<':
1708        if value[0] in PHRASE_ENDS:
1709            raise errors.HeaderParseError(
1710                "expected name-addr but found '{}'".format(value))
1711        token, value = get_display_name(value)
1712        if not value:
1713            raise errors.HeaderParseError(
1714                "expected name-addr but found '{}'".format(token))
1715        if leader is not None:
1716            token[0][:0] = [leader]
1717            leader = None
1718        name_addr.append(token)
1719    token, value = get_angle_addr(value)
1720    if leader is not None:
1721        token[:0] = [leader]
1722    name_addr.append(token)
1723    return name_addr, value
1724
1725def get_mailbox(value):
1726    """ mailbox = name-addr / addr-spec
1727
1728    """
1729    # The only way to figure out if we are dealing with a name-addr or an
1730    # addr-spec is to try parsing each one.
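    # Hedged sketch of the two alternatives (addresses invented):
    #
    #   get_mailbox('Fred Bloggs <fred@example.com>')   # parsed as name-addr
    #   get_mailbox('fred@example.com')                 # parsed as addr-spec
    #
    # Either way the result is a Mailbox token; if any sub-token carries an
    # InvalidHeaderDefect, the token_type is downgraded to 'invalid-mailbox'.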
1731    mailbox = Mailbox()
1732    try:
1733        token, value = get_name_addr(value)
1734    except errors.HeaderParseError:
1735        try:
1736            token, value = get_addr_spec(value)
1737        except errors.HeaderParseError:
1738            raise errors.HeaderParseError(
1739                "expected mailbox but found '{}'".format(value))
1740    if any(isinstance(x, errors.InvalidHeaderDefect)
1741                       for x in token.all_defects):
1742        mailbox.token_type = 'invalid-mailbox'
1743    mailbox.append(token)
1744    return mailbox, value
1745
1746def get_invalid_mailbox(value, endchars):
1747    """ Read everything up to one of the chars in endchars.
1748
1749    This is outside the formal grammar.  The InvalidMailbox TokenList that is
1750    returned acts like a Mailbox, but the data attributes are None.
1751
1752    """
1753    invalid_mailbox = InvalidMailbox()
1754    while value and value[0] not in endchars:
1755        if value[0] in PHRASE_ENDS:
1756            invalid_mailbox.append(ValueTerminal(value[0],
1757                                                 'misplaced-special'))
1758            value = value[1:]
1759        else:
1760            token, value = get_phrase(value)
1761            invalid_mailbox.append(token)
1762    return invalid_mailbox, value
1763
1764def get_mailbox_list(value):
1765    """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
1766        obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])
1767
1768    For this routine we go outside the formal grammar in order to improve error
1769    handling.  We recognize the end of the mailbox list only at the end of the
1770    value or at a ';' (the group terminator).  This is so that we can turn
1771    invalid mailboxes into InvalidMailbox tokens and continue parsing any
1772    remaining valid mailboxes.  We also allow all mailbox entries to be null,
1773    and this condition is handled appropriately at a higher level.
1774
1775    """
1776    mailbox_list = MailboxList()
1777    while value and value[0] != ';':
1778        try:
1779            token, value = get_mailbox(value)
1780            mailbox_list.append(token)
1781        except errors.HeaderParseError:
1782            leader = None
1783            if value[0] in CFWS_LEADER:
1784                leader, value = get_cfws(value)
1785                if not value or value[0] in ',;':
1786                    mailbox_list.append(leader)
1787                    mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
1788                        "empty element in mailbox-list"))
1789                else:
1790                    token, value = get_invalid_mailbox(value, ',;')
1791                    if leader is not None:
1792                        token[:0] = [leader]
1793                    mailbox_list.append(token)
1794                    mailbox_list.defects.append(errors.InvalidHeaderDefect(
1795                        "invalid mailbox in mailbox-list"))
1796            elif value[0] == ',':
1797                mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
1798                    "empty element in mailbox-list"))
1799            else:
1800                token, value = get_invalid_mailbox(value, ',;')
1801                if leader is not None:
1802                    token[:0] = [leader]
1803                mailbox_list.append(token)
1804                mailbox_list.defects.append(errors.InvalidHeaderDefect(
1805                    "invalid mailbox in mailbox-list"))
1806        if value and value[0] not in ',;':
1807            # Crap after mailbox; treat it as an invalid mailbox.
1808            # The mailbox info will still be available.
1809            mailbox = mailbox_list[-1]
1810            mailbox.token_type = 'invalid-mailbox'
1811            token, value = get_invalid_mailbox(value, ',;')
1812            mailbox.extend(token)
1813            mailbox_list.defects.append(errors.InvalidHeaderDefect(
1814                "invalid mailbox in mailbox-list"))
1815        if value and value[0] == ',':
1816            mailbox_list.append(ListSeparator)
1817            value = value[1:]
1818    return mailbox_list, value
1819
1820
1821def get_group_list(value):
1822    """ group-list = mailbox-list / CFWS / obs-group-list
1823        obs-group-list = 1*([CFWS] ",") [CFWS]
1824
1825    """
1826    group_list = GroupList()
1827    if not value:
1828        group_list.defects.append(errors.InvalidHeaderDefect(
1829            "end of header before group-list"))
1830        return group_list, value
1831    leader = None
1832    if value and value[0] in CFWS_LEADER:
1833        leader, value = get_cfws(value)
1834        if not value:
1835            # This should never happen in email parsing, since CFWS-only is a
1836            # legal alternative to group-list in a group, which is the only
1837            # place group-list appears.
1838            group_list.defects.append(errors.InvalidHeaderDefect(
1839                "end of header in group-list"))
1840            group_list.append(leader)
1841            return group_list, value
1842        if value[0] == ';':
1843            group_list.append(leader)
1844            return group_list, value
1845    token, value = get_mailbox_list(value)
    if len(token.all_mailboxes) == 0:
1847        if leader is not None:
1848            group_list.append(leader)
1849        group_list.extend(token)
1850        group_list.defects.append(errors.ObsoleteHeaderDefect(
1851            "group-list with empty entries"))
1852        return group_list, value
1853    if leader is not None:
1854        token[:0] = [leader]
1855    group_list.append(token)
1856    return group_list, value
1857
1858def get_group(value):
1859    """ group = display-name ":" [group-list] ";" [CFWS]
1860
1861    """
1862    group = Group()
1863    token, value = get_display_name(value)
1864    if not value or value[0] != ':':
1865        raise errors.HeaderParseError("expected ':' at end of group "
1866            "display name but found '{}'".format(value))
1867    group.append(token)
1868    group.append(ValueTerminal(':', 'group-display-name-terminator'))
1869    value = value[1:]
1870    if value and value[0] == ';':
1871        group.append(ValueTerminal(';', 'group-terminator'))
1872        return group, value[1:]
1873    token, value = get_group_list(value)
1874    group.append(token)
1875    if not value:
1876        group.defects.append(errors.InvalidHeaderDefect(
1877            "end of header in group"))
1878    elif value[0] != ';':
1879        raise errors.HeaderParseError(
1880            "expected ';' at end of group but found {}".format(value))
1881    group.append(ValueTerminal(';', 'group-terminator'))
1882    value = value[1:]
1883    if value and value[0] in CFWS_LEADER:
1884        token, value = get_cfws(value)
1885        group.append(token)
1886    return group, value
1887
1888def get_address(value):
1889    """ address = mailbox / group
1890
1891    Note that counter-intuitively, an address can be either a single address or
1892    a list of addresses (a group).  This is why the returned Address object has
1893    a 'mailboxes' attribute which treats a single address as a list of length
    one.  When you need to differentiate between the two cases, extract the single
1895    element, which is either a mailbox or a group token.
1896
1897    """
1898    # The formal grammar isn't very helpful when parsing an address.  mailbox
1899    # and group, especially when allowing for obsolete forms, start off very
1900    # similarly.  It is only when you reach one of @, <, or : that you know
1901    # what you've got.  So, we try each one in turn, starting with the more
1902    # likely of the two.  We could perhaps make this more efficient by looking
1903    # for a phrase and then branching based on the next character, but that
1904    # would be a premature optimization.
1905    address = Address()
1906    try:
1907        token, value = get_group(value)
1908    except errors.HeaderParseError:
1909        try:
1910            token, value = get_mailbox(value)
1911        except errors.HeaderParseError:
1912            raise errors.HeaderParseError(
1913                "expected address but found '{}'".format(value))
1914    address.append(token)
1915    return address, value
1916
1917def get_address_list(value):
1918    """ address_list = (address *("," address)) / obs-addr-list
1919        obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])
1920
1921    We depart from the formal grammar here by continuing to parse until the end
1922    of the input, assuming the input to be entirely composed of an
1923    address-list.  This is always true in email parsing, and allows us
1924    to skip invalid addresses to parse additional valid ones.
1925
1926    """
1927    address_list = AddressList()
1928    while value:
1929        try:
1930            token, value = get_address(value)
1931            address_list.append(token)
1932        except errors.HeaderParseError as err:
1933            leader = None
1934            if value[0] in CFWS_LEADER:
1935                leader, value = get_cfws(value)
1936                if not value or value[0] == ',':
1937                    address_list.append(leader)
1938                    address_list.defects.append(errors.ObsoleteHeaderDefect(
1939                        "address-list entry with no content"))
1940                else:
1941                    token, value = get_invalid_mailbox(value, ',')
1942                    if leader is not None:
1943                        token[:0] = [leader]
1944                    address_list.append(Address([token]))
1945                    address_list.defects.append(errors.InvalidHeaderDefect(
1946                        "invalid address in address-list"))
1947            elif value[0] == ',':
1948                address_list.defects.append(errors.ObsoleteHeaderDefect(
1949                    "empty element in address-list"))
1950            else:
1951                token, value = get_invalid_mailbox(value, ',')
1952                if leader is not None:
1953                    token[:0] = [leader]
1954                address_list.append(Address([token]))
1955                address_list.defects.append(errors.InvalidHeaderDefect(
1956                    "invalid address in address-list"))
1957        if value and value[0] != ',':
1958            # Crap after address; treat it as an invalid mailbox.
1959            # The mailbox info will still be available.
1960            mailbox = address_list[-1][0]
1961            mailbox.token_type = 'invalid-mailbox'
1962            token, value = get_invalid_mailbox(value, ',')
1963            mailbox.extend(token)
1964            address_list.defects.append(errors.InvalidHeaderDefect(
1965                "invalid address in address-list"))
1966        if value:  # Must be a , at this point.
1967            address_list.append(ValueTerminal(',', 'list-separator'))
1968            value = value[1:]
1969    return address_list, value
1970
1971#
1972# XXX: As I begin to add additional header parsers, I'm realizing we probably
1973# have two level of parser routines: the get_XXX methods that get a token in
1974# the grammar, and parse_XXX methods that parse an entire field value.  So
1975# get_address_list above should really be a parse_ method, as probably should
1976# be get_unstructured.
1977#
1978
1979def parse_mime_version(value):
1980    """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]
1981
1982    """
1983    # The [CFWS] is implicit in the RFC 2045 BNF.
1984    # XXX: This routine is a bit verbose, should factor out a get_int method.
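    # Hedged examples (values invented):
    #
    #   parse_mime_version('1.0')
    #   # .major == 1, .minor == 0, no defects.
    #
    #   parse_mime_version('1.')
    #   # .major == 1, .minor stays None, and an InvalidHeaderDefect about the
    #   # incomplete version is recorded.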
1985    mime_version = MIMEVersion()
1986    if not value:
1987        mime_version.defects.append(errors.HeaderMissingRequiredValue(
1988            "Missing MIME version number (eg: 1.0)"))
1989        return mime_version
1990    if value[0] in CFWS_LEADER:
1991        token, value = get_cfws(value)
1992        mime_version.append(token)
1993        if not value:
1994            mime_version.defects.append(errors.HeaderMissingRequiredValue(
1995                "Expected MIME version number but found only CFWS"))
1996    digits = ''
1997    while value and value[0] != '.' and value[0] not in CFWS_LEADER:
1998        digits += value[0]
1999        value = value[1:]
2000    if not digits.isdigit():
2001        mime_version.defects.append(errors.InvalidHeaderDefect(
2002            "Expected MIME major version number but found {!r}".format(digits)))
2003        mime_version.append(ValueTerminal(digits, 'xtext'))
2004    else:
2005        mime_version.major = int(digits)
2006        mime_version.append(ValueTerminal(digits, 'digits'))
2007    if value and value[0] in CFWS_LEADER:
2008        token, value = get_cfws(value)
2009        mime_version.append(token)
2010    if not value or value[0] != '.':
2011        if mime_version.major is not None:
2012            mime_version.defects.append(errors.InvalidHeaderDefect(
2013                "Incomplete MIME version; found only major number"))
2014        if value:
2015            mime_version.append(ValueTerminal(value, 'xtext'))
2016        return mime_version
2017    mime_version.append(ValueTerminal('.', 'version-separator'))
2018    value = value[1:]
2019    if value and value[0] in CFWS_LEADER:
2020        token, value = get_cfws(value)
2021        mime_version.append(token)
2022    if not value:
2023        if mime_version.major is not None:
2024            mime_version.defects.append(errors.InvalidHeaderDefect(
2025                "Incomplete MIME version; found only major number"))
2026        return mime_version
2027    digits = ''
2028    while value and value[0] not in CFWS_LEADER:
2029        digits += value[0]
2030        value = value[1:]
2031    if not digits.isdigit():
2032        mime_version.defects.append(errors.InvalidHeaderDefect(
2033            "Expected MIME minor version number but found {!r}".format(digits)))
2034        mime_version.append(ValueTerminal(digits, 'xtext'))
2035    else:
2036        mime_version.minor = int(digits)
2037        mime_version.append(ValueTerminal(digits, 'digits'))
2038    if value and value[0] in CFWS_LEADER:
2039        token, value = get_cfws(value)
2040        mime_version.append(token)
2041    if value:
2042        mime_version.defects.append(errors.InvalidHeaderDefect(
2043            "Excess non-CFWS text after MIME version"))
2044        mime_version.append(ValueTerminal(value, 'xtext'))
2045    return mime_version
2046
2047def get_invalid_parameter(value):
2048    """ Read everything up to the next ';'.
2049
2050    This is outside the formal grammar.  The InvalidParameter TokenList that is
2051    returned acts like a Parameter, but the data attributes are None.
2052
2053    """
2054    invalid_parameter = InvalidParameter()
2055    while value and value[0] != ';':
2056        if value[0] in PHRASE_ENDS:
2057            invalid_parameter.append(ValueTerminal(value[0],
2058                                                   'misplaced-special'))
2059            value = value[1:]
2060        else:
2061            token, value = get_phrase(value)
2062            invalid_parameter.append(token)
2063    return invalid_parameter, value
2064
2065def get_ttext(value):
2066    """ttext = <matches _ttext_matcher>
2067
2068    We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
2069    defects list if we find non-ttext characters.  We also register defects for
2070    *any* non-printables even though the RFC doesn't exclude all of them,
2071    because we follow the spirit of RFC 5322.
2072
2073    """
2074    m = _non_token_end_matcher(value)
2075    if not m:
2076        raise errors.HeaderParseError(
2077            "expected ttext but found '{}'".format(value))
2078    ttext = m.group()
2079    value = value[len(ttext):]
2080    ttext = ValueTerminal(ttext, 'ttext')
2081    _validate_xtext(ttext)
2082    return ttext, value
2083
2084def get_token(value):
2085    """token = [CFWS] 1*ttext [CFWS]
2086
2087    The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
2088    tspecials.  We also exclude tabs even though the RFC doesn't.
2089
2090    The RFC implies the CFWS but is not explicit about it in the BNF.
2091
2092    """
2093    mtoken = Token()
2094    if value and value[0] in CFWS_LEADER:
2095        token, value = get_cfws(value)
2096        mtoken.append(token)
2097    if value and value[0] in TOKEN_ENDS:
2098        raise errors.HeaderParseError(
2099            "expected token but found '{}'".format(value))
2100    token, value = get_ttext(value)
2101    mtoken.append(token)
2102    if value and value[0] in CFWS_LEADER:
2103        token, value = get_cfws(value)
2104        mtoken.append(token)
2105    return mtoken, value
2106
2107def get_attrtext(value):
2108    """attrtext = 1*(any non-ATTRIBUTE_ENDS character)
2109
2110    We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
2111    token's defects list if we find non-attrtext characters.  We also register
2112    defects for *any* non-printables even though the RFC doesn't exclude all of
2113    them, because we follow the spirit of RFC 5322.
2114
2115    """
2116    m = _non_attribute_end_matcher(value)
2117    if not m:
2118        raise errors.HeaderParseError(
2119            "expected attrtext but found {!r}".format(value))
2120    attrtext = m.group()
2121    value = value[len(attrtext):]
2122    attrtext = ValueTerminal(attrtext, 'attrtext')
2123    _validate_xtext(attrtext)
2124    return attrtext, value
2125
2126def get_attribute(value):
2127    """ [CFWS] 1*attrtext [CFWS]
2128
2129    This version of the BNF makes the CFWS explicit, and as usual we use a
2130    value terminal for the actual run of characters.  The RFC equivalent of
2131    attrtext is the token characters, with the subtraction of '*', "'", and '%'.
2132    We include tab in the excluded set just as we do for token.
2133
2134    """
2135    attribute = Attribute()
2136    if value and value[0] in CFWS_LEADER:
2137        token, value = get_cfws(value)
2138        attribute.append(token)
2139    if value and value[0] in ATTRIBUTE_ENDS:
2140        raise errors.HeaderParseError(
2141            "expected token but found '{}'".format(value))
2142    token, value = get_attrtext(value)
2143    attribute.append(token)
2144    if value and value[0] in CFWS_LEADER:
2145        token, value = get_cfws(value)
2146        attribute.append(token)
2147    return attribute, value
2148
2149def get_extended_attrtext(value):
2150    """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
2151
2152    This is a special parsing routine so that we get a value that
2153    includes % escapes as a single string (which we decode as a single
2154    string later).
2155
2156    """
2157    m = _non_extended_attribute_end_matcher(value)
2158    if not m:
2159        raise errors.HeaderParseError(
2160            "expected extended attrtext but found {!r}".format(value))
2161    attrtext = m.group()
2162    value = value[len(attrtext):]
2163    attrtext = ValueTerminal(attrtext, 'extended-attrtext')
2164    _validate_xtext(attrtext)
2165    return attrtext, value
2166
2167def get_extended_attribute(value):
2168    """ [CFWS] 1*extended_attrtext [CFWS]
2169
2170    This is like the non-extended version except we allow % characters, so that
2171    we can pick up an encoded value as a single string.
2172
2173    """
2174    # XXX: should we have an ExtendedAttribute TokenList?
2175    attribute = Attribute()
2176    if value and value[0] in CFWS_LEADER:
2177        token, value = get_cfws(value)
2178        attribute.append(token)
2179    if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
2180        raise errors.HeaderParseError(
2181            "expected token but found '{}'".format(value))
2182    token, value = get_extended_attrtext(value)
2183    attribute.append(token)
2184    if value and value[0] in CFWS_LEADER:
2185        token, value = get_cfws(value)
2186        attribute.append(token)
2187    return attribute, value
2188
2189def get_section(value):
2190    """ '*' digits
2191
2192    The formal BNF is more complicated because leading 0s are not allowed.  We
2193    check for that and add a defect.  We also assume no CFWS is allowed between
2194    the '*' and the digits, though the RFC is not crystal clear on that.
2195    The caller should already have dealt with leading CFWS.
2196
2197    """
2198    section = Section()
2199    if not value or value[0] != '*':
2200        raise errors.HeaderParseError("Expected section but found {}".format(
2201                                        value))
2202    section.append(ValueTerminal('*', 'section-marker'))
2203    value = value[1:]
2204    if not value or not value[0].isdigit():
2205        raise errors.HeaderParseError("Expected section number but "
2206                                      "found {}".format(value))
2207    digits = ''
2208    while value and value[0].isdigit():
2209        digits += value[0]
2210        value = value[1:]
2211    if digits[0] == '0' and digits != '0':
        section.defects.append(errors.InvalidHeaderDefect(
2213                "section number has an invalid leading 0"))
2214    section.number = int(digits)
2215    section.append(ValueTerminal(digits, 'digits'))
2216    return section, value
2217
2218
2219def get_value(value):
2220    """ quoted-string / attribute
2221
2222    """
2223    v = Value()
2224    if not value:
2225        raise errors.HeaderParseError("Expected value but found end of string")
2226    leader = None
2227    if value[0] in CFWS_LEADER:
2228        leader, value = get_cfws(value)
2229    if not value:
2230        raise errors.HeaderParseError("Expected value but found "
2231                                      "only {}".format(leader))
2232    if value[0] == '"':
2233        token, value = get_quoted_string(value)
2234    else:
2235        token, value = get_extended_attribute(value)
2236    if leader is not None:
2237        token[:0] = [leader]
2238    v.append(token)
2239    return v, value
2240
2241def get_parameter(value):
2242    """ attribute [section] ["*"] [CFWS] "=" value
2243
2244    The CFWS is implied by the RFC but not made explicit in the BNF.  This
2245    simplified form of the BNF from the RFC is made to conform with the RFC BNF
2246    through some extra checks.  We do it this way because it makes both error
2247    recovery and working with the resulting parse tree easier.
2248    """
2249    # It is possible CFWS would also be implicitly allowed between the section
    # and the 'extended-attribute' marker (the '*'), but we've never seen that
2251    # in the wild and we will therefore ignore the possibility.
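    # Hedged sketches of the parameter shapes handled below (all invented):
    #
    #   get_parameter('charset="utf-8"; next')       # plain attribute=value
    #   get_parameter("title*=us-ascii'en'%20Doc")   # RFC 2231 extended value
    #   get_parameter('name*0="part one"; next')     # RFC 2231 sectioned value
    #
    # The extended form sets param.extended (plus charset/lang when present);
    # the sectioned form sets param.sectioned and param.section_number.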
2252    param = Parameter()
2253    token, value = get_attribute(value)
2254    param.append(token)
2255    if not value or value[0] == ';':
2256        param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
2257            "name ({}) but no value".format(token)))
2258        return param, value
2259    if value[0] == '*':
2260        try:
2261            token, value = get_section(value)
2262            param.sectioned = True
2263            param.append(token)
2264        except errors.HeaderParseError:
2265            pass
2266        if not value:
2267            raise errors.HeaderParseError("Incomplete parameter")
2268        if value[0] == '*':
2269            param.append(ValueTerminal('*', 'extended-parameter-marker'))
2270            value = value[1:]
2271            param.extended = True
2272    if value[0] != '=':
2273        raise errors.HeaderParseError("Parameter not followed by '='")
2274    param.append(ValueTerminal('=', 'parameter-separator'))
2275    value = value[1:]
2276    leader = None
2277    if value and value[0] in CFWS_LEADER:
2278        token, value = get_cfws(value)
2279        param.append(token)
2280    remainder = None
2281    appendto = param
2282    if param.extended and value and value[0] == '"':
2283        # Now for some serious hackery to handle the common invalid case of
2284        # double quotes around an extended value.  We also accept (with defect)
2285        # a value marked as encoded that isn't really.
2286        qstring, remainder = get_quoted_string(value)
2287        inner_value = qstring.stripped_value
2288        semi_valid = False
2289        if param.section_number == 0:
2290            if inner_value and inner_value[0] == "'":
2291                semi_valid = True
2292            else:
2293                token, rest = get_attrtext(inner_value)
2294                if rest and rest[0] == "'":
2295                    semi_valid = True
2296        else:
2297            try:
2298                token, rest = get_extended_attrtext(inner_value)
            except errors.HeaderParseError:
2300                pass
2301            else:
2302                if not rest:
2303                    semi_valid = True
2304        if semi_valid:
2305            param.defects.append(errors.InvalidHeaderDefect(
2306                "Quoted string value for extended parameter is invalid"))
2307            param.append(qstring)
2308            for t in qstring:
2309                if t.token_type == 'bare-quoted-string':
2310                    t[:] = []
2311                    appendto = t
2312                    break
2313            value = inner_value
2314        else:
2315            remainder = None
2316            param.defects.append(errors.InvalidHeaderDefect(
2317                "Parameter marked as extended but appears to have a "
2318                "quoted string value that is non-encoded"))
2319    if value and value[0] == "'":
2320        token = None
2321    else:
2322        token, value = get_value(value)
2323    if not param.extended or param.section_number > 0:
2324        if not value or value[0] != "'":
2325            appendto.append(token)
2326            if remainder is not None:
2327                assert not value, value
2328                value = remainder
2329            return param, value
2330        param.defects.append(errors.InvalidHeaderDefect(
2331            "Apparent initial-extended-value but attribute "
2332            "was not marked as extended or was not initial section"))
2333    if not value:
2334        # Assume the charset/lang is missing and the token is the value.
2335        param.defects.append(errors.InvalidHeaderDefect(
2336            "Missing required charset/lang delimiters"))
2337        appendto.append(token)
2338        if remainder is None:
2339            return param, value
2340    else:
2341        if token is not None:
2342            for t in token:
2343                if t.token_type == 'extended-attrtext':
2344                    break
            t.token_type = 'attrtext'
2346            appendto.append(t)
2347            param.charset = t.value
2348        if value[0] != "'":
2349            raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2350                                          "delimiter, but found {!r}".format(value))
2351        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
2352        value = value[1:]
2353        if value and value[0] != "'":
2354            token, value = get_attrtext(value)
2355            appendto.append(token)
2356            param.lang = token.value
2357            if not value or value[0] != "'":
2358                raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2359                                  "delimiter, but found {}".format(value))
2360        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
2361        value = value[1:]
2362    if remainder is not None:
2363        # Treat the rest of value as bare quoted string content.
2364        v = Value()
2365        while value:
2366            if value[0] in WSP:
2367                token, value = get_fws(value)
2368            else:
2369                token, value = get_qcontent(value)
2370            v.append(token)
2371        token = v
2372    else:
2373        token, value = get_value(value)
2374    appendto.append(token)
2375    if remainder is not None:
2376        assert not value, value
2377        value = remainder
2378    return param, value
2379
2380def parse_mime_parameters(value):
2381    """ parameter *( ";" parameter )
2382
2383    That BNF is meant to indicate this routine should only be called after
2384    finding and handling the leading ';'.  There is no corresponding rule in
2385    the formal RFC grammar, but it is more convenient for us for the set of
2386    parameters to be treated as its own TokenList.
2387
    This is a 'parse' routine because it consumes the remaining value, but it
2389    would never be called to parse a full header.  Instead it is called to
2390    parse everything after the non-parameter value of a specific MIME header.
2391
2392    """
2393    mime_parameters = MimeParameters()
2394    while value:
2395        try:
2396            token, value = get_parameter(value)
2397            mime_parameters.append(token)
2398        except errors.HeaderParseError as err:
2399            leader = None
2400            if value[0] in CFWS_LEADER:
2401                leader, value = get_cfws(value)
2402            if not value:
2403                mime_parameters.append(leader)
2404                return mime_parameters
2405            if value[0] == ';':
2406                if leader is not None:
2407                    mime_parameters.append(leader)
2408                mime_parameters.defects.append(errors.InvalidHeaderDefect(
2409                    "parameter entry with no content"))
2410            else:
2411                token, value = get_invalid_parameter(value)
2412                if leader:
2413                    token[:0] = [leader]
2414                mime_parameters.append(token)
2415                mime_parameters.defects.append(errors.InvalidHeaderDefect(
2416                    "invalid parameter {!r}".format(token)))
2417        if value and value[0] != ';':
2418            # Junk after the otherwise valid parameter.  Mark it as
2419            # invalid, but it will have a value.
2420            param = mime_parameters[-1]
2421            param.token_type = 'invalid-parameter'
2422            token, value = get_invalid_parameter(value)
2423            param.extend(token)
2424            mime_parameters.defects.append(errors.InvalidHeaderDefect(
2425                "parameter with invalid trailing text {!r}".format(token)))
2426        if value:
2427            # Must be a ';' at this point.
2428            mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
2429            value = value[1:]
2430    return mime_parameters
2431
2432def _find_mime_parameters(tokenlist, value):
2433    """Do our best to find the parameters in an invalid MIME header
2434
2435    """
2436    while value and value[0] != ';':
2437        if value[0] in PHRASE_ENDS:
2438            tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
2439            value = value[1:]
2440        else:
2441            token, value = get_phrase(value)
2442            tokenlist.append(token)
2443    if not value:
2444        return
2445    tokenlist.append(ValueTerminal(';', 'parameter-separator'))
2446    tokenlist.append(parse_mime_parameters(value[1:]))
2447
2448def parse_content_type_header(value):
2449    """ maintype "/" subtype *( ";" parameter )
2450
    The maintype and subtype are tokens.  Theoretically they could
2452    be checked against the official IANA list + x-token, but we
2453    don't do that.
2454    """
2455    ctype = ContentType()
2456    recover = False
2457    if not value:
2458        ctype.defects.append(errors.HeaderMissingRequiredValue(
2459            "Missing content type specification"))
2460        return ctype
2461    try:
2462        token, value = get_token(value)
2463    except errors.HeaderParseError:
2464        ctype.defects.append(errors.InvalidHeaderDefect(
2465            "Expected content maintype but found {!r}".format(value)))
2466        _find_mime_parameters(ctype, value)
2467        return ctype
2468    ctype.append(token)
2469    # XXX: If we really want to follow the formal grammar we should make
    # maintype and subtype specialized TokenLists here.  Probably not worth it.
2471    if not value or value[0] != '/':
2472        ctype.defects.append(errors.InvalidHeaderDefect(
2473            "Invalid content type"))
2474        if value:
2475            _find_mime_parameters(ctype, value)
2476        return ctype
2477    ctype.maintype = token.value.strip().lower()
2478    ctype.append(ValueTerminal('/', 'content-type-separator'))
2479    value = value[1:]
2480    try:
2481        token, value = get_token(value)
2482    except errors.HeaderParseError:
2483        ctype.defects.append(errors.InvalidHeaderDefect(
2484            "Expected content subtype but found {!r}".format(value)))
2485        _find_mime_parameters(ctype, value)
2486        return ctype
2487    ctype.append(token)
2488    ctype.subtype = token.value.strip().lower()
2489    if not value:
2490        return ctype
2491    if value[0] != ';':
2492        ctype.defects.append(errors.InvalidHeaderDefect(
2493            "Only parameters are valid after content type, but "
2494            "found {!r}".format(value)))
2495        # The RFC requires that a syntactically invalid content-type be treated
2496        # as text/plain.  Perhaps we should postel this, but we should probably
2497        # only do that if we were checking the subtype value against IANA.
2498        del ctype.maintype, ctype.subtype
2499        _find_mime_parameters(ctype, value)
2500        return ctype
2501    ctype.append(ValueTerminal(';', 'parameter-separator'))
2502    ctype.append(parse_mime_parameters(value[1:]))
2503    return ctype
2504
2505def parse_content_disposition_header(value):
2506    """ disposition-type *( ";" parameter )
2507
2508    """
2509    disp_header = ContentDisposition()
2510    if not value:
2511        disp_header.defects.append(errors.HeaderMissingRequiredValue(
2512            "Missing content disposition"))
2513        return disp_header
2514    try:
2515        token, value = get_token(value)
2516    except errors.HeaderParseError:
2517        disp_header.defects.append(errors.InvalidHeaderDefect(
2518            "Expected content disposition but found {!r}".format(value)))
2519        _find_mime_parameters(disp_header, value)
2520        return disp_header
2521    disp_header.append(token)
2522    disp_header.content_disposition = token.value.strip().lower()
2523    if not value:
2524        return disp_header
2525    if value[0] != ';':
2526        disp_header.defects.append(errors.InvalidHeaderDefect(
2527            "Only parameters are valid after content disposition, but "
2528            "found {!r}".format(value)))
2529        _find_mime_parameters(disp_header, value)
2530        return disp_header
2531    disp_header.append(ValueTerminal(';', 'parameter-separator'))
2532    disp_header.append(parse_mime_parameters(value[1:]))
2533    return disp_header
2534
2535def parse_content_transfer_encoding_header(value):
2536    """ mechanism
2537
2538    """
2539    # We should probably validate the values, since the list is fixed.
2540    cte_header = ContentTransferEncoding()
2541    if not value:
2542        cte_header.defects.append(errors.HeaderMissingRequiredValue(
2543            "Missing content transfer encoding"))
2544        return cte_header
2545    try:
2546        token, value = get_token(value)
2547    except errors.HeaderParseError:
2548        cte_header.defects.append(errors.InvalidHeaderDefect(
2549            "Expected content transfer encoding but found {!r}".format(value)))
2550    else:
2551        cte_header.append(token)
2552        cte_header.cte = token.value.strip().lower()
2553    if not value:
2554        return cte_header
2555    while value:
2556        cte_header.defects.append(errors.InvalidHeaderDefect(
2557            "Extra text after content transfer encoding"))
2558        if value[0] in PHRASE_ENDS:
2559            cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
2560            value = value[1:]
2561        else:
2562            token, value = get_phrase(value)
2563            cte_header.append(token)
2564    return cte_header
2565
2566
2567#
2568# Header folding
2569#
2570# Header folding is complex, with lots of rules and corner cases.  The
2571# following code does its best to obey the rules and handle the corner
# cases, but you can be sure there are a few bugs :)
2573#
2574# This folder generally canonicalizes as it goes, preferring the stringified
2575# version of each token.  The tokens contain information that supports the
2576# folder, including which tokens can be encoded in which ways.
2577#
2578# Folded text is accumulated in a simple list of strings ('lines'), each
2579# one of which should be less than policy.max_line_length ('maxlen').
2580#
2581
2582def _steal_trailing_WSP_if_exists(lines):
2583    wsp = ''
2584    if lines and lines[-1] and lines[-1][-1] in WSP:
2585        wsp = lines[-1][-1]
2586        lines[-1] = lines[-1][:-1]
2587    return wsp
2588
2589def _refold_parse_tree(parse_tree, *, policy):
2590    """Return string of contents of parse_tree folded according to RFC rules.
2591
2592    """
2593    # max_line_length 0/None means no limit, ie: infinitely long.
2594    maxlen = policy.max_line_length or float("+inf")
2595    encoding = 'utf-8' if policy.utf8 else 'us-ascii'
2596    lines = ['']
2597    last_ew = None
2598    wrap_as_ew_blocked = 0
2599    want_encoding = False
2600    end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
2601    parts = list(parse_tree)
2602    while parts:
2603        part = parts.pop(0)
2604        if part is end_ew_not_allowed:
2605            wrap_as_ew_blocked -= 1
2606            continue
2607        tstr = str(part)
2608        try:
2609            tstr.encode(encoding)
2610            charset = encoding
2611        except UnicodeEncodeError:
2612            if any(isinstance(x, errors.UndecodableBytesDefect)
2613                   for x in part.all_defects):
2614                charset = 'unknown-8bit'
2615            else:
2616                # If policy.utf8 is false this should really be taken from a
2617                # 'charset' property on the policy.
2618                charset = 'utf-8'
2619            want_encoding = True
2620        if part.token_type == 'mime-parameters':
2621            # Mime parameter folding (using RFC2231) is extra special.
2622            _fold_mime_parameters(part, lines, maxlen, encoding)
2623            continue
2624        if want_encoding and not wrap_as_ew_blocked:
2625            if not part.as_ew_allowed:
2626                want_encoding = False
2627                last_ew = None
2628                if part.syntactic_break:
2629                    encoded_part = part.fold(policy=policy)[:-1] # strip nl
2630                    if policy.linesep not in encoded_part:
2631                        # It fits on a single line
2632                        if len(encoded_part) > maxlen - len(lines[-1]):
2633                            # But not on this one, so start a new one.
2634                            newline = _steal_trailing_WSP_if_exists(lines)
2635                            # XXX what if encoded_part has no leading FWS?
2636                            lines.append(newline)
2637                        lines[-1] += encoded_part
2638                        continue
2639                # Either this is not a major syntactic break, so we don't
2640                # want it on a line by itself even if it fits, or it
2641                # doesn't fit on a line by itself.  Either way, fall through
2642                # to unpacking the subparts and wrapping them.
2643            if not hasattr(part, 'encode'):
2644                # It's not a Terminal, do each piece individually.
2645                parts = list(part) + parts
2646            else:
2647                # It's a terminal, wrap it as an encoded word, possibly
2648                # combining it with previously encoded words if allowed.
2649                last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
2650                                      part.ew_combine_allowed, charset)
2651            want_encoding = False
2652            continue
2653        if len(tstr) <= maxlen - len(lines[-1]):
2654            lines[-1] += tstr
2655            continue
2656        # This part is too long to fit.  The RFC wants us to break at
2657        # "major syntactic breaks", so unless we don't consider this
2658        # to be one, check if it will fit on the next line by itself.
2659        if (part.syntactic_break and
2660                len(tstr) + 1 <= maxlen):
2661            newline = _steal_trailing_WSP_if_exists(lines)
2662            if newline or part.startswith_fws():
2663                lines.append(newline + tstr)
2664                continue
2665        if not hasattr(part, 'encode'):
2666            # It's not a terminal, try folding the subparts.
2667            newparts = list(part)
2668            if not part.as_ew_allowed:
2669                wrap_as_ew_blocked += 1
2670                newparts.append(end_ew_not_allowed)
2671            parts = newparts + parts
2672            continue
2673        if part.as_ew_allowed and not wrap_as_ew_blocked:
2674            # It doesn't need CTE encoding, but encode it anyway so we can
2675            # wrap it.
2676            parts.insert(0, part)
2677            want_encoding = True
2678            continue
        # We can't figure out how to wrap it, so give up.
2680        newline = _steal_trailing_WSP_if_exists(lines)
2681        if newline or part.startswith_fws():
2682            lines.append(newline + tstr)
2683        else:
2684            # We can't fold it onto the next line either...
2685            lines[-1] += tstr
2686    return policy.linesep.join(lines) + policy.linesep
2687
2688def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
2689    """Fold string to_encode into lines as encoded word, combining if allowed.
2690    Return the new value for last_ew, or None if ew_combine_allowed is False.
2691
2692    If there is already an encoded word in the last line of lines (indicated by
2693    a non-None value for last_ew) and ew_combine_allowed is true, decode the
2694    existing ew, combine it with to_encode, and re-encode.  Otherwise, encode
2695    to_encode.  In either case, split to_encode as necessary so that the
2696    encoded segments fit within maxlen.
2697
2698    """
2699    if last_ew is not None and ew_combine_allowed:
2700        to_encode = str(
2701            get_unstructured(lines[-1][last_ew:] + to_encode))
2702        lines[-1] = lines[-1][:last_ew]
2703    if to_encode[0] in WSP:
2704        # We're joining this to non-encoded text, so don't encode
2705        # the leading blank.
2706        leading_wsp = to_encode[0]
2707        to_encode = to_encode[1:]
2708        if (len(lines[-1]) == maxlen):
2709            lines.append(_steal_trailing_WSP_if_exists(lines))
2710        lines[-1] += leading_wsp
2711    trailing_wsp = ''
2712    if to_encode[-1] in WSP:
2713        # Likewise for the trailing space.
2714        trailing_wsp = to_encode[-1]
2715        to_encode = to_encode[:-1]
2716    new_last_ew = len(lines[-1]) if last_ew is None else last_ew
2717    while to_encode:
2718        remaining_space = maxlen - len(lines[-1])
2719        # The RFC2047 chrome takes up 7 characters plus the length
2720        # of the charset name.
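        # For example, '=?utf-8?q?data?=' spends '=?' + '?q?' + '?=' (7 chars)
        # plus len('utf-8') on chrome, leaving the rest of remaining_space for
        # the encoded text itself.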
2721        encode_as = 'utf-8' if charset == 'us-ascii' else charset
2722        text_space = remaining_space - len(encode_as) - 7
2723        if text_space <= 0:
2724            lines.append(' ')
2725            # XXX We'll get an infinite loop here if maxlen is <= 7
2726            continue
2727        first_part = to_encode[:text_space]
2728        ew = _ew.encode(first_part, charset=encode_as)
2729        excess = len(ew) - remaining_space
2730        if excess > 0:
2731            # encode always chooses the shortest encoding, so this
2732            # is guaranteed to fit at this point.
2733            first_part = first_part[:-excess]
            ew = _ew.encode(first_part, charset=encode_as)
2735        lines[-1] += ew
2736        to_encode = to_encode[len(first_part):]
2737        if to_encode:
2738            lines.append(' ')
2739            new_last_ew = len(lines[-1])
2740    lines[-1] += trailing_wsp
2741    return new_last_ew if ew_combine_allowed else None
2742
2743def _fold_mime_parameters(part, lines, maxlen, encoding):
2744    """Fold TokenList 'part' into the 'lines' list as mime parameters.
2745
2746    Using the decoded list of parameters and values, format them according to
2747    the RFC rules, including using RFC2231 encoding if the value cannot be
2748    expressed in 'encoding' and/or the parameter+value is too long to fit
2749    within 'maxlen'.
2750
2751    """
2752    # Special case for RFC2231 encoding: start from decoded values and use
2753    # RFC2231 encoding iff needed.
2754    #
2755    # Note that the 1 and 2s being added to the length calculations are
2756    # accounting for the possibly-needed spaces and semicolons we'll be adding.
2757    #
2758    for name, value in part.params:
2759        # XXX What if this ';' puts us over maxlen the first time through the
2760        # loop?  We should split the header value onto a newline in that case,
2761        # but to do that we need to recognize the need earlier or reparse the
2762        # header, so I'm going to ignore that bug for now.  It'll only put us
2763        # one character over.
2764        if not lines[-1].rstrip().endswith(';'):
2765            lines[-1] += ';'
2766        charset = encoding
2767        error_handler = 'strict'
2768        try:
2769            value.encode(encoding)
2770            encoding_required = False
2771        except UnicodeEncodeError:
2772            encoding_required = True
2773            if utils._has_surrogates(value):
2774                charset = 'unknown-8bit'
2775                error_handler = 'surrogateescape'
2776            else:
2777                charset = 'utf-8'
2778        if encoding_required:
2779            encoded_value = urllib.parse.quote(
2780                value, safe='', errors=error_handler)
2781            tstr = "{}*={}''{}".format(name, charset, encoded_value)
2782        else:
2783            tstr = '{}={}'.format(name, quote_string(value))
2784        if len(lines[-1]) + len(tstr) + 1 < maxlen:
2785            lines[-1] = lines[-1] + ' ' + tstr
2786            continue
2787        elif len(tstr) + 2 <= maxlen:
2788            lines.append(' ' + tstr)
2789            continue
2790        # We need multiple sections.  We are allowed to mix encoded and
2791        # non-encoded sections, but we aren't going to.  We'll encode them all.
2792        section = 0
2793        extra_chrome = charset + "''"
2794        while value:
2795            chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
2796            if maxlen <= chrome_len + 3:
2797                # We need room for the leading blank, the trailing semicolon,
2798                # and at least one character of the value.  If we don't
2799                # have that, we'd be stuck, so in that case fall back to
2800                # the RFC standard width.
2801                maxlen = 78
2802            splitpoint = maxchars = maxlen - chrome_len - 2
2803            while True:
2804                partial = value[:splitpoint]
2805                encoded_value = urllib.parse.quote(
2806                    partial, safe='', errors=error_handler)
2807                if len(encoded_value) <= maxchars:
2808                    break
2809                splitpoint -= 1
2810            lines.append(" {}*{}*={}{}".format(
2811                name, section, extra_chrome, encoded_value))
2812            extra_chrome = ''
2813            section += 1
2814            value = value[splitpoint:]
2815            if value:
2816                lines[-1] += ';'
2817