1# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
2
3import stringprep, re, codecs
4from unicodedata import ucd_3_2_0 as unicodedata
5
# IDNA section 3.1
# Matches any single label separator: U+002E (full stop), U+3002
# (ideographic full stop), U+FF0E (fullwidth full stop) or U+FF61
# (halfwidth ideographic full stop).
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
# The ACE prefix in two flavours: bytes, for prepending to / matching
# against encoded labels, and str, for testing labels that are still text.
ace_prefix = b"xn--"
sace_prefix = "xn--"
12
13# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label is mapped (table B.1 characters are dropped, table B.2
    mapping is applied to the rest), normalized with NFKC against the
    Unicode 3.2.0 database, scanned for prohibited characters and finally
    checked against the bidirectional-text requirements.  Unassigned code
    points are not rejected (query-string behaviour, AllowUnassigned).

    Args:
        label: the label text, as a str.

    Returns:
        The prepared label, as a str.

    Raises:
        UnicodeError: if the label contains a prohibited character or
            violates one of the bidi requirements.
    """
    # Map: drop "map to nothing" characters, apply table B.2 to the rest.
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    prohibited_tables = (
        stringprep.in_table_c12,
        stringprep.in_table_c22,
        stringprep.in_table_c3,
        stringprep.in_table_c4,
        stringprep.in_table_c5,
        stringprep.in_table_c6,
        stringprep.in_table_c7,
        stringprep.in_table_c8,
        stringprep.in_table_c9,
    )
    for c in label:
        if any(in_table(c) for in_table in prohibited_tables):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # The requirements below concern the label as a whole, so they are
    # evaluated once rather than once per RandALCat character; repeating
    # them made all-RandALCat labels cost quadratic time.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
61
def ToASCII(label):
    """Convert one label to its ASCII form (RFC 3490, ToASCII).

    A label that is already ASCII is returned as bytes unchanged.  Any
    other label is run through nameprep(); if it is still not ASCII it is
    punycode-encoded and given the ACE prefix.

    Args:
        label: the label text, as a str.

    Returns:
        The label as bytes, between 1 and 63 bytes long.

    Raises:
        UnicodeError: if the result is empty or 64 bytes or longer, if the
            prepared non-ASCII label already starts with the ACE prefix,
            or if nameprep() rejects the label.
    """
    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        encoded = None

    if encoded is None:
        # Step 2: nameprep
        prepared = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII again
        try:
            encoded = prepared.encode("ascii")
        except UnicodeError:
            encoded = None

        if encoded is None:
            # Step 5: Check ACE prefix
            if prepared.startswith(sace_prefix):
                raise UnicodeError("Label starts with ACE prefix")

            # Steps 6 and 7: encode with PUNYCODE, prepend the ACE prefix
            encoded = ace_prefix + prepared.encode("punycode")

    # Step 8: Check size (reached from every path above)
    if not 0 < len(encoded) < 64:
        raise UnicodeError("label empty or too long")
    return encoded
104
def ToUnicode(label):
    """Convert one label to its Unicode form (RFC 3490, ToUnicode).

    Args:
        label: the label, either bytes (taken to be ASCII) or str.

    Returns:
        The label as a str.  A label without the ACE prefix is returned
        as its ASCII text; a label with the prefix is punycode-decoded.

    Raises:
        UnicodeError: if a non-ASCII str label is still not ASCII after
            nameprep(), if the bytes are not ASCII, if punycode decoding
            fails, or if re-encoding the decoded text with ToASCII() does
            not give back the (lower-cased) input.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    # The prefix is "xn--" in any capitalization, so compare it
    # case-insensitively; the rest of the label is left untouched.
    if label[:len(ace_prefix)].lower() != ace_prefix:
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
143
144### Codec APIs
145
class Codec(codecs.Codec):
    """IDNA codec: str domain names to ASCII bytes and back.

    Only the 'strict' error handler is supported by either direction.
    """

    def encode(self, input, errors='strict'):
        """Encode a domain name to its ASCII form.

        Args:
            input: the domain name, as a str.
            errors: error handler name; must be 'strict'.

        Returns:
            A ``(bytes, len(input))`` tuple.

        Raises:
            UnicodeError: if *errors* is not 'strict', or if a label is
                empty or too long (only a final empty label, i.e. a
                trailing dot, is accepted), or if ToASCII() rejects a
                label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty (trailing dot) but not too long.
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        # Join with U+002E, whichever separator the input used.
        encoded = b'.'.join(ToASCII(label) for label in labels)
        return encoded + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode an ASCII domain name to its Unicode form.

        Args:
            input: the encoded domain name; anything that is not bytes is
                passed through ``bytes()`` first.
            errors: error handler name; must be 'strict'.

        Returns:
            A ``(str, len(input))`` tuple.

        Raises:
            UnicodeError: if *errors* is not 'strict' or if ToUnicode()
                rejects a label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        # The ACE prefix is "xn--" in any capitalization, so look for it
        # case-insensitively before taking the shortcut.
        if ace_prefix not in input.lower():
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and not labels[-1]:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = [ToUnicode(label) for label in labels]

        return ".".join(result)+trailing_dot, len(input)
217
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Args:
            input: the buffered text, as a str.
            errors: error handler name; must be 'strict'.
            final: true if no more input will follow.

        Returns:
            A ``(bytes, consumed)`` tuple, where *consumed* is the number
            of characters of *input* covered by the returned bytes.

        Raises:
            UnicodeError: if *errors* is not 'strict' or if ToASCII()
                rejects a label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        pending = dots.split(input)
        tail = b''
        if pending:
            last = pending.pop()
            if not last:
                # Input ended on a separator: every label is complete.
                tail = b'.'
            elif final:
                # Nothing more will come, so the last label is complete too.
                pending.append(last)
            elif pending:
                # Keep potentially unfinished label until the next call,
                # but emit the separator that precedes it.
                tail = b'.'

        out = bytearray()
        consumed = 0
        for text in pending:
            if consumed:
                # Join with U+002E
                out += b'.'
                consumed += 1
            out += ToASCII(text)
            consumed += len(text)

        out += tail
        return (bytes(out), consumed + len(tail))
252
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        Args:
            input: the buffered data; a str is split on any IDNA label
                separator, anything else is decoded as ASCII and split
                on ".".
            errors: error handler name; must be 'strict'.
            final: true if no more input will follow.

        Returns:
            A ``(str, consumed)`` tuple, where *consumed* is the number of
            items of *input* covered by the returned text.

        Raises:
            UnicodeError: if *errors* is not 'strict', if non-str input is
                not ASCII, or if ToUnicode() rejects a label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            result.append(ToUnicode(label))
            # Count the separator in front of every label but the first.
            # Keying this on the label index rather than on the running
            # size keeps the count right when leading labels are empty.
            if index:
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
291
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer for the IDNA codec.

    Defines nothing of its own; everything is inherited from Codec and
    codecs.StreamWriter.
    """
294
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader for the IDNA codec.

    Defines nothing of its own; everything is inherited from Codec and
    codecs.StreamReader.
    """
297
298### encodings module API
299
def getregentry():
    """Return the codecs.CodecInfo record describing the 'idna' codec."""
    # The stateless entry points are bound methods of two separate Codec
    # instances.
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
310