1#!/usr/bin/env python
2# -*- coding: utf-8
3#
4# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
5# file at the top-level directory of this distribution and at
6# http://rust-lang.org/COPYRIGHT.
7#
8# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
9# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
10# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
11# option. This file may not be copied, modified, or distributed
12# except according to those terms.
13
# This script uses the following Unicode tables:
# - auxiliary/GraphemeBreakTest.txt
# - auxiliary/WordBreakTest.txt
# - auxiliary/SentenceBreakTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated testdata.rs file into git.
20from __future__ import print_function
21
22import unicode, re, os, fileinput
23
def load_test_data(f, optsplit=[]):
    """Parse one Unicode break-test file into (clusters, rules) pairs.

    f        -- path of the test file relative to the UCD directory; it is
                handed to unicode.fetch() and then read from the current
                directory under its base name.
    optsplit -- rule numbers that count as a split even when the test file
                marks them with the no-break sign; passed through unchanged
                to process_split_info().

    Returns a list of (clusters, rules) tuples, one per usable test line,
    where clusters is a list of code point lists and rules holds the rule
    number recorded for each boundary between consecutive clusters.
    Lines containing surrogates are skipped; lines that start with the
    break sign but do not match the expected layout are reported on stdout
    and skipped.
    """
    # group 1: the code points and break markers of the test string
    # group 2: the per-boundary rule annotations found after the '#'
    test_line = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

    unicode.fetch(f)
    results = []
    for raw in fileinput.input(os.path.basename(f)):
        # only lines that begin with the ÷ character carry a test
        if not (len(raw) >= 2 and raw.startswith('÷')):
            continue

        parsed = test_line.match(raw)
        if parsed is None:
            print("error: no match on line where test was expected: %s" % raw)
            continue

        # an empty result means the line held invalid characters
        # (viz., surrogates), so there is nothing to record for it
        codepoints = process_split_string(parsed.group(1))
        if codepoints:
            clusters, rules = process_split_info(parsed.group(2), codepoints, optsplit)

            # every boundary between clusters must have break info
            assert len(clusters) - 1 == len(rules)

            results.append((clusters, rules))

    return results
54
def process_split_info(s, c, o):
    """Regroup code point lists according to the break-rule annotations.

    s -- annotation text: a run of entries, each starting with the break
         sign (÷) or the no-break sign (×) and followed by a bracketed
         rule number, e.g. '× [9.0] ... ÷ [999.0] ...'.  There is one
         entry per boundary between consecutive items of c.
    c -- list of code point lists, one more item than there are entries
         in s.  It is only read; neither the list nor its items are
         modified.
    o -- container of rule numbers (strings) that are treated as a split
         even when they are marked with ×.

    Returns (clusters, rules): clusters is the list of merged code point
    lists and rules is the rule number recorded at each split, so that
    len(clusters) == len(rules) + 1.

    Raises IndexError when c has too few items for the entries in s, and
    ValueError when an entry has no bracketed rule number.
    """
    # work on copies so the caller's data survives the regrouping
    groups = [list(g) for g in c]
    clusters = []
    rules = []
    current = groups[0]
    upcoming = 1

    # are we on a × or a ÷?
    is_join = s.startswith('×')

    # walk over each instance of '(÷|×) [x.y] ' using a moving offset
    # instead of repeatedly slicing the remaining text
    pos = 0
    end = len(s)
    while pos < end:
        # the currently considered rule number
        rule = s[s.index('[', pos) + 1:s.index(']', pos)]

        # ÷ is always a split; × is a split only when its rule is in o
        if not is_join or rule in o:
            rules.append(rule)
            clusters.append(current)
            current = groups[upcoming]
        else:
            current.extend(groups[upcoming])
        upcoming += 1

        # advance to the next marker, or to the end of the text
        pos += 1
        while pos < end:
            if s.startswith('×', pos):
                is_join = True
                break
            if s.startswith('÷', pos):
                is_join = False
                break
            pos += 1

    clusters.append(current)
    return (clusters, rules)
95
def process_split_string(s):
    """Turn a test string into a list of code point lists.

    s is split on whitespace.  Each ÷ or × token closes the list being
    collected (even if it is empty) and starts a new one; every other
    token is parsed as a hexadecimal code point.  A trailing list is
    kept only when it is non-empty.

    Returns [] as soon as unicode.is_surrogate() reports a code point as
    a surrogate, so an empty result also signals "skip this test case".
    """
    groups = []
    current = []

    for token in s.split():
        if token in ('÷', '×'):
            groups.append(current)
            current = []
        else:
            codepoint = int(token, 16)
            if unicode.is_surrogate(codepoint):
                return []
            current.append(codepoint)

    if current:
        groups.append(current)

    return groups
119
def showfun(x):
    """Format one test tuple as the text of a Rust tuple literal.

    x[0] is a list of code points rendered as a single string literal.
    Every following item of x is a list of code point lists and is
    rendered as a borrowed slice of string literals.  Code points are
    written as braced lowercase-hex unicode escapes.
    """
    def escaped(codepoints):
        # concatenated escapes for a run of code points
        return ''.join("\\u{%x}" % cp for cp in codepoints)

    slices = [
        ','.join('"' + escaped(piece) + '"' for piece in group)
        for group in x[1:]
    ]
    return '("' + escaped(x[0]) + '",&[' + '],&['.join(slices) + '])'
141
def create_grapheme_data(f):
    """Write the grapheme cluster test tables to the open file f.

    The test data is loaded with rules 9.1 and 9.2 treated as splits,
    which yields the non-extended clusters.  The extended clusters are
    then rebuilt by merging across exactly those boundaries.  Cases where
    both clusterings agree go to TEST_SAME; the others go to TEST_DIFF
    with both clusterings (extended first).
    """
    # rules 9.1 and 9.2 are for extended graphemes only
    optsplits = ['9.1','9.2']
    cases = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

    test_same = []
    test_diff = []

    for (clusters, rules) in cases:
        allchars = [cp for cluster in clusters for cp in cluster]

        # rules[k] describes the boundary in front of clusters[k + 1]
        extgraphs = []
        pending = list(clusters[0])
        for rule, cluster in zip(rules, clusters[1:]):
            if rule not in optsplits:
                # a real boundary: close the current extended cluster
                extgraphs.append(pending)
                pending = []
            pending.extend(cluster)

        # these are the extended grapheme clusters
        extgraphs.append(pending)

        if extgraphs != clusters:
            test_diff.append((allchars, extgraphs, clusters))
        else:
            test_same.append((allchars, clusters))

    stype = "&'static [(&'static str, &'static [&'static str])]"
    dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
    f.write("    // official Unicode test data\n")
    f.write("    // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
    unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
    unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
178
def create_words_data(f):
    """Write the word break test table (TEST_WORD) to the open file f.

    Each table entry pairs the flattened code points of a test case with
    its list of segments as loaded from auxiliary/WordBreakTest.txt.
    """
    # the rule numbers returned alongside the segments are not needed here
    cases = [
        ([cp for segment in segments for cp in segment], segments)
        for (segments, _rules) in load_test_data("auxiliary/WordBreakTest.txt")
    ]

    table_type = "&'static [(&'static str, &'static [&'static str])]"
    f.write("    // official Unicode test data\n")
    f.write("    // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
    unicode.emit_table(f, "TEST_WORD", cases, table_type, True, showfun, True)
192
def create_sentence_data(f):
    """Write the sentence break test table (TEST_SENTENCE) to the open file f.

    Each table entry pairs the flattened code points of a test case with
    its list of segments as loaded from auxiliary/SentenceBreakTest.txt.
    """
    # the rule numbers returned alongside the segments are not needed here
    cases = [
        ([cp for segment in segments for cp in segment], segments)
        for (segments, _rules) in load_test_data("auxiliary/SentenceBreakTest.txt")
    ]

    table_type = "&'static [(&'static str, &'static [&'static str])]"
    f.write("    // official Unicode test data\n")
    f.write("    // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
    unicode.emit_table(f, "TEST_SENTENCE", cases, table_type, True, showfun, True)
206
if __name__ == "__main__":
    # Regenerate testdata.rs: the preamble first, then one section per
    # break-test file, in this fixed order.
    with open("testdata.rs", "w") as out:
        out.write(unicode.preamble)
        for emit_section in (create_grapheme_data,
                             create_words_data,
                             create_sentence_data):
            emit_section(out)
213