1#!/usr/bin/env python
2"""
3Tools to parse data files from the Unicode Character Database.
4"""
5
6from __future__ import print_function, absolute_import, division
7from __future__ import unicode_literals
8
9try:
10    from urllib.request import urlopen
11except ImportError:
12    from urllib2 import urlopen
13from contextlib import closing, contextmanager
14import re
15from codecs import iterdecode
16import logging
17import os
18from io import open
19from os.path import abspath, dirname, join as pjoin, pardir, sep
20
21
try:  # pragma: no cover
    unicode
except NameError:
    # No 'unicode' builtin is available (Python 3): alias it to 'str' so the
    # isinstance() check in parse_range_properties works either way.
    unicode = str


# base URL that open_unidata_file() prepends to the requested file name
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
# written as the "License:" line in the header of the generated modules
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"

# by default save output files to ../Lib/fontTools/unicodedata/
# (the trailing separator lets build_ranges append the module name directly)
UNIDATA_PATH = pjoin(abspath(dirname(__file__)), pardir,
                     "Lib", "fontTools", "unicodedata") + sep

# encoding declaration written as the first line of the generated modules
SRC_ENCODING = "# -*- coding: utf-8 -*-\n"

# notice written in the header of the generated modules
NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n"

# highest Unicode code point; parse_range_properties extends its ranges up
# to and including this value
MAX_UNICODE = 0x10FFFF

# getLogger() without a name returns the root logger, which main()
# configures through logging.basicConfig()
log = logging.getLogger()
42
43
@contextmanager
def open_unidata_file(filename):
    """Yield the decoded content of a file hosted under UNIDATA_URL.

    The response for 'UNIDATA_URL + filename' is wrapped with
    codecs.iterdecode so that iterating over the yielded object produces
    UTF-8 decoded text. The response is closed when the 'with' block using
    this context manager is left.
    """
    response = urlopen(UNIDATA_URL + filename)
    with closing(response) as stream:
        yield iterdecode(stream, encoding="utf-8")
50
51
def parse_unidata_header(infile):
    """Read the top header of data files, until the first line
    that does not start with '#'.

    'infile' is an iterator of text lines (an open file or a decoded
    stream). The header lines are returned joined in a single string, with
    their line endings preserved.

    The first line that is not part of the header is consumed from 'infile'
    and discarded, so the caller resumes reading from the line after it.

    If 'infile' runs out before a non-header line is found (empty or
    comment-only input), the header collected so far is returned instead of
    letting StopIteration escape.
    """
    header = []
    for line in infile:
        if not line.startswith("#"):
            # end of header: this line is dropped on purpose (see docstring)
            break
        header.append(line)
    return "".join(header)
62
63
def parse_range_properties(infile, default=None, is_set=False):
    """Parse a Unicode data file containing a column with one character or
    a range of characters, and another column containing a property value
    separated by a semicolon. Comments after '#' are ignored.

    Lines that do not start with an (uppercase) hexadecimal code point, such
    as comments and blank lines, are skipped.

    If the ranges defined in the data file are not continuous, assign the
    'default' property to the unassigned codepoints.

    If 'is_set' is true, each property value is split on whitespace and
    stored as a set of strings; gaps still receive 'default' unchanged.

    Return a list of (start, end, property_name) tuples, sorted by code
    point, covering 0..MAX_UNICODE without gaps, where adjacent ranges
    with equal values have been merged.

    Raise AssertionError if the ranges in the data file overlap, are
    inverted, or extend beyond MAX_UNICODE.
    """
    line_regex = re.compile(
        r"^"
        r"([0-9A-F]{4,6})"  # first character code
        r"(?:\.\.([0-9A-F]{4,6}))?"  # optional second character code
        r"\s*;\s*"
        r"([^#]+)")  # everything up to the potential comment

    ranges = []
    for line in infile:
        match = line_regex.match(line)
        if not match:
            continue

        first, last, data = match.groups()
        first = int(first, 16)
        # a single code point is a range of length one
        last = first if last is None else int(last, 16)
        ranges.append((first, last, str(data.rstrip())))

    ranges.sort()

    if isinstance(default, unicode):
        default = str(default)

    # fill the gaps between explicitly defined ranges
    last_end = -1
    full_ranges = []
    for start, end, value in ranges:
        assert last_end < start, "overlapping ranges at %04X" % start
        assert start <= end, "inverted range %04X..%04X" % (start, end)
        # without this check a code point above MAX_UNICODE would make the
        # trailing filler range below run backwards and still pass the
        # final coverage checks
        assert end <= MAX_UNICODE, "code point %04X out of range" % end
        if start - last_end > 1:
            full_ranges.append((last_end + 1, start - 1, default))
        if is_set:
            value = set(value.split())
        full_ranges.append((start, end, value))
        last_end = end
    if last_end != MAX_UNICODE:
        full_ranges.append((last_end + 1, MAX_UNICODE, default))

    # reduce total number of ranges by combining continuous ones: a merged
    # range is closed when a range with a different value starts
    remaining = iter(full_ranges)
    run_start, _, run_value = next(remaining)
    merged_ranges = []
    for start, _, value in remaining:
        if value != run_value:
            merged_ranges.append((run_start, start - 1, run_value))
            run_start, run_value = start, value
    merged_ranges.append((run_start, MAX_UNICODE, run_value))

    # make sure that the ranges cover the full unicode repertoire
    assert merged_ranges[0][0] == 0
    for current, following in zip(merged_ranges, merged_ranges[1:]):
        assert current[1] + 1 == following[0]
    assert merged_ranges[-1][1] == MAX_UNICODE

    return merged_ranges
134
135
def parse_semicolon_separated_data(infile):
    """Parse a Unicode data file where each line contains a list of values
    separated by semicolons (e.g. "PropertyValueAliases.txt").

    Everything from the first '#' to the end of a line is treated as a
    comment; lines left empty after removing it are skipped. The number of
    values may vary from line to line.

    Returns a list of lists, each containing the stripped values as strings.
    """
    rows = []
    for raw_line in infile:
        content, _, _ = raw_line.partition('#')  # drop the comment
        content = content.strip()
        if content:
            rows.append([str(cell.strip()) for cell in content.split(';')])
    return rows
151
152
153def _set_repr(value):
154    return 'None' if value is None else "{{{}}}".format(
155        ", ".join(repr(v) for v in sorted(value)))
156
157
def build_ranges(filename, local_ucd=None, output_path=None,
                 default=None, is_set=False, aliases=None):
    """Parse the property ranges of the 'filename' UCD data file and write
    them to a Python module as two parallel lists, RANGES (first code point
    of each range) and VALUES (property value of each range).

    The data file is downloaded from UNIDATA_URL, unless 'local_ucd' is
    given, in which case it is read from that local directory.

    The module is saved to 'output_path'; when that is not provided, it is
    saved under UNIDATA_PATH using the data file's base name plus '.py'.

    'default' and 'is_set' are passed on to parse_range_properties. With
    'is_set', the values are written as set literals (or None).

    'aliases' is an optional mapping of property codes (short names) to long
    name aliases (list of strings, with the first item being the preferred
    alias). When this is provided, the property values are written using the
    short notation, and an additional 'NAMES' dict mapping each code to its
    preferred alias is written to the output module.
    """
    module_file = os.path.splitext(filename)[0] + ".py"
    destination = output_path or UNIDATA_PATH + module_file

    if not local_ucd:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        source = open_unidata_file(filename)
    else:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        source = open(pjoin(local_ucd, filename), "r", encoding="utf-8")

    with source as lines:
        header = parse_unidata_header(lines)
        ranges = parse_range_properties(lines, default=default, is_set=is_set)

    if aliases:
        # look up the short code from the normalized preferred long name
        short_codes = {normalize(names[0]): code
                       for code, names in aliases.items()}
        column_width = 6  # 4-letter tags plus two quotes for repr
    else:
        longest = max(len(repr(entry[2])) for entry in ranges)
        column_width = min(56, longest)

    with open(destination, "w", encoding="utf-8") as out:
        write = out.write

        # file header
        write(SRC_ENCODING)
        write("#\n")
        write(NOTICE)
        write("# Source: {}{}\n".format(UNIDATA_URL, filename))
        write("# License: {}\n".format(UNIDATA_LICENSE_URL))
        write("#\n")
        write(header+"\n\n")

        # first code point of each range; end and value go in the comment
        write("RANGES = [\n")
        for first, last, value in ranges:
            shown = _set_repr(value) if is_set else value
            write("    0x{:0>4X},  # .. 0x{:0>4X} ; {}\n".format(
                first, last, shown))
        write("]\n")

        write("\n")

        # value of each range, aligned in a column followed by a comment
        write("VALUES = [\n")
        for first, last, value in ranges:
            comment = "# {:0>4X}..{:0>4X}".format(first, last)
            if is_set:
                cell = "{},".format(_set_repr(value))
            elif aliases:
                # append long name to comment and use the short code
                comment += " ; {}".format(value)
                cell = "{!r},".format(short_codes[normalize(value)])
            else:
                cell = "{!r},".format(value)
            write("    {}  {}\n".format(cell.ljust(column_width+1), comment))
        write("]\n")

        if aliases:
            write("\n")
            write("NAMES = {\n")
            for code, names in sorted(aliases.items()):
                # we only write the first preferred alias
                write("    {!r}: {!r},\n".format(code, names[0]))
            write("}\n")

    log.info("saved new file: '%s'", os.path.normpath(destination))
234
235
# one or more of the separator characters ignored by normalize()
_normalize_re = re.compile(r"[-_ ]+")


def normalize(string):
    """Return 'string' lowercased, with every space, '-' and '_' removed,
    for loose matching of property value names.
    """
    without_separators = _normalize_re.sub("", string)
    return without_separators.lower()
241
242
def parse_property_value_aliases(property_tag, local_ucd=None):
    """Parse 'PropertyValueAliases.txt' and return the value aliases of the
    property identified by 'property_tag'.

    The result is a dictionary keyed by the second field of each matching
    line (the short value code), whose values are the lists of remaining
    fields (the name aliases, as strings). Only lines whose first field is
    equal to 'property_tag' are considered.

    The data file is downloaded from UNIDATA_URL, unless 'local_ucd' is
    given, in which case it is read from that local directory.
    """
    filename = "PropertyValueAliases.txt"
    if not local_ucd:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        source = open_unidata_file(filename)
    else:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        source = open(pjoin(local_ucd, filename), "r", encoding="utf-8")

    with source as lines:
        # the header text is not used here; the call only reads past it
        parse_unidata_header(lines)
        records = parse_semicolon_separated_data(lines)

    aliases = {}
    for record in records:
        if record[0] == property_tag:
            aliases[record[1]] = record[2:]
    return aliases
267
268
def main():
    """Command-line entry point: regenerate the Blocks, Scripts and
    ScriptExtensions modules from the UCD data files.

    Options:
      --ucd-path  read the data files from this local folder instead of
                  downloading them
      -q/--quiet  only log warnings and above (default is INFO)
    """
    import argparse

    arg_parser = argparse.ArgumentParser(
        description="Generate fontTools.unicodedata from UCD data files")
    arg_parser.add_argument(
        '--ucd-path', help="Path to local folder containing UCD data files")
    arg_parser.add_argument('-q', '--quiet', action="store_true")
    args = arg_parser.parse_args()

    if args.quiet:
        verbosity = "WARNING"
    else:
        verbosity = "INFO"
    logging.basicConfig(level=verbosity, format="%(message)s")

    ucd_dir = args.ucd_path

    build_ranges("Blocks.txt", local_ucd=ucd_dir, default="No_Block")

    # script names are written as short codes, looked up via the 'sc' aliases
    build_ranges("Scripts.txt", local_ucd=ucd_dir, default="Unknown",
                 aliases=parse_property_value_aliases("sc", ucd_dir))

    build_ranges("ScriptExtensions.txt", local_ucd=ucd_dir, is_set=True)
289
290
# Run the generator when this file is executed as a script. main() has no
# return statement, so sys.exit() receives None and the exit status is zero
# unless an exception propagates.
if __name__ == "__main__":
    import sys
    sys.exit(main())
294