1# Copyright 2008 The RE2 Authors.  All Rights Reserved.
2# Use of this source code is governed by a BSD-style
3# license that can be found in the LICENSE file.
4
5"""Parser for Unicode data files (as distributed by unicode.org)."""
6
7from __future__ import absolute_import
8from __future__ import division
9from __future__ import print_function
10
11import os
12import re
13from six.moves import urllib
14
# Directory or https:// URL where the Unicode tables reside.  Used as the
# default unicode_dir argument of CaseGroups, Scripts and Categories.
_UNICODE_DIR = "https://www.unicode.org/Public/13.0.0/ucd"

# Largest valid Unicode code value; _UInt and _UStr reject anything larger.
_RUNE_MAX = 0x10FFFF
20
21
class Error(Exception):
  """Root of this module's exception hierarchy."""
24
25
class InputError(Error):
  """Raised when input (a code value, range, or table line) is invalid."""
28
29
def _UInt(s):
  """Parses a hexadecimal string into a Unicode code point.

    '263A' => 0x263a

  Args:
    s: string to convert; must be 4 to 6 characters long

  Returns:
    the code point, as an integer

  Raises:
    InputError: s has the wrong length, cannot be parsed as base 16,
      or denotes a value outside 0.._RUNE_MAX.
  """
  try:
    value = int(s, 16)
  except ValueError:
    # Unparseable text: use a sentinel that fails the range check below,
    # so every kind of bad input is reported through the same InputError.
    value = -1

  length_ok = 4 <= len(s) <= 6
  if not (length_ok and 0 <= value <= _RUNE_MAX):
    raise InputError("invalid Unicode value %s" % (s,))
  return value
50
51
def _URange(s):
  """Converts string to a list of Unicode code points.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

  Args:
    s: string to convert: either a single code value ('0041') or an
      inclusive range written 'LO..HI', where LO must be strictly less
      than HI.

  Returns:
    list of the code points denoted by s, in ascending order.  The result
    is always a real list (never a lazy range object), so both input
    forms yield the same type on Python 2 and Python 3.

  Raises:
    InputError: the string is not a valid Unicode range.
  """
  a = s.split("..")
  if len(a) == 1:
    return [_UInt(a[0])]
  if len(a) == 2:
    lo = _UInt(a[0])
    hi = _UInt(a[1])
    if lo < hi:
      # range() is lazy on Python 3; materialize it so callers receive
      # the documented list, matching the single-value branch above.
      return list(range(lo, hi + 1))
  # Reached for more than one '..' separator or when LO >= HI.
  raise InputError("invalid Unicode range %s" % (s,))
76
77
def _UStr(v):
  """Formats a Unicode code point as a hexadecimal literal.

    0x263a => '0x263A'.

  Args:
    v: code point to convert

  Returns:
    '0x' followed by the value in upper-case hex, zero-padded to at
    least four digits

  Raises:
    InputError: the argument is not a valid Unicode value.
  """
  out_of_range = v < 0 or v > _RUNE_MAX
  if out_of_range:
    raise InputError("invalid Unicode value %s" % (v,))
  hex_digits = "%04X" % (v,)
  return "0x" + hex_digits
95
96
97def _ParseContinue(s):
98  """Parses a Unicode continuation field.
99
100  These are of the form '<Name, First>' or '<Name, Last>'.
101  Instead of giving an explicit range in a single table entry,
102  some Unicode tables use two entries, one for the first
103  code value in the range and one for the last.
104  The first entry's description is '<Name, First>' instead of 'Name'
105  and the second is '<Name, Last>'.
106
107    '<Name, First>' => ('Name', 'First')
108    '<Name, Last>' => ('Name', 'Last')
109    'Anything else' => ('Anything else', None)
110
111  Args:
112    s: continuation field string
113
114  Returns:
115    pair: name and ('First', 'Last', or None)
116  """
117
118  match = re.match("<(.*), (First|Last)>", s)
119  if match is not None:
120    return match.groups()
121  return (s, None)
122
123
def ReadUnicodeTable(filename, nfields, doline):
  """Generic Unicode table text file reader.

  The reader takes care of stripping out comments and also
  parsing the two different ways that the Unicode tables specify
  code ranges (using the .. notation and splitting the range across
  multiple lines).

  Each non-comment line in the table is expected to have the given
  number of fields.  The first field is known to be the Unicode value
  and the second field its description.

  The reader calls doline(codes, fields) for each entry in the table,
  where codes is the sequence of code points the entry covers and
  fields is the list of stripped field strings.  For a range split
  across a First/Last pair of lines, doline is called once, with
  fields[0] rewritten to 'LO..HI' and fields[1] to the bare name.

  If parsing a line or the doline call raises an exception, the reader
  prints that exception, prefixed with the file name and line number,
  and re-raises it immediately; the rest of the file is not processed.

  Arguments:
    filename: the Unicode data file to read: a local path, an https://
      URL, or a file-like object that iterates over lines (either bytes
      or text).  A file or URL opened by this function is closed before
      it returns; a file-like object passed in is left open.
    nfields: the number of expected fields per line in that file.
    doline: the function to call for each table entry.

  Raises:
    InputError: nfields is invalid (must be >= 2), a line is malformed,
      or the file ends in the middle of a First/Last range.
  """

  if nfields < 2:
    raise InputError("invalid number of fields %d" % (nfields,))

  # Only a source opened here is closed here; a caller-supplied file-like
  # object remains the caller's responsibility.
  opened_here = isinstance(filename, str)
  if opened_here:
    if filename.startswith("https://"):
      fil = urllib.request.urlopen(filename)
    else:
      fil = open(filename, "rb")
  else:
    fil = filename

  try:
    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    for line in fil:
      lineno += 1
      try:
        # Sources opened above yield bytes.  A text-mode file-like object
        # already yields text, which has no decode() on Python 3.
        if isinstance(line, bytes):
          line = line.decode("latin1")

        # Chop # comments and white space; ignore empty lines.
        sharp = line.find("#")
        if sharp >= 0:
          line = line[:sharp]
        line = line.strip()
        if not line:
          continue

        # Split fields on ";", chop more white space.
        # Must have the expected number of fields.
        fields = [s.strip() for s in line.split(";")]
        if len(fields) != nfields:
          raise InputError("wrong number of fields %d %d - %s" %
                           (len(fields), nfields, line))

        # The Unicode text files have two different ways
        # to list a Unicode range.  Either the first field is
        # itself a range (0000..FFFF), or the range is split
        # across two lines, with the second field noting
        # the continuation.
        codes = _URange(fields[0])
        (name, cont) = _ParseContinue(fields[1])

        if expect_last is not None:
          # If the last line gave the First code in a range,
          # this one had better give the Last one.
          if (len(codes) != 1 or codes[0] <= first or
              cont != "Last" or name != expect_last):
            raise InputError("expected Last line for %s" %
                             (expect_last,))
          # Materialize the range so doline gets a list on Python 3 too.
          codes = list(range(first, codes[0] + 1))
          first = None
          expect_last = None
          fields[0] = "%04X..%04X" % (codes[0], codes[-1])
          fields[1] = name
        elif cont == "First":
          # Otherwise, if this is the First code in a range,
          # remember it and go to the next line.
          if len(codes) != 1:
            raise InputError("bad First line: range given")
          expect_last = name
          first = codes[0]
          continue

        doline(codes, fields)

      except Exception as e:
        # Report where the failure happened, then let it propagate.
        print("%s:%d: %s" % (filename, lineno, e))
        raise

    if expect_last is not None:
      raise InputError("expected Last line for %s; got EOF" %
                       (expect_last,))
  finally:
    if opened_here:
      fil.close()
223
224
def CaseGroups(unicode_dir=_UNICODE_DIR):
  """Computes groups of Unicode code points equivalent under case folding.

  Reads CaseFolding.txt from unicode_dir and uses only its entries of
  fold type 'C' or 'S'; entries of any other type are ignored.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    pair (togroup, groups): togroup is a dict mapping each folded code
    point to its group, and groups is the list of those same groups.
    Each group is a sorted list of code points, and the list of groups
    is itself sorted (so ordered by first code point in the group).
  """

  # Maps the folded (lowercase) code point to its fold-equivalent group.
  fold_groups = {}

  def _OnLine(codes, fields):
    """Handles one CaseFolding.txt entry, updating fold_groups."""
    (_code, status, mapping, _trailing) = fields
    if status not in ("C", "S"):
      return
    target = _UInt(mapping)
    group = fold_groups.get(target)
    if group is None:
      # A new group starts out holding the folded code point itself.
      group = fold_groups[target] = [target]
    group.extend(codes)

  table_path = unicode_dir + "/CaseFolding.txt"
  ReadUnicodeTable(table_path, 4, _OnLine)

  for members in fold_groups.values():
    members.sort()
  ordered = sorted(fold_groups.values())
  return fold_groups, ordered
257
258
def Scripts(unicode_dir=_UNICODE_DIR):
  """Builds the script-name to code-point table from Scripts.txt.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping each script name to the list of code points listed
    for it, in the order they appear in the file
  """

  by_script = {}

  def _OnLine(codes, fields):
    """Handles one Scripts.txt entry, updating by_script."""
    (_code, script) = fields
    if script not in by_script:
      by_script[script] = []
    by_script[script].extend(codes)

  table_path = unicode_dir + "/Scripts.txt"
  ReadUnicodeTable(table_path, 2, _OnLine)
  return by_script
278
279
def Categories(unicode_dir=_UNICODE_DIR):
  """Builds the category-name to code-point table from UnicodeData.txt.

  Every code point is filed under the category named in its entry
  (e.g. 'Lu').  When that name is longer than one character, the code
  point is also filed under its first letter (e.g. 'L').

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping category names to code lists
  """

  by_category = {}

  def _OnLine(codes, fields):
    """Handles one UnicodeData.txt entry, updating by_category."""
    full = fields[2]
    keys = [full]
    if len(full) > 1:
      # Add codes from Lu into L, etc.
      keys.append(full[0])
    for key in keys:
      by_category.setdefault(key, []).extend(codes)

  table_path = unicode_dir + "/UnicodeData.txt"
  ReadUnicodeTable(table_path, 15, _OnLine)
  return by_category
303
304