1# Copyright 2008 The RE2 Authors.  All Rights Reserved.
2# Use of this source code is governed by a BSD-style
3# license that can be found in the LICENSE file.
4
5"""Parser for Unicode data files (as distributed by unicode.org)."""
6
7import os
8import re
9import urllib2
10
# Directory or URL where Unicode tables reside.  Used as the default
# unicode_dir argument of CaseGroups, Scripts and Categories.
_UNICODE_DIR = "http://www.unicode.org/Public/6.0.0/ucd"

# Largest valid Unicode code value.  _UInt and _UStr raise InputError
# for anything above it.
_RUNE_MAX = 0x10FFFF
16
17
class Error(Exception):
  """Root of this module's exception hierarchy; InputError derives from it."""
20
21
class InputError(Error):
  """Signals invalid input: a bad code value, range, or table line."""
24
25
def _UInt(s):
  """Converts string to Unicode code point ('263A' => 0x263a).

  Only strings made of 4 to 6 plain hexadecimal digits are accepted.
  The digits are checked explicitly because int(s, 16) on its own also
  tolerates a sign, a '0x' prefix and surrounding white space, so
  strings such as '0x41', '+0041' or '-0000' would otherwise be
  returned as if they were valid code values.

  Args:
    s: string to convert

  Returns:
    Unicode code point

  Raises:
    InputError: the string is not a valid Unicode value.
  """

  # \Z (not $) so that a trailing newline is rejected as well.
  if re.match(r"[0-9A-Fa-f]{4,6}\Z", s) is not None:
    v = int(s, 16)
    # Six hex digits can still exceed the largest code value.
    if v <= _RUNE_MAX:
      return v
  raise InputError("invalid Unicode value %s" % (s,))
46
47
def _URange(s):
  """Expands a code value or 'lo..hi' range string into its code points.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

  Args:
    s: a single hex code ('0041') or an inclusive range ('0041..005A')

  Returns:
    the code points covered, in increasing order

  Raises:
    InputError: a code value is invalid, the string contains more than
      one '..' separator, or the low end is not below the high end.
  """
  ends = s.split("..")
  if len(ends) > 2:
    raise InputError("invalid Unicode range %s" % (s,))
  if len(ends) < 2:
    # No '..' at all: a single code value.
    return [_UInt(ends[0])]

  lo, hi = _UInt(ends[0]), _UInt(ends[1])
  if hi <= lo:
    raise InputError("invalid Unicode range %s" % (s,))
  return range(lo, hi + 1)
72
73
def _UStr(v):
  """Formats a Unicode code point as an upper-case hex literal.

    0x263a => '0x263A'.

  The result is zero-padded to at least four hex digits.

  Args:
    v: code point to format

  Returns:
    the '0x'-prefixed hex string

  Raises:
    InputError: v is negative or larger than _RUNE_MAX.
  """
  out_of_range = v < 0 or v > _RUNE_MAX
  if out_of_range:
    raise InputError("invalid Unicode value %s" % (v,))
  text = "0x%04X" % (v,)
  return text
91
92
93def _ParseContinue(s):
94  """Parses a Unicode continuation field.
95
96  These are of the form '<Name, First>' or '<Name, Last>'.
97  Instead of giving an explicit range in a single table entry,
98  some Unicode tables use two entries, one for the first
99  code value in the range and one for the last.
100  The first entry's description is '<Name, First>' instead of 'Name'
101  and the second is '<Name, Last>'.
102
103    '<Name, First>' => ('Name', 'First')
104    '<Name, Last>' => ('Name', 'Last')
105    'Anything else' => ('Anything else', None)
106
107  Args:
108    s: continuation field string
109
110  Returns:
111    pair: name and ('First', 'Last', or None)
112  """
113
114  match = re.match("<(.*), (First|Last)>", s)
115  if match is not None:
116    return match.groups()
117  return (s, None)
118
119
def ReadUnicodeTable(filename, nfields, doline):
  """Generic Unicode table text file reader.

  The reader takes care of stripping out comments and also
  parsing the two different ways that the Unicode tables specify
  code ranges (using the .. notation and splitting the range across
  multiple lines).

  Each non-comment line in the table is expected to have the given
  number of fields.  The first field is known to be the Unicode value
  and the second field its description.

  The reader calls doline(codes, fields) for each entry in the table.
  For a range split across a First and a Last line, doline is called
  once, with fields[0] rewritten to 'XXXX..YYYY' and fields[1] set to
  the plain name.  If parsing a line or doline raises an exception,
  the reader prints that exception, prefixed with the file name and
  line number, and re-raises it immediately; the rest of the file is
  not processed.

  Arguments:
    filename: the Unicode data file to read, or a file-like object.
      A string starting with http:// or https:// is fetched as a URL,
      any other string is opened as a local file; both are closed
      before returning.  A file-like object (anything that iterates
      over lines) is used as is and left open for the caller.
    nfields: the number of expected fields per line in that file.
    doline: the function to call for each table entry.

  Raises:
    InputError: nfields is invalid (must be >= 2), a line is
      malformed, or the file ends between a First and a Last line.
  """

  if nfields < 2:
    raise InputError("invalid number of fields %d" % (nfields,))

  # Only a source opened here is ours to close; a file-like object
  # handed in by the caller stays open for the caller to manage.
  opened_here = isinstance(filename, str)
  if opened_here:
    if filename.startswith(("http://", "https://")):
      fil = urllib2.urlopen(filename)
    else:
      fil = open(filename, "r")
  else:
    fil = filename

  first = None        # first code in multiline range
  expect_last = None  # tag expected for "Last" line in multiline range
  lineno = 0          # current line number
  try:
    for line in fil:
      lineno += 1
      try:
        # Chop # comments and white space; ignore empty lines.
        sharp = line.find("#")
        if sharp >= 0:
          line = line[:sharp]
        line = line.strip()
        if not line:
          continue

        # Split fields on ";", chop more white space.
        # Must have the expected number of fields.
        fields = [s.strip() for s in line.split(";")]
        if len(fields) != nfields:
          raise InputError("wrong number of fields %d %d - %s" %
                           (len(fields), nfields, line))

        # The Unicode text files have two different ways
        # to list a Unicode range.  Either the first field is
        # itself a range (0000..FFFF), or the range is split
        # across two lines, with the second field noting
        # the continuation.
        codes = _URange(fields[0])
        (name, cont) = _ParseContinue(fields[1])

        if expect_last is not None:
          # If the last line gave the First code in a range,
          # this one had better give the Last one.
          if (len(codes) != 1 or codes[0] <= first or
              cont != "Last" or name != expect_last):
            raise InputError("expected Last line for %s" %
                             (expect_last,))
          codes = range(first, codes[0] + 1)
          first = None
          expect_last = None
          fields[0] = "%04X..%04X" % (codes[0], codes[-1])
          fields[1] = name
        elif cont == "First":
          # Otherwise, if this is the First code in a range,
          # remember it and go to the next line.
          if len(codes) != 1:
            raise InputError("bad First line: range given")
          expect_last = name
          first = codes[0]
          continue

        doline(codes, fields)

      except Exception as e:
        # Report where the failure happened, then let it propagate.
        print("%s:%d: %s" % (filename, lineno, e))
        raise

    if expect_last is not None:
      raise InputError("expected Last line for %s; got EOF" %
                       (expect_last,))
  finally:
    if opened_here:
      fil.close()
217
218
def CaseGroups(unicode_dir=_UNICODE_DIR):
  """Collects the code points that are equivalent under case folding.

  Reads CaseFolding.txt from unicode_dir and uses only the entries
  whose second field is "C" or "S".  Every such entry adds its code
  to the group of the code point it folds to.

  Each group is a sorted list of code points,
  and the list of groups is sorted by first code point
  in the group.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    pair (togroup, groups): togroup is a dict mapping each folded
    code point to its group; groups is the sorted list of those
    same group lists.
  """

  # Dict mapping folded (lowercase) code point to its group.
  togroup = {}

  def DoLine(codes, fields):
    """Adds the codes of one CaseFolding.txt line to their group."""
    (_, foldtype, lower, _) = fields
    if foldtype in ("C", "S"):
      folded = _UInt(lower)
      if folded not in togroup:
        # A group always starts with the code point folded to.
        togroup[folded] = [folded]
      togroup[folded].extend(codes)

  path = unicode_dir+"/CaseFolding.txt"
  ReadUnicodeTable(path, 4, DoLine)

  # Sort inside each group first, then order the groups themselves.
  for group in togroup.values():
    group.sort()
  groups = sorted(togroup.values())
  return togroup, groups
251
252
def Scripts(unicode_dir=_UNICODE_DIR):
  """Builds the script name => code list table from Scripts.txt.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping script names to code lists; codes appear in the
    order in which the file lists them
  """

  by_script = {}

  def DoLine(codes, fields):
    """Appends the codes of one Scripts.txt line to its script."""
    _unused, script = fields
    if script not in by_script:
      by_script[script] = []
    by_script[script].extend(codes)

  path = unicode_dir+"/Scripts.txt"
  ReadUnicodeTable(path, 2, DoLine)
  return by_script
272
273
def Categories(unicode_dir=_UNICODE_DIR):
  """Builds the category name => code list table from UnicodeData.txt.

  Codes are filed under the category named in the third field of
  each line and, when that name is longer than one character, also
  under its first character (so codes in Lu are listed in L too).

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping category names to code lists
  """

  categories = {}

  def DoLine(codes, fields):
    """Files the codes of one UnicodeData.txt line under its categories."""
    category = fields[2]
    keys = [category]
    if len(category) > 1:
      # Add codes from Lu into L, etc.
      keys.append(category[0])
    for key in keys:
      if key not in categories:
        categories[key] = []
      categories[key].extend(codes)

  path = unicode_dir+"/UnicodeData.txt"
  ReadUnicodeTable(path, 15, DoLine)
  return categories
297
298