1#!/usr/bin/env python
2
3from __future__ import print_function, division, absolute_import
4
5import io, sys
6
# The script needs exactly four input files after its own name.
if len (sys.argv[1:]) != 4:
	usage_text = "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
	print (usage_text, file=sys.stderr)
	sys.exit (1)
10
# Blocks whose code points are filtered out of the merged data further down.
BLACKLISTED_BLOCKS = ["Thai", "Lao"]

# Open the four inputs in command-line order.  The handles are collected one
# by one so that the `finally` clause below closes whatever was opened, even
# if a later open or the parsing itself fails.
files = []
try:
	for x in sys.argv[1:]:
		files.append (io.open (x, encoding='utf-8'))

	# Keep the first two lines of every input except the third one
	# (UnicodeData.txt); they are echoed in the banner of the generated output.
	headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
	headers.append (["UnicodeData.txt does not have a header."])

	# data[i]   maps code point -> property value read from input i.
	# values[i] maps property value -> number of code points carrying it.
	data = [{} for f in files]
	values = [{} for f in files]
	for i, f in enumerate (files):
		for line in f:

			# Everything from '#' to the end of the line is a comment.
			j = line.find ('#')
			if j >= 0:
				line = line[:j]

			# A line without any ';' separator carries no record.
			fields = [x.strip () for x in line.split (';')]
			if len (fields) == 1:
				continue

			# First field: one hex code point or an inclusive "start..end" range.
			uu = fields[0].split ('..')
			start = int (uu[0], 16)
			if len (uu) == 1:
				end = start
			else:
				end = int (uu[1], 16)

			# The value is the second field, except for the third input
			# (UnicodeData.txt), where the third field is used instead.
			t = fields[1 if i != 2 else 2]

			for u in range (start, end + 1):
				data[i][u] = t
			values[i][t] = values[i].get (t, 0) + end - start + 1
finally:
	# Release the file handles; nothing below reads from them again.
	for f in files:
		f.close ()
43
# Fallback value for each of the four inputs, in input order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
for u in (0x034F, 0x2060, 0x20F0):
	data[0][u] = defaults[0]
# TODO https://github.com/roozbehp/unicode-data/issues/9    (U+11C44, U+11C45)
# TODO https://github.com/harfbuzz/harfbuzz/pull/1399       (U+111C8)
for u in (0x11C44, 0x11C45, 0x111C8):
	data[0][u] = 'Consonant_Placeholder'
# U+FE00..U+FE0F get the default value of the first input as well.
for u in range (0xFE00, 0xFE0F + 1):
	data[0][u] = defaults[0]

# Count one extra occurrence of each default value.
for counts, default in zip (values, defaults):
	counts[default] = counts.get (default, 0) + 1

# Merge data into one dict, keyed by code point; each entry starts out as a
# copy of `defaults` and gets slot i overwritten by the value from input i.
merged = {}
for index, table in enumerate (data):
	for u, v in table.items ():
		entry = merged.get (u)
		if entry is None:
			# Inputs 2 and 3 only annotate code points that inputs 0 or 1
			# already introduced; they never add new entries.
			if index >= 2:
				continue
			entry = merged[u] = list (defaults)
		entry[index] = v

# Drop every code point whose block (slot 3) is blacklisted.
data = {u: props for u, props in merged.items () if props[3] not in BLACKLISTED_BLOCKS}
del merged
num = len (data)
73
74
# Every property value name the script refers to by identifier.  The order is
# General_Category first, then Indic_Syllabic_Category, then
# Indic_Positional_Category.
property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs',
	'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
	'Mc', 'Me', 'Mn',
	'Nd', 'Nl', 'No',
	'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
	'Sc', 'Sk', 'Sm', 'So',
	'Zl', 'Zp', 'Zs',
	# Indic_Syllabic_Category
	'Other', 'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
	'Pure_Killer', 'Invisible_Stacker',
	'Vowel_Independent', 'Vowel_Dependent', 'Vowel',
	'Consonant_Placeholder', 'Consonant', 'Consonant_Dead',
	'Consonant_With_Stacker', 'Consonant_Prefixed',
	'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
	'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
	'Consonant_Head_Letter', 'Consonant_Initial_Postfixed',
	'Modifying_Letter', 'Tone_Letter', 'Tone_Mark',
	'Gemination_Mark', 'Cantillation_Mark', 'Register_Shifter',
	'Syllable_Modifier', 'Consonant_Killer',
	'Non_Joiner', 'Joiner', 'Number_Joiner',
	'Number', 'Brahmi_Joining_Number',
	# Indic_Positional_Category
	'Not_Applicable', 'Right', 'Left', 'Visual_Order_Left', 'Left_And_Right',
	'Top', 'Bottom',
	'Top_And_Bottom', 'Top_And_Right', 'Top_And_Left', 'Top_And_Left_And_Right',
	'Bottom_And_Left', 'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
]
134
# Python 3 has no `basestring`; fall back to `str` there.
try:
	basestring
except NameError:
	basestring = str

class PropertyValue(object):
	"""A named property value that compares equal to its own name.

	An instance is equal to a string holding the same name and to any other
	object whose `name` attribute matches, and it hashes like the name string,
	so instances and plain strings can be mixed in comparisons, `in` tests
	and dict lookups.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __repr__(self):
		return "%s(%r)" % (type(self).__name__, self.name)
	def __eq__(self, other):
		if isinstance(other, basestring):
			return self.name == other
		try:
			return self.name == other.name
		except AttributeError:
			# Not a string and no `name` attribute: let Python fall back to
			# its default comparison (unequal) instead of raising.
			return NotImplemented
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		# Must match hash(name) so that lookups keyed by strings find us.
		return hash(str(self))
151
# Create one PropertyValue per known name and publish each of them as a
# module-level variable, so the rest of the script can write e.g. `Top`.
property_values = {}

for name in property_names:
	value = PropertyValue(name)
	# Neither a duplicate name nor a clash with an existing global is allowed.
	assert not (value in property_values)
	assert not (value in globals())
	property_values[name] = value

globals().update(property_values)
160
161
def is_BASE(U, UISC, UGC):
	"""Predicate registered under 'B' in use_mapping.

	U is the code point, UISC its Indic_Syllabic_Category and UGC its
	General_Category.  Returns a bool.
	"""
	#SPEC-DRAFT Consonant_Placeholder is not part of this list,
	#SPEC-DRAFT Vowel_Independent is.
	if UISC in [Number, Consonant, Consonant_Head_Letter, Tone_Letter, Vowel_Independent]:
		return True
	# The remaining categories only count when the character is a letter (Lo).
	if not (UGC == Lo):
		return False
	letter_only = [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
		       Consonant_Subjoined, Vowel, Vowel_Dependent]
	return UISC in letter_only
def is_BASE_IND(U, UISC, UGC):
	"""Predicate registered under 'IND' in use_mapping; returns a bool."""
	#SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
	if UISC in [Consonant_Dead, Modifying_Letter]:
		return True
	# Po characters qualify too, apart from this explicit exclusion list.
	# SPEC-DRAFT-OUTDATED! U == 0x002D
	po_exceptions = [0x104B, 0x104E, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]
	return UGC == Po and U not in po_exceptions
def is_BASE_NUM(U, UISC, UGC):
	"""Predicate registered under 'N' in use_mapping: UISC is Brahmi_Joining_Number."""
	is_joining_number = (UISC == Brahmi_Joining_Number)
	return is_joining_number
def is_BASE_OTHER(U, UISC, UGC):
	"""Predicate registered under 'GB' in use_mapping; returns a bool."""
	#SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
	# Consonant_Placeholder always qualifies (#SPEC-DRAFT); otherwise only
	# the listed code points do.
	listed = [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
	return UISC == Consonant_Placeholder or U in listed
def is_CGJ(U, UISC, UGC):
	"""Predicate registered under 'CGJ' in use_mapping: true only for U+034F."""
	cgj = 0x034F
	return U == cgj
def is_CONS_FINAL(U, UISC, UGC):
	"""Predicate registered under 'F' in use_mapping; returns a bool."""
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	if UISC == Consonant_Initial_Postfixed:
		return True
	if UISC == Consonant_Succeeding_Repha:
		return True
	# Consonant_Final counts unless the character is a letter (Lo).
	return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UGC):
	"""Predicate registered under 'FM' in use_mapping: UISC is Syllable_Modifier."""
	#SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
	is_syllable_modifier = (UISC == Syllable_Modifier)
	return is_syllable_modifier
def is_CONS_MED(U, UISC, UGC):
	"""Predicate registered under 'M' in use_mapping: Consonant_Medial that is not Lo."""
	if not (UISC == Consonant_Medial):
		return False
	return UGC != Lo
def is_CONS_MOD(U, UISC, UGC):
	"""Predicate registered under 'CM' in use_mapping; returns a bool."""
	modifier_categories = [Nukta, Gemination_Mark, Consonant_Killer]
	return UISC in modifier_categories
def is_CONS_SUB(U, UISC, UGC):
	"""Predicate registered under 'SUB' in use_mapping: Consonant_Subjoined that is not Lo."""
	#SPEC-DRAFT return UISC == Consonant_Subjoined
	if not (UISC == Consonant_Subjoined):
		return False
	return UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UGC):
	"""Predicate registered under 'CS' in use_mapping: UISC is Consonant_With_Stacker."""
	has_stacker = (UISC == Consonant_With_Stacker)
	return has_stacker
def is_HALANT(U, UISC, UGC):
	"""Predicate registered under 'H' in use_mapping; returns a bool.

	True for Virama and Invisible_Stacker characters, except those that
	is_HALANT_OR_VOWEL_MODIFIER() claims.
	"""
	if not (UISC in [Virama, Invisible_Stacker]):
		return False
	return not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
	"""Predicate registered under 'HVM' in use_mapping: true for U+11046 and U+1134D only."""
	# https://github.com/harfbuzz/harfbuzz/issues/1102
	# https://github.com/harfbuzz/harfbuzz/issues/1379
	return U == 0x11046 or U == 0x1134D
def is_HALANT_NUM(U, UISC, UGC):
	"""Predicate registered under 'HN' in use_mapping: UISC is Number_Joiner."""
	is_number_joiner = (UISC == Number_Joiner)
	return is_number_joiner
def is_ZWNJ(U, UISC, UGC):
	"""Predicate registered under 'ZWNJ' in use_mapping: UISC is Non_Joiner."""
	is_non_joiner = (UISC == Non_Joiner)
	return is_non_joiner
def is_ZWJ(U, UISC, UGC):
	"""Predicate registered under 'ZWJ' in use_mapping: UISC is Joiner."""
	is_joiner = (UISC == Joiner)
	return is_joiner
def is_Word_Joiner(U, UISC, UGC):
	"""Predicate registered under 'WJ' in use_mapping: true only for U+2060."""
	word_joiner = 0x2060
	return U == word_joiner
def is_OTHER(U, UISC, UGC):
	"""Predicate registered under 'O' in use_mapping; returns a bool.

	True when UISC is Other and none of the code-point-based predicates
	(SM, CGJ, WJ, VS) claims the character.
	"""
	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
	if not (UISC == Other):
		return False
	claimed_elsewhere = (is_SYM_MOD(U, UISC, UGC)
			     or is_CGJ(U, UISC, UGC)
			     or is_Word_Joiner(U, UISC, UGC)
			     or is_VARIATION_SELECTOR(U, UISC, UGC))
	return not claimed_elsewhere
def is_Reserved(U, UISC, UGC):
	"""Predicate registered under 'Rsv' in use_mapping: General_Category is 'Cn'."""
	unassigned = 'Cn'
	return UGC == unassigned
def is_REPHA(U, UISC, UGC):
	"""Predicate registered under 'R' in use_mapping; returns a bool."""
	repha_categories = [Consonant_Preceding_Repha, Consonant_Prefixed]
	return UISC in repha_categories
def is_SYM(U, UISC, UGC):
	"""Predicate registered under 'S' in use_mapping; returns a bool.

	True for General_Category So or Sc, with U+25CC explicitly excluded.
	"""
	#SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
	# U+25CC is rejected before the category is looked at. #SPEC-DRAFT
	return U != 0x25CC and UGC in [So, Sc]
def is_SYM_MOD(U, UISC, UGC):
	"""Predicate registered under 'SM' in use_mapping: true for U+1B6B..U+1B73."""
	first, last = 0x1B6B, 0x1B73
	return U in range(first, last + 1)
def is_VARIATION_SELECTOR(U, UISC, UGC):
	"""Predicate registered under 'VS' in use_mapping: true for U+FE00..U+FE0F."""
	return U >= 0xFE00 and U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
	"""Predicate registered under 'V' in use_mapping; returns a bool.

	True for Pure_Killer, and for Vowel / Vowel_Dependent characters that
	are not Lo, with U+AA29 excluded.
	"""
	# https://github.com/roozbehp/unicode-data/issues/6
	if UISC == Pure_Killer:
		return True
	if UGC == Lo or U in [0xAA29]:
		return False
	return UISC in [Vowel, Vowel_Dependent]
def is_VOWEL_MOD(U, UISC, UGC):
	"""Predicate registered under 'VM' in use_mapping; returns a bool.

	True for the tone/cantillation/register/visarga categories, and for
	non-Lo characters that are Bindu or U+AA29.
	"""
	# https://github.com/roozbehp/unicode-data/issues/6
	if UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga]:
		return True
	if UGC == Lo:
		return False
	return UISC == Bindu or U in [0xAA29]
243
# USE category name -> predicate deciding membership.  map_to_use() requires
# that exactly one predicate accepts any given character.
use_mapping = {
	# Bases
	'B':    is_BASE,
	'IND':  is_BASE_IND,
	'N':    is_BASE_NUM,
	'GB':   is_BASE_OTHER,
	'CGJ':  is_CGJ,
	# Consonant-related marks
	'F':    is_CONS_FINAL,
	'FM':   is_CONS_FINAL_MOD,
	'M':    is_CONS_MED,
	'CM':   is_CONS_MOD,
	'SUB':  is_CONS_SUB,
	'CS':   is_CONS_WITH_STACKER,
	# Halants and joiners
	'H':    is_HALANT,
	'HVM':  is_HALANT_OR_VOWEL_MODIFIER,
	'HN':   is_HALANT_NUM,
	'ZWNJ': is_ZWNJ,
	'ZWJ':  is_ZWJ,
	'WJ':   is_Word_Joiner,
	# Everything else
	'O':    is_OTHER,
	'Rsv':  is_Reserved,
	'R':    is_REPHA,
	'S':    is_SYM,
	'SM':   is_SYM_MOD,
	'VS':   is_VARIATION_SELECTOR,
	'V':    is_VOWEL,
	'VM':   is_VOWEL_MOD,
}
271
# For the USE categories that are split by position: suffix -> the
# Indic_Positional_Category values mapped to that suffix.  Categories listed
# with None may carry a position but get no suffix appended.
use_positions = {
	'F': {'Abv': [Top],
	      'Blw': [Bottom],
	      'Pst': [Right]},
	'M': {'Abv': [Top],
	      'Blw': [Bottom, Bottom_And_Left],
	      'Pst': [Right],
	      'Pre': [Left]},
	'CM': {'Abv': [Top],
	       'Blw': [Bottom]},
	'V': {'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
	      'Blw': [Bottom, Overstruck, Bottom_And_Right],
	      'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	      'Pre': [Left]},
	'VM': {'Abv': [Top],
	       'Blw': [Bottom, Overstruck],
	       'Pst': [Right],
	       'Pre': [Left]},
	'SM': {'Abv': [Top],
	       'Blw': [Bottom]},
	'H': None,
	'HVM': None,
	'B': None,
	'FM': None,
	'SUB': None,
}
310
def map_to_use(data):
	"""Classify every code point in `data` into a USE category.

	`data` maps a code point U to a 4-item sequence
	(UISC, UIPC, UGC, UBlock).  For each entry the function first applies
	hard-coded overrides to UISC/UIPC, then picks the single key of
	use_mapping whose predicate accepts (U, UISC, UGC), applies further UIPC
	overrides, and finally appends the positional suffix from use_positions
	when that category is split by position.

	Returns a dict mapping U to (USE category name, UBlock).

	Raises AssertionError when a character matches zero or several
	predicates, when it carries a position its category has no entry for in
	use_positions, or when its position matches zero or several suffixes.

	The overrides are order-sensitive: later checks (e.g. the is_VOWEL calls)
	see the UISC value as modified by the earlier ones.
	"""
	out = {}
	items = use_mapping.items()
	for U,(UISC,UIPC,UGC,UBlock) in data.items():

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 8.0, but have UIPC
		if U == 0x17DD: UISC = Vowel_Dependent
		if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 11.0, but have UIPC
		if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
		if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark
		# Overrides to allow NFC order matching syllable
		# https://github.com/harfbuzz/harfbuzz/issues/1012
		if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC):
			if UIPC == Top:
				UIPC = Bottom

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/982
		# also  https://github.com/harfbuzz/harfbuzz/issues/1012
		# Chakma vowels have Top and Bottom swapped.
		if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC):
			if UIPC == Top:
				UIPC = Bottom
			elif UIPC == Bottom:
				UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/627
		# (both assignments on the line below belong to the `if`)
		if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED: UISC = Tone_Mark

		# TODO: https://github.com/harfbuzz/harfbuzz/issues/525
		if U == 0x1A7F: UISC = Consonant_Final; UIPC = Bottom

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/609
		if U == 0x20F0: UISC = Cantillation_Mark; UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/626
		if U == 0xA8B4: UISC = Consonant_Medial

		# TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
		if U == 0x11134: UISC = Gemination_Mark

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1399
		if U == 0x111C9: UISC = Consonant_Final

		# Exactly one predicate in use_mapping must accept the character;
		# its key becomes the USE category.
		values = [k for k,v in items if v(U,UISC,UGC)]
		assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
		USE = values[0]

		# Resolve Indic_Positional_Category

		# TODO: Not in Unicode 8.0 yet, but in spec.
		if U == 0x1B6C: UIPC = Bottom

		# TODO: These should die, but have UIPC in Unicode 8.0
		if U in [0x953, 0x954]: UIPC = Not_Applicable

		# TODO: In USE's override list but not in Unicode 11.0
		if U == 0x103C: UIPC = Left

		# TODO: These are not in USE's override list that we have, nor are they in Unicode 11.0
		if 0xA926 <= U <= 0xA92A: UIPC = Top
		if U == 0x111CA: UIPC = Bottom
		if U == 0x11300: UIPC = Top
		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		if U == 0x11302: UIPC = Top
		if U == 0x1133C: UIPC = Bottom
		if U == 0x1171E: UIPC = Left # Correct?!
		if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
		if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
		# https://github.com/roozbehp/unicode-data/issues/8
		if U == 0x0A51: UIPC = Bottom

		# A real position is only acceptable for categories that
		# use_positions knows about (even if only with a None entry).
		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

		# For position-split categories, exactly one suffix must list this
		# UIPC; it is appended to the category name (e.g. 'V' + 'Abv').
		pos_mapping = use_positions.get(USE, None)
		if pos_mapping:
			values = [k for k,v in pos_mapping.items() if v and UIPC in v]
			assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
			USE = USE + values[0]

		out[U] = (USE, UBlock)
	return out
401
# From here on `data` maps code point -> (USE category, block), and
# `defaults` is the entry used for code points that have no data.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Opening banner of the generated file.
banner_lines = (
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt",
	" *",
	" * on files with these headers:",
	" *",
)
for banner_line in banner_lines:
	print (banner_line)
# Echo the header lines captured from the input files.
for header in headers:
	for header_line in header:
		print (" * %s" % (header_line.strip()))
print (" */")
print ()
print ('#include "hb-ot-shape-complex-use.hh"')
print ()
420
# Running totals over all print_block() calls: cells emitted, cells backed by
# real data, and the name of the block most recently printed.
total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
	"""Print the table cells for code points start..end (inclusive).

	`block` is the block name, or a false value for filler cells; a header
	comment is printed when it differs from the previously printed block.
	`start` and `end + 1` must be multiples of 8.  Cells are taken from
	`data` (code point -> sequence whose first item is printed), falling
	back to the module-level `defaults`.  Updates the module-level
	counters `total`, `used` and `last_block`.
	"""
	global total, used, last_block
	if block and block != last_block:
		# Two blank lines, then a comment naming the block.
		print ("\n\n  /* %s */" % block)
		column = start % 16
		if column:
			# The block starts mid-row: pad up to its first cell.
			print (' ' * (20 + column * 6), end='')
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	occupied = 0
	for cp in range (start, end+1):
		if cp % 16 == 0:
			# New row, labelled with its first code point.
			print ("\n  /* %04X */" % cp, end='')
		if cp in data:
			occupied += 1
		cell = data.get (cp, defaults)
		print ("%6s," % cell[0], end='')

	total += end - start + 1
	used += occupied
	if block:
		last_block = block
448
# All classified code points in ascending order.
uu = sorted (data)

# State for the table-emission loop that follows.
starts, ends = [], []
last, num, offset = -100000, 0, 0

# Short aliases for the categories that carry no positional suffix ...
for k in sorted (use_mapping):
	if use_positions.get (k):
		continue
	print ("#define %s	USE_%s	/* %s */" % (k, k, use_mapping[k].__name__[3:]))
# ... and one alias per category/suffix combination for those that do.
for k in sorted (use_positions):
	suffixes = use_positions[k]
	if not suffixes:
		continue
	for suf in suffixes:
		tag = k + suf
		print ("#define %s	USE_%s" % (tag, tag))
print ()
print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
# Walk the code points in ascending order and emit the table in runs that
# are aligned to multiples of 8.  `starts`/`ends` collect the [start, end)
# range of every contiguous table segment; `offset` is the table index at
# which the current segment begins.
for u in uu:
	if u <= last:
		# Already covered by the previous run.
		continue
	block = data[u][1]

	# Extend the run while consecutive code points exist in the same block,
	# then round its end up to the next multiple of 8, minus one.
	# Membership is tested against the `data` dict (same keys as `uu`)
	# because `in` on the sorted list would be a linear scan per step.
	start = u//8*8
	end = start+1
	while end in data and block == data[end][1]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# Small gap: fill it with default cells and stay in the segment.
			print_block (None, last+1, start-1, data)
			last = start-1
		else:
			# Large gap: close the current segment (if any) and open a new
			# one, recording its offset into the table as a macro.
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			print ()
			print ()
			print ("#define use_offset_0x%04xu %d" % (start, offset))
			starts.append (start)

	print_block (block, start, end, data)
	last = end
# Close the final segment.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
# Close the table; the trailing comment records its size and fill rate.
print ()
print ()
occupancy = used * 100. / total
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))

# Lookup function: switch on the page (u >> page_bits), then test each
# table segment that touches that page.
for text in ("",
	     "USE_TABLE_ELEMENT_TYPE",
	     "hb_use_get_category (hb_codepoint_t u)",
	     "{",
	     "  switch (u >> %d)" % page_bits,
	     "  {"):
	print (text)
pages = {cp >> page_bits for cp in starts + ends}
for p in sorted (pages):
	print ("    case 0x%0Xu:" % p)
	for first, past_end in zip (starts, ends):
		if p == first >> page_bits or p == past_end >> page_bits:
			macro = "use_offset_0x%04xu" % first
			print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (first, past_end - 1, first, macro))
	print ("      break;")
	print ("")
for text in ("    default:",
	     "      break;",
	     "  }",
	     "  return USE_O;",
	     "}",
	     ""):
	print (text)

# Undefine the aliases again, in the same order they were defined.
for k in sorted (use_mapping):
	if use_positions.get (k):
		continue
	print ("#undef %s" % k)
for k in sorted (use_positions):
	suffixes = use_positions[k]
	if not suffixes:
		continue
	for suf in suffixes:
		print ("#undef %s" % (k + suf))
print ()
print ("/* == End of generated table == */")
530
# Maintain at least 50% occupancy in the table: abort the run otherwise.
minimum_occupancy = 50
if occupancy < minimum_occupancy:
	raise Exception ("Table too sparse, please investigate: ", occupancy)
534