1#!/usr/bin/python
2
3"""Generator of the mapping from OpenType tags to BCP 47 tags and vice
4versa.
5
6It creates a ``const LangTag[]``, matching the tags from the OpenType
7languages system tag list to the language subtags of the BCP 47 language
8subtag registry, with some manual adjustments. The mappings are
9supplemented with macrolanguages' sublanguages and retired codes'
10replacements, according to BCP 47 and some manual additions where BCP 47
11omits a retired code entirely.
12
13Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16multiple BCP 47 tags) are listed here, except when the alphabetically
17first BCP 47 tag happens to be the chosen disambiguated tag. In that
18case, the fallback behavior will choose the right tag anyway.
19"""
20
21from __future__ import absolute_import, division, print_function, unicode_literals
22
23import collections
# Python 2 provides the parser as ``HTMLParser.HTMLParser``; when that import
# fails, fall back to ``html.parser``. A matching ``write`` helper is defined
# alongside each import so that text is emitted UTF-8 encoded either way.
try:
	from HTMLParser import HTMLParser
	def write (s):
		"""Write ``s`` to standard output as UTF-8, with no trailing newline."""
		print (s.encode ('utf-8'), end='')
except ImportError:
	from html.parser import HTMLParser
	def write (s):
		"""Write ``s`` to standard output as UTF-8, with no trailing newline."""
		# Flush pending text-layer output before writing bytes straight to
		# the underlying binary buffer. ``sys`` is imported further down the
		# file; the name is only resolved when ``write`` is called.
		sys.stdout.flush ()
		sys.stdout.buffer.write (s.encode ('utf-8'))
33import io
34import itertools
35import re
36import sys
37import unicodedata
38
# Exactly two arguments are required: the OpenType language system tag
# registry and the BCP 47 language subtag registry.
if len (sys.argv) != 3:
	sys.stderr.write ('usage: ./gen-tag-table.py languagetags language-subtag-registry\n')
	sys.exit (1)
42
# Pick an entity decoder: ``html.unescape`` when the standard library
# provides it, otherwise the parser instance's own ``unescape`` method.
try:
	from html import unescape
except ImportError:
	def html_unescape (parser, entity):
		"""Decode the HTML entity ``entity`` with ``parser.unescape``."""
		return parser.unescape (entity)
else:
	def html_unescape (parser, entity):
		"""Decode the HTML entity ``entity``; ``parser`` is not used."""
		return unescape (entity)
50
def expect (condition, message=None):
	"""Raise ``AssertionError`` unless ``condition`` is truthy.

	Args:
		condition: The value that is expected to be truthy.
		message (str): An optional message for the raised error. If
			``None``, the error is raised without arguments.

	Raises:
		AssertionError: If ``condition`` is falsy.
	"""
	if condition:
		return
	raise (AssertionError if message is None else AssertionError (message))
56
# Map of ISO 639-3 codes to their ISO 639-1 equivalents,
# from http://www-01.sil.org/iso639-3/iso-639-3.tab
58ISO_639_3_TO_1 = {
59	'aar': 'aa',
60	'abk': 'ab',
61	'afr': 'af',
62	'aka': 'ak',
63	'amh': 'am',
64	'ara': 'ar',
65	'arg': 'an',
66	'asm': 'as',
67	'ava': 'av',
68	'ave': 'ae',
69	'aym': 'ay',
70	'aze': 'az',
71	'bak': 'ba',
72	'bam': 'bm',
73	'bel': 'be',
74	'ben': 'bn',
75	'bis': 'bi',
76	'bod': 'bo',
77	'bos': 'bs',
78	'bre': 'br',
79	'bul': 'bg',
80	'cat': 'ca',
81	'ces': 'cs',
82	'cha': 'ch',
83	'che': 'ce',
84	'chu': 'cu',
85	'chv': 'cv',
86	'cor': 'kw',
87	'cos': 'co',
88	'cre': 'cr',
89	'cym': 'cy',
90	'dan': 'da',
91	'deu': 'de',
92	'div': 'dv',
93	'dzo': 'dz',
94	'ell': 'el',
95	'eng': 'en',
96	'epo': 'eo',
97	'est': 'et',
98	'eus': 'eu',
99	'ewe': 'ee',
100	'fao': 'fo',
101	'fas': 'fa',
102	'fij': 'fj',
103	'fin': 'fi',
104	'fra': 'fr',
105	'fry': 'fy',
106	'ful': 'ff',
107	'gla': 'gd',
108	'gle': 'ga',
109	'glg': 'gl',
110	'glv': 'gv',
111	'grn': 'gn',
112	'guj': 'gu',
113	'hat': 'ht',
114	'hau': 'ha',
115	'hbs': 'sh',
116	'heb': 'he',
117	'her': 'hz',
118	'hin': 'hi',
119	'hmo': 'ho',
120	'hrv': 'hr',
121	'hun': 'hu',
122	'hye': 'hy',
123	'ibo': 'ig',
124	'ido': 'io',
125	'iii': 'ii',
126	'iku': 'iu',
127	'ile': 'ie',
128	'ina': 'ia',
129	'ind': 'id',
130	'ipk': 'ik',
131	'isl': 'is',
132	'ita': 'it',
133	'jav': 'jv',
134	'jpn': 'ja',
135	'kal': 'kl',
136	'kan': 'kn',
137	'kas': 'ks',
138	'kat': 'ka',
139	'kau': 'kr',
140	'kaz': 'kk',
141	'khm': 'km',
142	'kik': 'ki',
143	'kin': 'rw',
144	'kir': 'ky',
145	'kom': 'kv',
146	'kon': 'kg',
147	'kor': 'ko',
148	'kua': 'kj',
149	'kur': 'ku',
150	'lao': 'lo',
151	'lat': 'la',
152	'lav': 'lv',
153	'lim': 'li',
154	'lin': 'ln',
155	'lit': 'lt',
156	'ltz': 'lb',
157	'lub': 'lu',
158	'lug': 'lg',
159	'mah': 'mh',
160	'mal': 'ml',
161	'mar': 'mr',
162	'mkd': 'mk',
163	'mlg': 'mg',
164	'mlt': 'mt',
165	'mol': 'mo',
166	'mon': 'mn',
167	'mri': 'mi',
168	'msa': 'ms',
169	'mya': 'my',
170	'nau': 'na',
171	'nav': 'nv',
172	'nbl': 'nr',
173	'nde': 'nd',
174	'ndo': 'ng',
175	'nep': 'ne',
176	'nld': 'nl',
177	'nno': 'nn',
178	'nob': 'nb',
179	'nor': 'no',
180	'nya': 'ny',
181	'oci': 'oc',
182	'oji': 'oj',
183	'ori': 'or',
184	'orm': 'om',
185	'oss': 'os',
186	'pan': 'pa',
187	'pli': 'pi',
188	'pol': 'pl',
189	'por': 'pt',
190	'pus': 'ps',
191	'que': 'qu',
192	'roh': 'rm',
193	'ron': 'ro',
194	'run': 'rn',
195	'rus': 'ru',
196	'sag': 'sg',
197	'san': 'sa',
198	'sin': 'si',
199	'slk': 'sk',
200	'slv': 'sl',
201	'sme': 'se',
202	'smo': 'sm',
203	'sna': 'sn',
204	'snd': 'sd',
205	'som': 'so',
206	'sot': 'st',
207	'spa': 'es',
208	'sqi': 'sq',
209	'srd': 'sc',
210	'srp': 'sr',
211	'ssw': 'ss',
212	'sun': 'su',
213	'swa': 'sw',
214	'swe': 'sv',
215	'tah': 'ty',
216	'tam': 'ta',
217	'tat': 'tt',
218	'tel': 'te',
219	'tgk': 'tg',
220	'tgl': 'tl',
221	'tha': 'th',
222	'tir': 'ti',
223	'ton': 'to',
224	'tsn': 'tn',
225	'tso': 'ts',
226	'tuk': 'tk',
227	'tur': 'tr',
228	'twi': 'tw',
229	'uig': 'ug',
230	'ukr': 'uk',
231	'urd': 'ur',
232	'uzb': 'uz',
233	'ven': 've',
234	'vie': 'vi',
235	'vol': 'vo',
236	'wln': 'wa',
237	'wol': 'wo',
238	'xho': 'xh',
239	'yid': 'yi',
240	'yor': 'yo',
241	'zha': 'za',
242	'zho': 'zh',
243	'zul': 'zu',
244}
245
class LanguageTag (object):
	"""A BCP 47 language tag, split into its subtags.

	The tag is lowercased before it is split on ``'-'`` and before it is
	looked up in ``bcp_47.grandfathered``.

	Attributes:
		subtags (List[str]): The lowercased subtags of this tag, in order.
		grandfathered (bool): Whether this tag is grandfathered. If
			``True``, the entire lowercased tag is the ``language``
			and the other subtag fields are empty strings.
		language (str): The language subtag.
		script (str): The first script subtag (four characters, not
			starting with a digit), or ``None`` if there is none.
		region (str): The first region subtag after the language (two
			characters not starting with a digit, or three characters
			starting with a digit), or ``None`` if there is none.
		variant (str): The first variant subtag (more than four
			characters, or four characters starting with a digit), or
			``None`` if there is none.

	Args:
		tag (str): A BCP 47 language tag.

	"""
	def __init__ (self, tag):
		global bcp_47
		lowered = tag.lower ()
		self.subtags = lowered.split ('-')
		self.grandfathered = lowered in bcp_47.grandfathered
		if self.grandfathered:
			# A grandfathered tag is treated as one opaque language.
			self.language = lowered
			self.script = self.region = self.variant = ''
			return

		# Subtag kinds are told apart by length and by whether the first
		# character sorts after the digits.
		def is_script (s):
			return len (s) == 4 and s[0] > '9'

		def is_region (s):
			return (len (s) == 2 and s[0] > '9') or (len (s) == 3 and s[0] <= '9')

		def is_variant (s):
			return len (s) > 4 or (len (s) == 4 and s[0] <= '9')

		self.language = self.subtags[0]
		self.script = self._find_first (is_script, self.subtags)
		# The language subtag itself is excluded from the region search.
		self.region = self._find_first (is_region, self.subtags[1:])
		self.variant = self._find_first (is_variant, self.subtags)

	def __str__(self):
		return '-'.join (self.subtags)

	def __repr__ (self):
		return 'LanguageTag(%r)' % (str (self),)

	@staticmethod
	def _find_first (function, sequence):
		"""Return the first item of ``sequence`` for which ``function``
		returns a truthy value, or ``None`` if there is no such item.
		"""
		for item in sequence:
			if function (item):
				return item
		return None

	def is_complex (self):
		"""Return whether this tag is too complex to represent as a
		``LangTag`` in the generated code.

		Complex tags need to be handled in
		``hb_ot_tags_from_complex_language``.

		A single-subtag tag is never complex. A multi-subtag tag is
		simple only if it is grandfathered, its second subtag is not
		three characters long, and its first subtag has the same
		OpenType mapping as the whole tag.

		Returns:
			Whether this tag is complex.
		"""
		if len (self.subtags) == 1:
			return False
		if not self.grandfathered:
			return True
		if len (self.subtags[1]) == 3:
			return True
		# ``ot.from_bcp_47`` is looked up by plain indexing, first subtag
		# first, exactly as the comparison requires.
		return not (ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])

	def get_group (self):
		"""Return the group into which this tag should be categorized in
		``hb_ot_tags_from_complex_language``.

		The group is the first letter of the tag, or ``'und'`` if this tag
		should not be matched in a ``switch`` statement in the generated
		code. That is the case when the language is ``'und'`` or when the
		variant has exactly one registered prefix.

		Returns:
			This tag's group.
		"""
		if self.language == 'und':
			return 'und'
		if self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1:
			return 'und'
		return self.language[0]
321
class OpenTypeRegistryParser (HTMLParser):
	"""A parser for the OpenType language system tag registry.

	The registry is an HTML document; each table row (``<tr>``) holding
	two or three cells (``<td>``) describes one language system tag: its
	name, the tag itself, and optionally a comma-separated list of ISO
	639 codes.

	Attributes:
		header (str): The "last updated" line of the registry, taken
			from the ``<meta name="updated_at">`` start tag.
		names (Mapping[str, str]): A map of language system tags to the
			names they are given in the registry.
		ranks (DefaultDict[str, int]): A map of language system tags to
			numbers. If a single BCP 47 tag corresponds to multiple
			OpenType tags, the tags are ordered in increasing order by
			rank. The rank is based on the number of BCP 47 tags
			associated with a tag, though it may be manually modified.
		to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
			OpenType language system tags to sets of BCP 47 tags.
		from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
			inverted. Its values start as unsorted sets;
			``sort_languages`` converts them to sorted lists.

	"""
	def __init__ (self):
		HTMLParser.__init__ (self)
		self.header = ''
		self.names = {}
		self.ranks = collections.defaultdict (int)
		self.to_bcp_47 = collections.defaultdict (set)
		self.from_bcp_47 = collections.defaultdict (set)
		# Whether the parser is in a <td> element
		self._td = False
		# The text of the <td> elements of the current <tr> element.
		self._current_tr = []

	def handle_starttag (self, tag, attrs):
		"""Track table structure and capture the ``updated_at`` header."""
		if tag == 'meta':
			for attr, value in attrs:
				if attr == 'name' and value == 'updated_at':
					self.header = self.get_starttag_text ()
					break
		elif tag == 'td':
			# Start collecting the text of a new cell.
			self._td = True
			self._current_tr.append ('')
		elif tag == 'tr':
			self._current_tr = []

	def handle_endtag (self, tag):
		"""Record the row that just ended, if it contained any cells.

		Raises:
			AssertionError: If the row does not have two or three cells,
				or if its tag cell is longer than four characters
				without ending in `` (deprecated)``.
		"""
		if tag == 'td':
			self._td = False
		elif tag == 'tr' and self._current_tr:
			expect (2 <= len (self._current_tr) <= 3)
			name = self._current_tr[0].strip ()
			ot_tag = self._current_tr[1].strip ("\t\n\v\f\r '")
			rank = 0
			if len (ot_tag) > 4:
				# Deprecated tags are marked in the tag cell itself; they
				# start with a higher rank than non-deprecated tags.
				expect (ot_tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % ot_tag)
				name += ' (deprecated)'
				ot_tag = ot_tag.split (' ')[0]
				rank = 1
			self.names[ot_tag] = re.sub (' languages$', '', name)
			# A row may have only two cells, or an empty third cell; in
			# both cases there are no ISO codes to record.
			if len (self._current_tr) < 3 or not self._current_tr[2]:
				return
			iso_codes = self._current_tr[2].strip ()
			# Prefer the two-letter ISO 639-1 code where one exists.
			self.to_bcp_47[ot_tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
			rank += 2 * len (self.to_bcp_47[ot_tag])
			self.ranks[ot_tag] = rank

	def handle_data (self, data):
		"""Append text to the current cell; text outside cells is ignored."""
		if self._td:
			self._current_tr[-1] += data

	def handle_charref (self, name):
		self.handle_data (html_unescape (self, '&#%s;' % name))

	def handle_entityref (self, name):
		self.handle_data (html_unescape (self, '&%s;' % name))

	def parse (self, filename):
		"""Parse the OpenType language system tag registry.

		After parsing, ``from_bcp_47`` is filled in by inverting
		``to_bcp_47``.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: If no ``updated_at`` header was found.
		"""
		with io.open (filename, encoding='utf-8') as f:
			self.feed (f.read ())
		expect (self.header)
		for tag, iso_codes in self.to_bcp_47.items ():
			for iso_code in iso_codes:
				self.from_bcp_47[iso_code].add (tag)

	def add_language (self, bcp_47_tag, ot_tag):
		"""Add a language as if it were in the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
				a language subtag, and if the language subtag is a
				macrolanguage, then new languages are added corresponding
				to the macrolanguages' individual languages with the
				remainder of the tag appended.
			ot_tag (str): An OpenType language system tag.
		"""
		global bcp_47
		self.to_bcp_47[ot_tag].add (bcp_47_tag)
		self.from_bcp_47[bcp_47_tag].add (ot_tag)
		if bcp_47_tag.lower () in bcp_47.grandfathered:
			return
		try:
			[macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
		except ValueError:
			# The tag is a lone language subtag; nothing to derive.
			return
		if macrolanguage in bcp_47.macrolanguages:
			s = set ()
			for language in bcp_47.macrolanguages[macrolanguage]:
				if language.lower () not in bcp_47.grandfathered:
					s.add ('%s-%s' % (language, suffix))
			bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s

	@staticmethod
	def _remove_language (tag_1, dict_1, dict_2):
		"""Remove ``tag_1`` from ``dict_1`` and from every value of
		``dict_2`` it was mapped to, dropping entries that become empty.

		Raises:
			KeyError: If ``tag_1`` is not a key of ``dict_1``.
		"""
		for tag_2 in dict_1.pop (tag_1):
			dict_2[tag_2].remove (tag_1)
			if not dict_2[tag_2]:
				del dict_2[tag_2]

	def remove_language_ot (self, ot_tag):
		"""Remove an OpenType tag from the registry.

		Args:
			ot_tag (str): An OpenType tag.
		"""
		self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)

	def remove_language_bcp_47 (self, bcp_47_tag):
		"""Remove a BCP 47 tag from the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag.
		"""
		self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)

	def inherit_from_macrolanguages (self):
		"""Copy mappings from macrolanguages to individual languages.

		If a BCP 47 tag for an individual mapping has no OpenType
		mapping but its macrolanguage does, the mapping is copied to
		the individual language. For example, als (Tosk Albanian) has no
		explicit mapping, so it inherits from sq (Albanian) the mapping
		to SQI.

		If a BCP 47 tag for a macrolanguage has no OpenType mapping but
		all of its individual languages do, the OpenType tags common to
		all of them (if any) are copied to the macrolanguage.
		"""
		global bcp_47
		# Decisions are based on a snapshot of the keys taken before any
		# mapping is added; the value sets themselves are shared.
		original_ot_from_bcp_47 = dict (self.from_bcp_47)
		for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
			ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
			if ot_macrolanguages:
				for ot_macrolanguage in ot_macrolanguages:
					for language in languages:
						# Remove the following condition if e.g. nn should map to NYN,NOR
						# instead of just NYN.
						if language not in original_ot_from_bcp_47:
							self.add_language (language, ot_macrolanguage)
							self.ranks[ot_macrolanguage] += 1
			else:
				# Accumulate the tags shared by every individual language,
				# giving up as soon as the running set is empty.
				for language in languages:
					if language in original_ot_from_bcp_47:
						if ot_macrolanguages:
							ml = original_ot_from_bcp_47[language]
							if ml:
								ot_macrolanguages &= ml
						else:
							ot_macrolanguages |= original_ot_from_bcp_47[language]
					else:
						ot_macrolanguages.clear ()
					if not ot_macrolanguages:
						break
				for ot_macrolanguage in ot_macrolanguages:
					self.add_language (macrolanguage, ot_macrolanguage)

	def sort_languages (self):
		"""Sort the values of ``from_bcp_47`` in ascending rank order.

		Each value is replaced by a list ordered by rank (adjusted by
		``rank_delta``), with ties broken by the tag itself.
		"""
		for language, tags in self.from_bcp_47.items ():
			self.from_bcp_47[language] = sorted (tags,
					key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
506
# The module-level OpenType registry instance. It is referenced as a global
# by ``LanguageTag.is_complex`` and ``BCP47Parser._add_macrolanguage``.
ot = OpenTypeRegistryParser ()
508
class BCP47Parser (object):
	"""A parser for the BCP 47 subtag registry.

	Attributes:
		header (str): The "File-Date" line of the registry.
		names (Mapping[str, str]): A map of subtags to the names they
			are given in the registry. Each value is a
			``'\\n'``-separated list of names.
		scopes (Mapping[str, str]): A map of language subtags to strings
			suffixed to language names, including suffixes to explain
			language scopes.
		macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
			language subtags to the sets of language subtags which
			inherit from them. See
			``OpenTypeRegistryParser.inherit_from_macrolanguages``.
		prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
			subtags to their prefixes.
		grandfathered (AbstractSet[str]): The set of grandfathered tags,
			normalized to lowercase.

	"""
	def __init__ (self):
		self.header = ''
		self.names = {}
		self.scopes = {}
		self.macrolanguages = collections.defaultdict (set)
		self.prefixes = collections.defaultdict (set)
		self.grandfathered = set ()

	def parse (self, filename):
		"""Parse the BCP 47 subtag registry.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: If no "File-Date" line was found.
		"""
		with io.open (filename, encoding='utf-8') as f:
			subtag_type = None
			subtag = None
			deprecated = False
			has_preferred_value = False
			line_buffer = ''
			# Lines starting with a space continue the previous line, so
			# each physical line is held in ``line_buffer`` until the next
			# non-continuation line shows that it is complete. The extra
			# empty line at the end flushes the final buffered line.
			for line in itertools.chain (f, ['']):
				line = line.rstrip ()
				if line.startswith (' '):
					line_buffer += line[1:]
					continue
				line, line_buffer = line_buffer, line
				if line.startswith ('Type: '):
					# A new record begins; reset the per-record state.
					subtag_type = line.split (' ')[1]
					deprecated = False
					has_preferred_value = False
				elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
					subtag = line.split (' ')[1]
					if subtag_type == 'grandfathered':
						self.grandfathered.add (subtag.lower ())
				elif line.startswith ('Description: '):
					description = line.split (' ', 1)[1].replace (' (individual language)', '')
					# A raw string keeps ``\(`` from being read as an
					# (invalid) string escape sequence.
					description = re.sub (r' (\((individual |macro)language\)|languages)$', '',
							description)
					if subtag in self.names:
						self.names[subtag] += '\n' + description
					else:
						self.names[subtag] = description
				elif subtag_type == 'language' or subtag_type == 'grandfathered':
					if line.startswith ('Scope: '):
						scope = line.split (' ')[1]
						if scope == 'macrolanguage':
							scope = ' [macrolanguage]'
						elif scope == 'collection':
							scope = ' [family]'
						else:
							continue
						self.scopes[subtag] = scope
					elif line.startswith ('Deprecated: '):
						self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
						deprecated = True
					elif deprecated and line.startswith ('Comments: see '):
						# If a subtag is split into multiple replacement subtags,
						# it essentially represents a macrolanguage.
						for language in line.replace (',', '').split (' ')[2:]:
							self._add_macrolanguage (subtag, language)
					elif line.startswith ('Preferred-Value: '):
						# If a subtag is deprecated in favor of a single replacement subtag,
						# it is either a dialect or synonym of the preferred subtag. Either
						# way, it is close enough to the truth to consider the replacement
						# the macrolanguage of the deprecated language.
						has_preferred_value = True
						macrolanguage = line.split (' ')[1]
						self._add_macrolanguage (macrolanguage, subtag)
					elif not has_preferred_value and line.startswith ('Macrolanguage: '):
						self._add_macrolanguage (line.split (' ')[1], subtag)
				elif subtag_type == 'variant':
					if line.startswith ('Prefix: '):
						self.prefixes[subtag].add (line.split (' ')[1])
				elif line.startswith ('File-Date: '):
					self.header = line
		expect (self.header)

	def _add_macrolanguage (self, macrolanguage, language):
		"""Record ``language`` as inheriting from ``macrolanguage``.

		If ``language`` has no OpenType mapping, the languages that
		inherit from it are first added under ``macrolanguage`` too. If
		``macrolanguage`` has no OpenType mapping and already inherits
		from some other entry, ``language`` is added to that entry
		instead.
		"""
		global ot
		if language not in ot.from_bcp_47:
			for l in self.macrolanguages.get (language, set ()):
				self._add_macrolanguage (macrolanguage, l)
		if macrolanguage not in ot.from_bcp_47:
			for ls in list (self.macrolanguages.values ()):
				if macrolanguage in ls:
					ls.add (language)
					return
		self.macrolanguages[macrolanguage].add (language)

	def remove_extra_macrolanguages (self):
		"""Make every language have at most one macrolanguage.

		For each language listed under several macrolanguages, the one
		with the most languages is kept as the parent and the others are
		added beneath it through ``_add_macrolanguage``.
		"""
		inverted = collections.defaultdict (list)
		for macrolanguage, languages in self.macrolanguages.items ():
			for language in languages:
				inverted[language].append (macrolanguage)
		for language, macrolanguages in inverted.items ():
			if len (macrolanguages) > 1:
				macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
				biggest_macrolanguage = macrolanguages.pop ()
				for macrolanguage in macrolanguages:
					self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

	def get_name (self, lt):
		"""Return the names of the subtags in a language tag.

		Only the first registered name of each subtag is used. Script
		subtags are looked up in title case and region subtags in upper
		case.

		Args:
			lt (LanguageTag): A BCP 47 language tag.

		Returns:
			The name form of ``lt``: the names of its language, script,
			region and variant subtags (where present), joined by
			``'; '``.

		Raises:
			KeyError: If a subtag of ``lt`` has no registered name.
		"""
		name = self.names[lt.language].split ('\n')[0]
		if lt.script:
			name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
		if lt.region:
			name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
		if lt.variant:
			name += '; ' + self.names[lt.variant].split ('\n')[0]
		return name
649
# The module-level BCP 47 registry instance, referenced as a global by
# ``LanguageTag`` and ``OpenTypeRegistryParser``.
bcp_47 = BCP47Parser ()

# Parse both registries named on the command line. The OpenType registry is
# parsed first; ``BCP47Parser._add_macrolanguage`` consults
# ``ot.from_bcp_47`` while the BCP 47 registry is being read.
ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])
654
# Manual adjustments applied on top of the parsed registries: extra
# mappings, rank tweaks (a lower rank sorts first, see
# ``OpenTypeRegistryParser.sort_languages``), name overrides and
# macrolanguage overrides.
ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

# Rank BAL just after KAR.
ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

# Replace the registry's mappings for PGR with the tag el-polyton.
ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

# Register flm by hand as a retired code whose only sublanguage is cfm.
bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

# Rank FNE just after TNE.
ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

# Replace the registry's mappings for IRT with the tag ga-Latg.
ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

# Replace the registry's mappings for KGE with the tag und-Geok.
ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

# Add GUK as an extra tag for guk, ranked just after GMZ.
ot.add_language ('guk', 'GUK')
ot.names['GUK'] = 'Gumuz (SIL fonts)'
ot.ranks['GUK'] = ot.ranks['GMZ'] + 1

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

# Map kht to KHN as well as KHT, label the two names apart, and rank KHN
# ahead of KHT.
ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)'
ot.ranks['KHN'] = ot.ranks['KHT']
ot.ranks['KHT'] += 1

# Rank LCR just after MCR.
ot.ranks['LCR'] = ot.ranks['MCR'] + 1

# Override the name of MAL and push MLR one rank later.
ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

# Register mhv by hand as a retired code.
bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

# Explicit mappings for qu and the individual qu*/qv*/qw*/qx* codes to
# QUZ, QWH, QVI and QUH.
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

# Move mo from the sublanguages of ro to those of ro-MD.
bcp_47.macrolanguages['ro'].remove ('mo')
bcp_47.macrolanguages['ro-MD'].add ('mo')

# Add SGW as an extra tag for sgw, named after CHG and ranked just after it.
ot.add_language ('sgw', 'SGW')
ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)'
ot.ranks['SGW'] = ot.ranks['CHG'] + 1

# Replace the registry's mappings for SYRE, SYRJ and SYRN with the
# corresponding und-<script> tags.
ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

# Register xst by hand as a retired code split into stv and wle.
bcp_47.names['xst'] = u"Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

# Rebuild the zh mappings: drop the registry's entries for ZHH, ZHP and
# ZHT, detach lzh and yue from the zh macrolanguage, then add explicit
# script- and region-qualified tags.
ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}
781
def rank_delta (bcp_47, ot):
	"""Return a delta to apply to a BCP 47 tag's rank.

	Most OpenType tags have a constant rank, but a few have ranks that
	depend on the BCP 47 tag: AKA is promoted for ``ak`` and TWI is
	promoted for ``tw``.

	Args:
		bcp_47 (str): A BCP 47 tag.
		ot (str): An OpenType tag.

	Returns:
		A number to add to ``ot``'s rank when sorting ``bcp_47``'s
		OpenType equivalents.
	"""
	promoted_pairs = (('ak', 'AKA'), ('tw', 'TWI'))
	return -1 if (bcp_47, ot) in promoted_pairs else 0
801
# A map of ambiguous OpenType language system tags (those that correspond
# to multiple BCP 47 tags) to the particular BCP 47 tag chosen for each.
disambiguation = {
	'ALT': 'alt',
	'ARK': 'rki',
	'BHI': 'bhb',
	'BLN': 'bjt',
	'BTI': 'beb',
	'CCHN': 'cco',
	'CMR': 'swb',
	'CPP': 'crp',
	'CRR': 'crx',
	'DUJ': 'dwu',
	'ECR': 'crj',
	'HAL': 'cfm',
	'HND': 'hnd',
	'KIS': 'kqs',
	'LRC': 'bqi',
	'NDB': 'nd',
	'NIS': 'njz',
	'PLG': 'pce',
	'PRO': 'pro',
	'QIN': 'bgr',
	'QUH': 'quh',
	'QVI': 'qvi',
	'QWH': 'qwh',
	'SIG': 'stv',
	'TNE': 'yrk',
	'ZHH': 'zh-HK',
	'ZHS': 'zh-Hans',
	'ZHT': 'zh-Hant',
}
832
# Propagate mappings between macrolanguages and their individual languages.
# The inheritance pass runs both before and after languages with several
# macrolanguages are consolidated; only then are the per-language OpenType
# tag collections sorted by rank.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
ot.sort_languages ()

# Emit the banner of the generated file (recording how it was generated and
# from which registry versions), the include guard, and the opening of the
# ``ot_languages`` table.
print ('/* == Start of generated table == */')
print ('/*')
print (' * The following table is generated by running:')
print (' *')
print (' *   %s languagetags language-subtag-registry' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
print (' * %s' % ot.header.strip ())
print (' * %s' % bcp_47.header)
print (' */')
print ()
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
print ('static const LangTag ot_languages[] = {')
854
def hb_tag (tag):
	"""Convert a tag to ``HB_TAG`` form.

	The tag is padded with spaces on the right, or truncated, to exactly
	four characters.

	Args:
		tag (str): An OpenType tag.

	Returns:
		A snippet of C++ representing ``tag``.
	"""
	four_chars = tag.ljust (4)[:4]
	quoted = [u"'%s'" % char for char in four_chars]
	return u"HB_TAG(%s)" % u','.join (quoted)
865
def get_variant_set (name):
	"""Return a set of variant language names from a name.

	``name`` is split on newlines, parentheses and commas. Each non-empty
	piece has U+2019 replaced by an apostrophe, is decomposed (NFD),
	reduced to its ASCII characters, and stripped of surrounding
	whitespace.

	Args:
		name (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		A set of normalized language names, as ASCII-encoded strings.
	"""
	variants = set ()
	for piece in re.split ('[\n(),]', name):
		if not piece:
			continue
		decomposed = unicodedata.normalize ('NFD', piece.replace ('\u2019', u"'"))
		ascii_only = decomposed.encode ('ASCII', 'ignore')
		variants.add (ascii_only.strip ())
	return variants
880
def language_name_intersection (a, b):
	"""Return the names in common between two language names.

	Both arguments are normalized with ``get_variant_set`` before they
	are compared.

	Args:
		a (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.
		b (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		The normalized language names shared by ``a`` and ``b``.
	"""
	names_a = get_variant_set (a)
	names_b = get_variant_set (b)
	return names_a & names_b
894
def get_matching_language_name (intersection, candidates):
	"""Return the first candidate sharing a normalized name with
	``intersection``.

	Args:
		intersection (AbstractSet): A set of normalized language names,
			as returned by ``language_name_intersection``.
		candidates (Iterable[str]): Language names to search, in order.

	Returns:
		The first element of ``candidates`` whose variant set is not
		disjoint from ``intersection``.

	Raises:
		StopIteration: If no candidate matches.
	"""
	for candidate in candidates:
		if not intersection.isdisjoint (get_variant_set (candidate)):
			return candidate
	raise StopIteration
897
def same_tag (bcp_47_tag, ot_tags):
	"""Return whether a BCP 47 tag's only OpenType tag is the same tag.

	Args:
		bcp_47_tag (str): A BCP 47 tag.
		ot_tags (List[str]): The OpenType tags ``bcp_47_tag`` maps to.

	Returns:
		``True`` if ``bcp_47_tag`` is three characters long and
		``ot_tags`` holds exactly one tag whose lowercase form equals
		``bcp_47_tag``; ``False`` otherwise.
	"""
	if len (bcp_47_tag) != 3:
		return False
	if len (ot_tags) != 1:
		return False
	return bcp_47_tag == ot_tags[0].lower ()
900
# Emit one ``LangTag`` entry per (BCP 47 language, OpenType tag) pair, in
# sorted order of the BCP 47 tag. Empty tags and tags with more than one
# subtag are skipped here.
for language, tags in sorted (ot.from_bcp_47.items ()):
	if language == '' or '-' in language:
		continue
	# An entry is emitted commented out when the language's only OpenType
	# tag is just its three-letter code in upper case (see ``same_tag``).
	commented_out = same_tag (language, tags)
	# NOTE: the index ``i`` is not used in the loop body.
	for i, tag in enumerate (tags, start=1):
		print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else '  ', language, hb_tag (tag)), end='')
		if commented_out:
			print ('*/', end='')
		print ('\t/* ', end='')
		bcp_47_name = bcp_47.names.get (language, '')
		bcp_47_name_candidates = bcp_47_name.split ('\n')
		intersection = language_name_intersection (bcp_47_name, ot.names[tag])
		scope = bcp_47.scopes.get (language, '')
		if not intersection:
			# The two registries share no name: show both.
			write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
		else:
			# The registries agree on a name: remember the matching BCP 47
			# name for this language and show the longer of the two names.
			name = get_matching_language_name (intersection, bcp_47_name_candidates)
			bcp_47.names[language] = name
			write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
		print (' */')

# Close the ``ot_languages`` table.
print ('};')
print ()
924
# Emit the documentation comment, signature and opening brace of the
# generated function ``hb_ot_tags_from_complex_language``.
print ('/**')
print (' * hb_ot_tags_from_complex_language:')
print (' * @lang_str: a BCP 47 language tag to convert.')
print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
print (' * conversion.')
print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
print (' * @tags: array of size at least @language_count to store the language tag')
print (' * results')
print (' *')
print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
print (' *')
print (' * Return value: Whether any language systems were retrieved.')
print (' **/')
print ('static bool')
print ('hb_ot_tags_from_complex_language (const char   *lang_str,')
print ('\t\t\t\t  const char   *limit,')
print ('\t\t\t\t  unsigned int *count /* IN/OUT */,')
print ('\t\t\t\t  hb_tag_t     *tags /* OUT */)')
print ('{')
945
def print_subtag_matches (subtag, new_line):
	"""Print a ``subtag_matches`` call for ``subtag``, if there is one.

	Nothing is printed when ``subtag`` is empty or ``None``. No trailing
	newline is printed.

	Args:
		subtag (str): The subtag to test for, without its leading hyphen.
		new_line (bool): Whether to first start a new line and join the
			test to the preceding condition with ``&&``.
	"""
	if not subtag:
		return
	if new_line:
		print ()
		print ('\t&& ', end='')
	print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
952
# Collect the complex tags (see ``LanguageTag.is_complex``), keyed by the
# group returned by ``LanguageTag.get_group``: either ``'und'`` or the first
# letter of the language. Each value is a list of
# ``(LanguageTag, OpenType tags)`` pairs.
#
# The tags are visited longest first, then alphabetically; presumably so
# that more specific tags are tested before shorter ones in the generated
# code. ``groupby`` only groups consecutive items with equal keys, so the
# groups are merged with ``+=``, which keeps that order within each key.
complex_tags = collections.defaultdict (list)
for initial, group in itertools.groupby ((lt_tags for lt_tags in [
			(LanguageTag (language), tags)
			for language, tags in sorted (ot.from_bcp_47.items (),
				key=lambda i: (-len (i[0]), i[0]))
		] if lt_tags[0].is_complex ()),
		key=lambda lt_tags: lt_tags[0].get_group ()):
	complex_tags[initial] += group
961
# Emit the tests for the 'und' group ahead of the switch statement. These
# tags are matched on their script, region and variant subtags alone,
# whatever the language subtag of the input is.
for initial, items in sorted (complex_tags.items ()):
	if initial != 'und':
		continue
	for lt, tags in items:
		# A tag is put in this group because of its variant only if the
		# variant has a single registered prefix; check that the prefix
		# really is the tag's language.
		if lt.variant in bcp_47.prefixes:
			expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
					'%s is not a valid prefix of %s' % (lt.language, lt.variant))
		print ('  if (', end='')
		print_subtag_matches (lt.script, False)
		print_subtag_matches (lt.region, False)
		print_subtag_matches (lt.variant, False)
		print (')')
		print ('  {')
		write ('    /* %s */' % bcp_47.get_name (lt))
		print ()
		if len (tags) == 1:
			write ('    tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
			print ()
			print ('    *count = 1;')
		else:
			# Declare the loop counter used by the generated ``for`` loop,
			# as the switch-based branches of the generated function do;
			# without it the generated code would not compile.
			print ('    unsigned int i;')
			print ('    hb_tag_t possible_tags[] = {')
			for tag in tags:
				write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
				print ()
			print ('    };')
			print ('    for (i = 0; i < %s && i < *count; i++)' % len (tags))
			print ('      tags[i] = possible_tags[i];')
			print ('    *count = i;')
		print ('    return true;')
		print ('  }')
992
# Emit a switch on the first character of the input, with one case per
# group of complex tags. The 'und' group is not part of the switch.
print ('  switch (lang_str[0])')
print ('  {')
for initial, items in sorted (complex_tags.items ()):
	if initial == 'und':
		continue
	print ("  case '%s':" % initial)
	for lt, tags in items:
		# Resolve the descriptive name first: the script and region are
		# cleared below once they are folded into the string literal, and
		# the name would otherwise silently omit them.
		lt_name = bcp_47.get_name (lt)
		print ('    if (', end='')
		if lt.grandfathered:
			print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
		else:
			# The first character was already matched by the ``case``.
			string_literal = lt.language[1:] + '-'
			if lt.script:
				# Fold the script (and a region following it) into the
				# literal, and mark them as consumed so that they are not
				# tested again with ``subtag_matches``.
				string_literal += lt.script
				lt.script = None
				if lt.region:
					string_literal += '-' + lt.region
					lt.region = None
			if string_literal[-1] == '-':
				# Only the language is in the literal: match it as a
				# prefix, including the hyphen that ends it.
				print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
			else:
				print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
		print_subtag_matches (lt.script, True)
		print_subtag_matches (lt.region, True)
		print_subtag_matches (lt.variant, True)
		print (')')
		print ('    {')
		write ('      /* %s */' % lt_name)
		print ()
		if len (tags) == 1:
			write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
			print ()
			print ('      *count = 1;')
		else:
			print ('      unsigned int i;')
			print ('      hb_tag_t possible_tags[] = {')
			for tag in tags:
				write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
				print ()
			print ('      };')
			print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
			print ('\ttags[i] = possible_tags[i];')
			print ('      *count = i;')
		print ('      return true;')
		print ('    }')
	print ('    break;')

print ('  }')
1041print ('  return false;')
1042print ('}')
# Emit a blank separator line, then the documentation comment, signature and
# opening ``switch`` of the generated hb_ot_ambiguous_tag_to_language.
print ('\n'.join ([
	'',
	'/**',
	' * hb_ot_ambiguous_tag_to_language',
	' * @tag: A language tag.',
	' *',
	' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
	' * many language tags) and the best tag is not the alphabetically first, or if',
	' * the best tag consists of multiple subtags, or if the best tag does not appear',
	' * in #ot_languages.',
	' *',
	' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
	' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
	' **/',
	'static hb_language_t',
	'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
	'{',
	'  switch (tag)',
	'  {',
]))
1061
def verify_disambiguation_dict ():
	"""Validate ``disambiguation`` and reduce it to the entries needed.

	``disambiguation`` maps ambiguous OpenType language system tags to
	the BCP 47 tag chosen for each of them. For every OpenType tag in
	``ot.to_bcp_47`` the candidates are its BCP 47 tags that are not
	grandfathered and whose first entry in ``ot.from_bcp_47`` is that
	OpenType tag.

	* No candidate: the OpenType tag must not be a key.
	* One candidate: the OpenType tag must not be a key; it is then
	  added automatically, but only if the candidate contains a hyphen.
	* Several candidates: a unique best one is looked for by scope,
	  trying macrolanguages, then families, then every candidate that
	  is not a retired code. If none of these yields exactly one tag,
	  the OpenType tag must already be a key and its value must be one
	  of its BCP 47 tags. Otherwise the unique tag is filled in unless
	  a value is already present. The entry is finally dropped again if
	  its value has no hyphen and equals the alphabetically first
	  candidate for which ``same_tag`` is false, because the fallback
	  would return that tag anyway.

	Lastly, every remaining key must be a known OpenType tag.

	Raises:
		AssertionError: Verification failed.
	"""
	# Scope filters, tried in this order until one keeps exactly one candidate.
	scope_filters = (
		lambda tag: bcp_47.scopes.get (tag) == ' [macrolanguage]',
		lambda tag: bcp_47.scopes.get (tag) == ' [family]',
		lambda tag: 'retired code' not in bcp_47.scopes.get (tag, ''),
	)
	for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
		candidates = [
			tag for tag in bcp_47_tags
			if tag not in bcp_47.grandfathered and ot.from_bcp_47.get (tag)[0] == ot_tag
		]
		if not candidates:
			expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
			continue
		if len (candidates) == 1:
			expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
			sole_candidate = candidates[0]
			if '-' in sole_candidate:
				disambiguation[ot_tag] = sole_candidate
			continue

		# Genuinely ambiguous: try to pick the best candidate automatically.
		for accepts in scope_filters:
			preferred = [tag for tag in candidates if accepts (tag)]
			if len (preferred) == 1:
				break
		if len (preferred) == 1:
			# An explicit entry, if present, takes precedence.
			disambiguation.setdefault (ot_tag, preferred[0])
		else:
			expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (preferred)))
			expect (disambiguation[ot_tag] in bcp_47_tags,
					'%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))

		# Drop the entry when the fallback would return the same tag.
		fallback_order = sorted (tag for tag in candidates if not same_tag (tag, ot.from_bcp_47.get (tag)))
		if not fallback_order:
			continue
		chosen = disambiguation[ot_tag]
		if chosen == fallback_order[0] and '-' not in chosen:
			del disambiguation[ot_tag]

	for ot_tag in disambiguation:
		expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1107
# Validate and prune ``disambiguation`` before anything is emitted from it.
verify_disambiguation_dict ()

# One ``case`` per remaining entry, ordered by OpenType tag. Lines that
# contain names go through write () so that they are output as UTF-8.
for ot_tag in sorted (disambiguation):
	bcp_47_tag = disambiguation[ot_tag]
	write ('  case %s:  /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
	print ()
	write ('    return hb_language_from_string ("%s", -1);  /* %s */'
		% (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
	print ()

# Default case, end of the function, and the trailer of the generated header.
print ('\n'.join ([
	'  default:',
	'    return HB_LANGUAGE_INVALID;',
	'  }',
	'}',
	'',
	'#endif /* HB_OT_TAG_TABLE_HH */',
	'',
	'/* == End of generated table == */',
]))
1124
1125