1from __future__ import (
2    print_function, division, absolute_import, unicode_literals)
3from fontTools.misc.py23 import *
4
5import re
6from bisect import bisect_right
7
8try:
9    # use unicodedata backport compatible with python2:
10    # https://github.com/mikekap/unicodedata2
11    from unicodedata2 import *
12except ImportError:  # pragma: no cover
13    # fall back to built-in unicodedata (possibly outdated)
14    from unicodedata import *
15
16from . import Blocks, Scripts, ScriptExtensions, OTTags
17
18
__all__ = [tostr(s) for s in (
    # names from built-in unicodedata module
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # additional functions
    "block",
    "script",
    "script_extension",
    "script_name",
    "script_code",
    "script_horizontal_direction",
    "ot_tags_from_script",
    "ot_tag_to_script",
)]
45
46
def script(char):
    """ Return the four-letter script code assigned to the Unicode character
    'char' as string.

    The code is looked up in the parallel tables 'Scripts.RANGES' and
    'Scripts.VALUES'.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(unichr(0x10FFFF))
    'Zzzz'
    """
    codepoint = byteord(char)
    # 'Scripts.RANGES' is a sorted list holding only the starting breakpoint
    # of each range. 'bisect_right' returns the insertion point located after
    # every entry that is <= codepoint, so the entry just before that point
    # is the start of the range containing the codepoint. Hence we subtract
    # 1 from the returned index to address the matching value.
    position = bisect_right(Scripts.RANGES, codepoint) - 1
    return Scripts.VALUES[position]
70
71
def script_extension(char):
    """ Return the script extension property assigned to the Unicode character
    'char' as a set of string.

    When the Script Extensions table holds None for the character, the
    result is a one-element set containing its Script property value.

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(unichr(0x060C)) == {'Arab', 'Rohg', 'Syrc', 'Thaa'}
    True
    >>> script_extension(unichr(0x10FFFF)) == {'Zzzz'}
    True
    """
    codepoint = byteord(char)
    # same range lookup as in 'script': take the entry preceding the
    # insertion point returned by 'bisect_right'
    position = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[position]
    if extensions is not None:
        return extensions
    # code points not explicitly listed for Script Extensions
    # have as their value the corresponding Script property value
    return {script(char)}
91
92
def script_name(code, default=KeyError):
    """ Return the long, human-readable script name given a four-letter
    Unicode script code.

    The name is read from 'Scripts.NAMES' and its underscores are replaced
    with spaces.

    If 'code' has no entry, the KeyError is propagated when 'default' is
    KeyError (or a subclass of it), which is the default behaviour. Any
    other 'default' (e.g. 'Unknown' or None) is returned as a fallback
    value instead of throwing an error.
    """
    try:
        raw_name = Scripts.NAMES[code]
    except KeyError:
        wants_error = isinstance(default, type) and issubclass(default, KeyError)
        if wants_error:
            # re-raise the original lookup error untouched
            raise
        return default
    else:
        return str(raw_name.replace("_", " "))
108
109
# Matches runs of the separator characters ignored by loose matching:
# '-', '_' and ' '.
_normalize_re = re.compile(r"[-_ ]+")


def _normalize_property_name(string):
    """Return 'string' without any '-', '_' or space, converted to lowercase.

    The result is used as a key for loose matching of property names.
    """
    # Separators are removed first, then the remainder is lowercased; the
    # order is kept because lowercasing can depend on neighbouring characters.
    without_separators = _normalize_re.sub("", string)
    return without_separators.lower()
116
117
# Reverse lookup table built from 'Scripts.NAMES': maps the loosely
# normalized long script name to its script code.
_SCRIPT_CODES = dict(
    (_normalize_property_name(long_name), short_code)
    for short_code, long_name in Scripts.NAMES.items()
)
120
121
def script_code(script_name, default=KeyError):
    """Returns the four-letter Unicode script code from its long name

    The name is matched loosely: case, spaces, '-' and '_' are ignored.

    If no matching script code is found, the KeyError is propagated when
    'default' is KeyError (or a subclass of it), which is the default
    behaviour. Any other 'default' (e.g. 'Zzzz' or None) is returned as a
    fallback value instead of throwing an error.
    """
    lookup_key = _normalize_property_name(script_name)
    try:
        found = _SCRIPT_CODES[lookup_key]
    except KeyError:
        wants_error = isinstance(default, type) and issubclass(default, KeyError)
        if wants_error:
            # re-raise the original lookup error untouched
            raise
        return default
    else:
        return found
137
138
# The data on script direction is taken from harfbuzz's "hb-common.cc":
# https://goo.gl/X5FDXC
# The entries up to Unicode 9.0 match the CLDR "scriptMetadata.txt" as of
# January 2018:
# http://unicode.org/repos/cldr/trunk/common/properties/scriptMetadata.txt
# Entries for later Unicode versions follow the same harfbuzz table, so that
# newer right-to-left scripts are not reported as left-to-right.
RTL_SCRIPTS = {
    # Unicode-1.1 additions
    'Arab',  # Arabic
    'Hebr',  # Hebrew

    # Unicode-3.0 additions
    'Syrc',  # Syriac
    'Thaa',  # Thaana

    # Unicode-4.0 additions
    'Cprt',  # Cypriot

    # Unicode-4.1 additions
    'Khar',  # Kharoshthi

    # Unicode-5.0 additions
    'Phnx',  # Phoenician
    'Nkoo',  # Nko

    # Unicode-5.1 additions
    'Lydi',  # Lydian

    # Unicode-5.2 additions
    'Avst',  # Avestan
    'Armi',  # Imperial Aramaic
    'Phli',  # Inscriptional Pahlavi
    'Prti',  # Inscriptional Parthian
    'Sarb',  # Old South Arabian
    'Orkh',  # Old Turkic
    'Samr',  # Samaritan

    # Unicode-6.0 additions
    'Mand',  # Mandaic

    # Unicode-6.1 additions
    'Merc',  # Meroitic Cursive
    'Mero',  # Meroitic Hieroglyphs

    # Unicode-7.0 additions
    'Mani',  # Manichaean
    'Mend',  # Mende Kikakui
    'Nbat',  # Nabataean
    'Narb',  # Old North Arabian
    'Palm',  # Palmyrene
    'Phlp',  # Psalter Pahlavi

    # Unicode-8.0 additions
    'Hatr',  # Hatran
    'Hung',  # Old Hungarian

    # Unicode-9.0 additions
    'Adlm',  # Adlam

    # Unicode-11.0 additions
    'Rohg',  # Hanifi Rohingya
    'Sogo',  # Old Sogdian
    'Sogd',  # Sogdian

    # Unicode-12.0 additions
    'Elym',  # Elymaic

    # Unicode-13.0 additions
    'Chrs',  # Chorasmian
    'Yezi',  # Yezidi

    # Unicode-14.0 additions
    'Ougr',  # Old Uyghur
}
196
def script_horizontal_direction(script_code, default=KeyError):
    """ Return "RTL" for scripts that contain right-to-left characters
    according to the Bidi_Class property. Otherwise return "LTR".

    The direction is decided by membership of 'script_code' in RTL_SCRIPTS.

    If 'script_code' is not a key of 'Scripts.NAMES' and 'default' is
    KeyError (or a subclass of it), 'default(script_code)' is raised; this
    is the default behaviour. Any other 'default' is returned as a fallback
    value instead.
    """
    if script_code in Scripts.NAMES:
        direction = "RTL" if script_code in RTL_SCRIPTS else "LTR"
        return str(direction)

    wants_error = isinstance(default, type) and issubclass(default, KeyError)
    if wants_error:
        raise default(script_code)
    return default
206
207
def block(char):
    """ Return the block property assigned to the Unicode character 'char'
    as a string.

    The name is looked up in the parallel tables 'Blocks.RANGES' and
    'Blocks.VALUES'.

    >>> block("a")
    'Basic Latin'
    >>> block(unichr(0x060C))
    'Arabic'
    >>> block(unichr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # the entry preceding the insertion point returned by 'bisect_right'
    # is the last range start that is <= codepoint
    position = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[position]
222
223
def ot_tags_from_script(script_code):
    """ Return a list of OpenType script tags associated with a given
    Unicode script code.

    The base tag is the entry in 'OTTags.SCRIPT_EXCEPTIONS' if there is one,
    otherwise the script code with its first letter lowercased. Tags listed
    in 'OTTags.NEW_SCRIPT_TAGS' for the script are placed before the base
    tag, in reverse order of their listing.

    Return ['DFLT'] script tag for invalid/unknown script codes.
    """
    if script_code not in Scripts.NAMES:
        return [OTTags.DEFAULT_SCRIPT]

    lowercased_code = script_code[0].lower() + script_code[1:]
    base_tag = OTTags.SCRIPT_EXCEPTIONS.get(script_code, lowercased_code)

    if script_code not in OTTags.NEW_SCRIPT_TAGS:
        return [base_tag]

    ordered = [base_tag] + list(OTTags.NEW_SCRIPT_TAGS[script_code])
    # last in, first out
    return ordered[::-1]
243
244
def ot_tag_to_script(tag):
    """ Return the Unicode script code for the given OpenType script tag, or
    None for "DFLT" tag or if there is no Unicode script associated with it.

    Surrounding whitespace is stripped from the tag, and tags shorter than
    four characters are padded with trailing spaces before the lookup.

    Raises ValueError if the tag is invalid, i.e. it is empty after
    stripping, contains a space, or is longer than four characters.
    """
    tag = tostr(tag).strip()
    if len(tag) == 0 or len(tag) > 4 or " " in tag:
        raise ValueError("invalid OpenType tag: %r" % tag)

    # pad with spaces up to four characters
    tag = tag.ljust(4)

    if tag == OTTags.DEFAULT_SCRIPT:
        # it's unclear which Unicode script the "DFLT" OpenType tag maps to,
        # so here we return None
        return None

    if tag in OTTags.NEW_SCRIPT_TAGS_REVERSED:
        return OTTags.NEW_SCRIPT_TAGS_REVERSED[tag]

    # From here on the conversion is fully algorithmic: the first character
    # is changed to uppercase, and any spaces at the end of the tag are
    # replaced by repeating the last letter. Eg 'nko ' -> 'Nkoo'.
    candidate = tag[0].upper() + tag[1]
    for position in (2, 3):
        current = tag[position]
        if current == " ":
            candidate += candidate[position - 1]
        else:
            candidate += current

    return candidate if candidate in Scripts.NAMES else None
277