1 /*
2 *******************************************************************************
3 *   Copyright (C) 2013-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  uscript_props.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2013feb16
12 *   created by: Markus W. Scherer
13 */
14 
15 #include "unicode/utypes.h"
16 #include "unicode/unistr.h"
17 #include "unicode/uscript.h"
18 #include "unicode/utf16.h"
19 #include "ustr_imp.h"
20 #include "cmemory.h"
21 
22 namespace {
23 
24 // Script metadata (script properties).
25 // See http://unicode.org/cldr/trac/browser/trunk/common/properties/scriptMetadata.txt
26 
27 // 0 = NOT_ENCODED, no sample character, default false script properties.
28 // Bits 20.. 0: sample character
29 
30 // Bits 23..21: usage
31 const int32_t UNKNOWN = 1 << 21;
32 const int32_t EXCLUSION = 2 << 21;
33 const int32_t LIMITED_USE = 3 << 21;
34 const int32_t ASPIRATIONAL = 4 << 21;
35 const int32_t RECOMMENDED = 5 << 21;
36 
37 // Bits 31..24: Single-bit flags
38 const int32_t RTL = 1 << 24;
39 const int32_t LB_LETTERS = 1 << 25;
40 const int32_t CASED = 1 << 26;
41 
42 const int32_t SCRIPT_PROPS[] = {
43     // Begin copy-paste output from
44     // tools/trunk/unicode/py/parsescriptmetadata.py
45     0x0040 | RECOMMENDED,  // Zyyy
46     0x0308 | RECOMMENDED,  // Zinh
47     0x0628 | RECOMMENDED | RTL,  // Arab
48     0x0531 | RECOMMENDED | CASED,  // Armn
49     0x0995 | RECOMMENDED,  // Beng
50     0x3105 | RECOMMENDED | LB_LETTERS,  // Bopo
51     0x13C4 | LIMITED_USE | CASED,  // Cher
52     0x03E2 | EXCLUSION | CASED,  // Copt
53     0x042F | RECOMMENDED | CASED,  // Cyrl
54     0x10414 | EXCLUSION | CASED,  // Dsrt
55     0x0905 | RECOMMENDED,  // Deva
56     0x12A0 | RECOMMENDED,  // Ethi
57     0x10D3 | RECOMMENDED,  // Geor
58     0x10330 | EXCLUSION,  // Goth
59     0x03A9 | RECOMMENDED | CASED,  // Grek
60     0x0A95 | RECOMMENDED,  // Gujr
61     0x0A15 | RECOMMENDED,  // Guru
62     0x5B57 | RECOMMENDED | LB_LETTERS,  // Hani
63     0xAC00 | RECOMMENDED,  // Hang
64     0x05D0 | RECOMMENDED | RTL,  // Hebr
65     0x304B | RECOMMENDED | LB_LETTERS,  // Hira
66     0x0C95 | RECOMMENDED,  // Knda
67     0x30AB | RECOMMENDED | LB_LETTERS,  // Kana
68     0x1780 | RECOMMENDED | LB_LETTERS,  // Khmr
69     0x0EA5 | RECOMMENDED | LB_LETTERS,  // Laoo
70     0x004C | RECOMMENDED | CASED,  // Latn
71     0x0D15 | RECOMMENDED,  // Mlym
72     0x1826 | ASPIRATIONAL,  // Mong
73     0x1000 | RECOMMENDED | LB_LETTERS,  // Mymr
74     0x168F | EXCLUSION,  // Ogam
75     0x10308 | EXCLUSION,  // Ital
76     0x0B15 | RECOMMENDED,  // Orya
77     0x16A0 | EXCLUSION,  // Runr
78     0x0D85 | RECOMMENDED,  // Sinh
79     0x0710 | LIMITED_USE | RTL,  // Syrc
80     0x0B95 | RECOMMENDED,  // Taml
81     0x0C15 | RECOMMENDED,  // Telu
82     0x078C | RECOMMENDED | RTL,  // Thaa
83     0x0E17 | RECOMMENDED | LB_LETTERS,  // Thai
84     0x0F40 | RECOMMENDED,  // Tibt
85     0x14C0 | ASPIRATIONAL,  // Cans
86     0xA288 | ASPIRATIONAL | LB_LETTERS,  // Yiii
87     0x1703 | EXCLUSION,  // Tglg
88     0x1723 | EXCLUSION,  // Hano
89     0x1743 | EXCLUSION,  // Buhd
90     0x1763 | EXCLUSION,  // Tagb
91     0x280E | UNKNOWN,  // Brai
92     0x10800 | EXCLUSION | RTL,  // Cprt
93     0x1900 | LIMITED_USE,  // Limb
94     0x10000 | EXCLUSION,  // Linb
95     0x10480 | EXCLUSION,  // Osma
96     0x10450 | EXCLUSION,  // Shaw
97     0x1950 | LIMITED_USE | LB_LETTERS,  // Tale
98     0x10380 | EXCLUSION,  // Ugar
99     0,
100     0x1A00 | EXCLUSION,  // Bugi
101     0x2C00 | EXCLUSION | CASED,  // Glag
102     0x10A00 | EXCLUSION | RTL,  // Khar
103     0xA800 | LIMITED_USE,  // Sylo
104     0x1980 | LIMITED_USE | LB_LETTERS,  // Talu
105     0x2D5E | ASPIRATIONAL,  // Tfng
106     0x103A0 | EXCLUSION,  // Xpeo
107     0x1B05 | LIMITED_USE,  // Bali
108     0x1BC0 | LIMITED_USE,  // Batk
109     0,
110     0x11005 | EXCLUSION,  // Brah
111     0xAA00 | LIMITED_USE,  // Cham
112     0,
113     0,
114     0,
115     0,
116     0x13153 | EXCLUSION,  // Egyp
117     0,
118     0x5B57 | RECOMMENDED | LB_LETTERS,  // Hans
119     0x5B57 | RECOMMENDED | LB_LETTERS,  // Hant
120     0x16B1C | EXCLUSION,  // Hmng
121     0x10CA1 | EXCLUSION | RTL | CASED,  // Hung
122     0,
123     0xA984 | LIMITED_USE,  // Java
124     0xA90A | LIMITED_USE,  // Kali
125     0,
126     0,
127     0x1C00 | LIMITED_USE,  // Lepc
128     0x10647 | EXCLUSION,  // Lina
129     0x0840 | LIMITED_USE | RTL,  // Mand
130     0,
131     0x10980 | EXCLUSION | RTL,  // Mero
132     0x07D8 | LIMITED_USE | RTL,  // Nkoo
133     0x10C00 | EXCLUSION | RTL,  // Orkh
134     0x1036B | EXCLUSION,  // Perm
135     0xA840 | EXCLUSION,  // Phag
136     0x10900 | EXCLUSION | RTL,  // Phnx
137     0x16F00 | ASPIRATIONAL,  // Plrd
138     0,
139     0,
140     0,
141     0,
142     0,
143     0,
144     0xA549 | LIMITED_USE,  // Vaii
145     0,
146     0x12000 | EXCLUSION,  // Xsux
147     0,
148     0xFDD0 | UNKNOWN,  // Zzzz
149     0x102B7 | EXCLUSION,  // Cari
150     0x304B | RECOMMENDED | LB_LETTERS,  // Jpan
151     0x1A20 | LIMITED_USE | LB_LETTERS,  // Lana
152     0x10280 | EXCLUSION,  // Lyci
153     0x10920 | EXCLUSION | RTL,  // Lydi
154     0x1C5A | LIMITED_USE,  // Olck
155     0xA930 | EXCLUSION,  // Rjng
156     0xA882 | LIMITED_USE,  // Saur
157     0x1D850 | EXCLUSION,  // Sgnw
158     0x1B83 | LIMITED_USE,  // Sund
159     0,
160     0xABC0 | LIMITED_USE,  // Mtei
161     0x10840 | EXCLUSION | RTL,  // Armi
162     0x10B00 | EXCLUSION | RTL,  // Avst
163     0x11103 | LIMITED_USE,  // Cakm
164     0xAC00 | RECOMMENDED,  // Kore
165     0x11083 | EXCLUSION,  // Kthi
166     0x10AC1 | EXCLUSION | RTL,  // Mani
167     0x10B60 | EXCLUSION | RTL,  // Phli
168     0x10B8F | EXCLUSION | RTL,  // Phlp
169     0,
170     0x10B40 | EXCLUSION | RTL,  // Prti
171     0x0800 | EXCLUSION | RTL,  // Samr
172     0xAA80 | LIMITED_USE | LB_LETTERS,  // Tavt
173     0,
174     0,
175     0xA6A0 | LIMITED_USE,  // Bamu
176     0xA4E8 | LIMITED_USE,  // Lisu
177     0,
178     0x10A60 | EXCLUSION | RTL,  // Sarb
179     0x16AE6 | EXCLUSION,  // Bass
180     0x1BC20 | EXCLUSION,  // Dupl
181     0x10500 | EXCLUSION,  // Elba
182     0x11315 | EXCLUSION,  // Gran
183     0,
184     0,
185     0x1E802 | EXCLUSION | RTL,  // Mend
186     0x109A0 | EXCLUSION | RTL,  // Merc
187     0x10A95 | EXCLUSION | RTL,  // Narb
188     0x10896 | EXCLUSION | RTL,  // Nbat
189     0x10873 | EXCLUSION | RTL,  // Palm
190     0x112BE | EXCLUSION,  // Sind
191     0x118B4 | EXCLUSION | CASED,  // Wara
192     0,
193     0,
194     0x16A4F | EXCLUSION,  // Mroo
195     0,
196     0x11183 | EXCLUSION,  // Shrd
197     0x110D0 | EXCLUSION,  // Sora
198     0x11680 | EXCLUSION,  // Takr
199     0,
200     0,
201     0x14400 | EXCLUSION,  // Hluw
202     0x11208 | EXCLUSION,  // Khoj
203     0x11484 | EXCLUSION,  // Tirh
204     0x10537 | EXCLUSION,  // Aghb
205     0x11152 | EXCLUSION,  // Mahj
206     0x11717 | EXCLUSION | LB_LETTERS,  // Ahom
207     0x108F4 | EXCLUSION | RTL,  // Hatr
208     0x1160E | EXCLUSION,  // Modi
209     0x1128F | EXCLUSION,  // Mult
210     0x11AC0 | EXCLUSION,  // Pauc
211     0x1158E | EXCLUSION,  // Sidd
212     // End copy-paste from parsescriptmetadata.py
213 };
214 
getScriptProps(UScriptCode script)215 int32_t getScriptProps(UScriptCode script) {
216     if (0 <= script && script < UPRV_LENGTHOF(SCRIPT_PROPS)) {
217         return SCRIPT_PROPS[script];
218     } else {
219         return 0;
220     }
221 }
222 
223 }  // namespace
224 
225 U_CAPI int32_t U_EXPORT2
uscript_getSampleString(UScriptCode script,UChar * dest,int32_t capacity,UErrorCode * pErrorCode)226 uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode) {
227     if(U_FAILURE(*pErrorCode)) { return 0; }
228     if(capacity < 0 || (capacity > 0 && dest == NULL)) {
229         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
230         return 0;
231     }
232     int32_t sampleChar = getScriptProps(script) & 0x1fffff;
233     int32_t length;
234     if(sampleChar == 0) {
235         length = 0;
236     } else {
237         length = U16_LENGTH(sampleChar);
238         if(length <= capacity) {
239             int32_t i = 0;
240             U16_APPEND_UNSAFE(dest, i, sampleChar);
241         }
242     }
243     return u_terminateUChars(dest, capacity, length, pErrorCode);
244 }
245 
246 U_COMMON_API icu::UnicodeString U_EXPORT2
uscript_getSampleUnicodeString(UScriptCode script)247 uscript_getSampleUnicodeString(UScriptCode script) {
248     icu::UnicodeString sample;
249     int32_t sampleChar = getScriptProps(script) & 0x1fffff;
250     if(sampleChar != 0) {
251         sample.append(sampleChar);
252     }
253     return sample;
254 }
255 
256 U_CAPI UScriptUsage U_EXPORT2
uscript_getUsage(UScriptCode script)257 uscript_getUsage(UScriptCode script) {
258     return (UScriptUsage)((getScriptProps(script) >> 21) & 7);
259 }
260 
261 U_CAPI UBool U_EXPORT2
uscript_isRightToLeft(UScriptCode script)262 uscript_isRightToLeft(UScriptCode script) {
263     return (getScriptProps(script) & RTL) != 0;
264 }
265 
266 U_CAPI UBool U_EXPORT2
uscript_breaksBetweenLetters(UScriptCode script)267 uscript_breaksBetweenLetters(UScriptCode script) {
268     return (getScriptProps(script) & LB_LETTERS) != 0;
269 }
270 
271 U_CAPI UBool U_EXPORT2
uscript_isCased(UScriptCode script)272 uscript_isCased(UScriptCode script) {
273     return (getScriptProps(script) & CASED) != 0;
274 }
275