1 /*
2  * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net>
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <stdint.h>
20 #include "ucdn.h"
21 
/*
 * Per-code-point property record. get_ucd_record() returns a pointer to one
 * of these out of the ucd_records table; the public ucdn_get_*() accessors
 * each return one field. The field order and types must match the
 * initialisers of the table that uses this type.
 */
typedef struct {
    unsigned char category;         /* returned by ucdn_get_general_category() */
    unsigned char combining;        /* returned by ucdn_get_combining_class() */
    unsigned char bidi_class;       /* returned by ucdn_get_bidi_class() */
    unsigned char east_asian_width; /* returned by ucdn_get_east_asian_width() */
    unsigned char script;           /* returned by ucdn_get_script() */
    unsigned char linebreak_class;  /* returned by ucdn_get_linebreak_class() */
} UCDRecord;
30 
/*
 * One entry of the mirror_pairs table: ucdn_mirror() looks up `from` and
 * returns `to`. Both members are 16 bits wide, so only code points up to
 * 0xFFFF can be represented.
 */
typedef struct {
    unsigned short from; /* lookup key (compared by compare_mp) */
    unsigned short to;   /* value returned for that key */
} MirrorPair;
34 
/*
 * One entry of the bracket_pairs table, searched by search_bp().
 * `from` and `to` are 16 bits wide, so only code points up to 0xFFFF can be
 * represented.
 */
typedef struct {
    unsigned short from; /* lookup key (compared by compare_bp) */
    unsigned short to;   /* value returned by ucdn_paired_bracket() */
    unsigned char type;  /* value returned by ucdn_paired_bracket_type() */
} BracketPair;
39 
/*
 * Maps a run of consecutive code points onto a run of compact indices.
 * compare_reindex() treats start .. start + count (inclusive) as the run, and
 * get_comp_index() returns index + (code - start) for a code point inside it.
 */
typedef struct {
    unsigned int start; /* first code point of the run */
    short count;        /* number of further code points after `start` */
    short index;        /* compact index assigned to `start` */
} Reindex;
44 
45 #include "ucdn_db.h"
46 
/*
 * Constants required for Hangul (de)composition, used by
 * hangul_pair_decompose() and hangul_pair_compose().
 *
 * SBASE / SCOUNT  first code point and size of the syllable range that is
 *                 decomposed arithmetically
 * LBASE / LCOUNT  base and count of the first ("L") component
 * VBASE / VCOUNT  base and count of the second ("V") component
 * TBASE / TCOUNT  base and count of the trailing ("T") component; offset 0
 *                 from TBASE stands for "no trailing component" (a syllable
 *                 offset that is a multiple of TCOUNT decomposes to L,V only)
 * NCOUNT          number of syllables sharing one L component
 */
#define SBASE 0xAC00
#define LBASE 0x1100
#define VBASE 0x1161
#define TBASE 0x11A7
#define SCOUNT 11172
#define LCOUNT 19
#define VCOUNT 21
#define TCOUNT 28
#define NCOUNT (VCOUNT * TCOUNT)
57 
get_ucd_record(uint32_t code)58 static const UCDRecord *get_ucd_record(uint32_t code)
59 {
60     int index, offset;
61 
62     if (code >= 0x110000)
63         index = 0;
64     else {
65         index  = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1;
66         offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1);
67         index  = index1[index + offset] << SHIFT2;
68         offset = code & ((1<<SHIFT2) - 1);
69         index  = index2[index + offset];
70     }
71 
72     return &ucd_records[index];
73 }
74 
get_decomp_record(uint32_t code)75 static const unsigned short *get_decomp_record(uint32_t code)
76 {
77     int index, offset;
78 
79     if (code >= 0x110000)
80         index = 0;
81     else {
82         index  = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)]
83             << DECOMP_SHIFT1;
84         offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1);
85         index  = decomp_index1[index + offset] << DECOMP_SHIFT2;
86         offset = code & ((1<<DECOMP_SHIFT2) - 1);
87         index  = decomp_index2[index + offset];
88     }
89 
90     return &decomp_data[index];
91 }
92 
compare_reindex(const void * a,const void * b)93 static int compare_reindex(const void *a, const void *b)
94 {
95     Reindex *ra = (Reindex *)a;
96     Reindex *rb = (Reindex *)b;
97 
98     if (ra->start < rb->start)
99         return -1;
100     else if (ra->start > (rb->start + rb->count))
101         return 1;
102     else
103         return 0;
104 }
105 
get_comp_index(uint32_t code,const Reindex * idx,size_t len)106 static int get_comp_index(uint32_t code, const Reindex *idx, size_t len)
107 {
108     Reindex *res;
109     Reindex r = {0, 0, 0};
110     r.start = code;
111     res = (Reindex *) bsearch(&r, idx, len, sizeof(Reindex), compare_reindex);
112 
113     if (res != NULL)
114         return res->index + (code - res->start);
115     else
116         return -1;
117 }
118 
compare_mp(const void * a,const void * b)119 static int compare_mp(const void *a, const void *b)
120 {
121     MirrorPair *mpa = (MirrorPair *)a;
122     MirrorPair *mpb = (MirrorPair *)b;
123     return mpa->from - mpb->from;
124 }
125 
compare_bp(const void * a,const void * b)126 static int compare_bp(const void *a, const void *b)
127 {
128     BracketPair *bpa = (BracketPair *)a;
129     BracketPair *bpb = (BracketPair *)b;
130     return bpa->from - bpb->from;
131 }
132 
search_bp(uint32_t code)133 static BracketPair *search_bp(uint32_t code)
134 {
135     BracketPair bp = {0,0,2};
136     BracketPair *res;
137 
138     bp.from = code;
139     res = (BracketPair *) bsearch(&bp, bracket_pairs, BIDI_BRACKET_LEN,
140                                  sizeof(BracketPair), compare_bp);
141     return res;
142 }
143 
hangul_pair_decompose(uint32_t code,uint32_t * a,uint32_t * b)144 static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b)
145 {
146     int si = code - SBASE;
147 
148     if (si < 0 || si >= SCOUNT)
149         return 0;
150 
151     if (si % TCOUNT) {
152         /* LV,T */
153         *a = SBASE + (si / TCOUNT) * TCOUNT;
154         *b = TBASE + (si % TCOUNT);
155         return 3;
156     } else {
157         /* L,V */
158         *a = LBASE + (si / NCOUNT);
159         *b = VBASE + (si % NCOUNT) / TCOUNT;
160         return 2;
161     }
162 }
163 
hangul_pair_compose(uint32_t * code,uint32_t a,uint32_t b)164 static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b)
165 {
166     if (a >= SBASE && a < (SBASE + SCOUNT) && b >= TBASE && b < (TBASE + TCOUNT)) {
167         /* LV,T */
168         *code = a + (b - TBASE);
169         return 3;
170     } else if (a >= LBASE && a < (LBASE + LCOUNT) && b >= VBASE && b < (VBASE + VCOUNT)) {
171         /* L,V */
172         int li = a - LBASE;
173         int vi = b - VBASE;
174         *code = SBASE + li * NCOUNT + vi * TCOUNT;
175         return 2;
176     } else {
177         return 0;
178     }
179 }
180 
/*
 * Decode one code point from a UTF-16 sequence and advance the cursor.
 *
 * code_ptr: in/out cursor into an array of 16-bit units. On return it has
 *           been advanced by one unit (single unit) or two units (surrogate
 *           pair).
 *
 * Returns the decoded code point.
 *
 * Only units in 0xD800 .. 0xDBFF are lead surrogates that start a pair.
 * Everything else, including a stray trail surrogate such as 0xDC00, is
 * returned as a single unit. The second unit of a pair is not validated.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
    const unsigned short *code = *code_ptr;

    if (code[0] < 0xd800 || code[0] > 0xdbff) {
        *code_ptr += 1;
        return (uint32_t)code[0];
    } else {
        *code_ptr += 2;
        return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
            (((uint32_t)code[0] - 0xd800) << 10);
    }
}
194 
/*
 * Return the UNIDATA_VERSION string this database was built with.
 * The macro is not defined in this file; it presumably comes from the
 * generated database header.
 */
const char *ucdn_get_unicode_version(void)
{
    return UNIDATA_VERSION;
}
199 
ucdn_get_combining_class(uint32_t code)200 int ucdn_get_combining_class(uint32_t code)
201 {
202     return get_ucd_record(code)->combining;
203 }
204 
ucdn_get_east_asian_width(uint32_t code)205 int ucdn_get_east_asian_width(uint32_t code)
206 {
207     return get_ucd_record(code)->east_asian_width;
208 }
209 
ucdn_get_general_category(uint32_t code)210 int ucdn_get_general_category(uint32_t code)
211 {
212     return get_ucd_record(code)->category;
213 }
214 
ucdn_get_bidi_class(uint32_t code)215 int ucdn_get_bidi_class(uint32_t code)
216 {
217     return get_ucd_record(code)->bidi_class;
218 }
219 
/*
 * Report whether a code point has a mirror image distinct from itself.
 * Returns 1 when ucdn_mirror() maps the code point to a different code
 * point, 0 otherwise.
 */
int ucdn_get_mirrored(uint32_t code)
{
    uint32_t mirrored = ucdn_mirror(code);

    return mirrored != code;
}
224 
ucdn_get_script(uint32_t code)225 int ucdn_get_script(uint32_t code)
226 {
227     return get_ucd_record(code)->script;
228 }
229 
ucdn_get_linebreak_class(uint32_t code)230 int ucdn_get_linebreak_class(uint32_t code)
231 {
232     return get_ucd_record(code)->linebreak_class;
233 }
234 
ucdn_get_resolved_linebreak_class(uint32_t code)235 int ucdn_get_resolved_linebreak_class(uint32_t code)
236 {
237     const UCDRecord *record = get_ucd_record(code);
238 
239     switch (record->linebreak_class)
240     {
241     case UCDN_LINEBREAK_CLASS_AI:
242     case UCDN_LINEBREAK_CLASS_SG:
243     case UCDN_LINEBREAK_CLASS_XX:
244         return UCDN_LINEBREAK_CLASS_AL;
245 
246     case UCDN_LINEBREAK_CLASS_SA:
247         if (record->category == UCDN_GENERAL_CATEGORY_MC ||
248                 record->category == UCDN_GENERAL_CATEGORY_MN)
249             return UCDN_LINEBREAK_CLASS_CM;
250         return UCDN_LINEBREAK_CLASS_AL;
251 
252     case UCDN_LINEBREAK_CLASS_CJ:
253         return UCDN_LINEBREAK_CLASS_NS;
254 
255     case UCDN_LINEBREAK_CLASS_CB:
256         return UCDN_LINEBREAK_CLASS_B2;
257 
258     case UCDN_LINEBREAK_CLASS_NL:
259         return UCDN_LINEBREAK_CLASS_BK;
260 
261     default:
262         return record->linebreak_class;
263     }
264 }
265 
ucdn_mirror(uint32_t code)266 uint32_t ucdn_mirror(uint32_t code)
267 {
268     MirrorPair mp = {0};
269     MirrorPair *res;
270 
271     mp.from = code;
272     res = (MirrorPair *) bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN,
273                                 sizeof(MirrorPair), compare_mp);
274 
275     if (res == NULL)
276         return code;
277     else
278         return res->to;
279 }
280 
ucdn_paired_bracket(uint32_t code)281 uint32_t ucdn_paired_bracket(uint32_t code)
282 {
283     BracketPair *res = search_bp(code);
284     if (res == NULL)
285         return code;
286     else
287         return res->to;
288 }
289 
ucdn_paired_bracket_type(uint32_t code)290 int ucdn_paired_bracket_type(uint32_t code)
291 {
292     BracketPair *res = search_bp(code);
293     if (res == NULL)
294         return UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE;
295     else
296         return res->type;
297 }
298 
/*
 * Decompose a code point into a pair.
 *
 * code: code point to decompose
 * a, b: receive the result; b is set to 0 when the decomposition has a
 *       single element. Both are left untouched when 0 is returned.
 *
 * Hangul syllables are handled arithmetically first. Otherwise the
 * decomposition record is consulted: its first unit holds the element count
 * in the high byte, and the record is only used when the low byte is zero.
 * NOTE(review): a non-zero low byte presumably marks a compatibility
 * mapping - confirm against the table generator.
 *
 * Returns 1 on success, 0 when the code point has no such decomposition.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
    const unsigned short *cursor;
    unsigned short header;
    int count;

    if (hangul_pair_decompose(code, a, b))
        return 1;

    cursor = get_decomp_record(code);
    header = cursor[0];
    count  = header >> 8;

    if (count == 0 || (header & 0xff) != 0)
        return 0;

    /* the UTF-16 encoded elements follow the header unit */
    cursor++;
    *a = decode_utf16(&cursor);
    *b = (count > 1) ? decode_utf16(&cursor) : 0;

    return 1;
}
322 
ucdn_compose(uint32_t * code,uint32_t a,uint32_t b)323 int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
324 {
325     int l, r, index, indexi, offset;
326 
327     if (hangul_pair_compose(code, a, b))
328         return 1;
329 
330     l = get_comp_index(a, nfc_first, sizeof(nfc_first) / sizeof(Reindex));
331     r = get_comp_index(b, nfc_last, sizeof(nfc_last) / sizeof(Reindex));
332 
333     if (l < 0 || r < 0)
334         return 0;
335 
336     indexi = l * TOTAL_LAST + r;
337     index  = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1;
338     offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1);
339     index  = comp_index1[index + offset] << COMP_SHIFT2;
340     offset = indexi & ((1<<COMP_SHIFT2) - 1);
341     *code  = comp_data[index + offset];
342 
343     return *code != 0;
344 }
345 
/*
 * Write the full decomposition stored for a code point.
 *
 * code:       code point to decompose
 * decomposed: output buffer; the caller must provide room for every element
 *             of the stored decomposition (this function does not know the
 *             buffer's size)
 *
 * The element count is the high byte of the record's first unit; the low
 * byte is not examined here. Hangul syllables are not handled
 * arithmetically by this function.
 *
 * Returns the number of code points written, 0 when the record is empty.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
    const unsigned short *cursor = get_decomp_record(code);
    int count = cursor[0] >> 8;
    int n;

    /* the UTF-16 encoded elements follow the header unit */
    cursor++;
    for (n = 0; n < count; n++)
        decomposed[n] = decode_utf16(&cursor);

    return count;
}
361