• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2   * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net>
3   *
4   * Permission to use, copy, modify, and/or distribute this software for any
5   * purpose with or without fee is hereby granted, provided that the above
6   * copyright notice and this permission notice appear in all copies.
7   *
8   * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9   * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10   * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11   * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15   */
16  
17  #include <stdio.h>
18  #include <stdlib.h>
19  #include <stdint.h>
20  #include "ucdn.h"
21  
/* Per-code-point property record, as stored in the ucd_records table. */
typedef struct {
    unsigned char category;         /* general category value */
    unsigned char combining;        /* combining class value */
    unsigned char bidi_class;       /* bidirectional class value */
    unsigned char mirrored;         /* non-zero when the code point is mirrored */
    unsigned char east_asian_width; /* East Asian width value */
    unsigned char script;           /* script value */
    unsigned char linebreak_class;  /* unresolved line-break class value */
} UCDRecord;
31  
/* One entry of the mirror table: a code point and its mirrored counterpart. */
typedef struct {
    unsigned short from;    /* search key (16-bit code point) */
    unsigned short to;      /* mirrored code point */
} MirrorPair;
35  
/* One entry of the paired-bracket table. */
typedef struct {
    unsigned short from;    /* search key (16-bit code point) */
    unsigned short to;      /* the bracket paired with `from` */
    unsigned char type;     /* paired bracket type value */
} BracketPair;
40  
/*
 * Maps a contiguous run of code points onto composition-table indices.
 * The run covers start .. start + count (inclusive); the code point `start`
 * maps to `index`, the following ones to index + 1, index + 2, ...
 */
typedef struct {
    unsigned int start;     /* first code point of the run; 0 ends a table */
    short count;            /* number of code points following `start` */
    short index;            /* index assigned to `start` */
} Reindex;
45  
46  #include "unicodedata_db.h"
47  
/*
 * Constants required for algorithmic Hangul (de)composition.
 *
 * xBASE values are code points, xCOUNT values are range sizes.
 */
#define SBASE 0xAC00                /* first precomposed syllable */
#define LBASE 0x1100                /* first leading consonant */
#define VBASE 0x1161                /* first vowel */
#define TBASE 0x11A7                /* one below the first trailing consonant */
#define SCOUNT 11172                /* precomposed syllables */
#define LCOUNT 19                   /* leading consonants */
#define VCOUNT 21                   /* vowels */
#define TCOUNT 28                   /* trailing slots, index 0 = no trailing */
#define NCOUNT (VCOUNT * TCOUNT)    /* syllables per leading consonant */
58  
get_ucd_record(uint32_t code)59  static const UCDRecord *get_ucd_record(uint32_t code)
60  {
61      int index, offset;
62  
63      if (code >= 0x110000)
64          index = 0;
65      else {
66          index  = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1;
67          offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1);
68          index  = index1[index + offset] << SHIFT2;
69          offset = code & ((1<<SHIFT2) - 1);
70          index  = index2[index + offset];
71      }
72  
73      return &ucd_records[index];
74  }
75  
get_decomp_record(uint32_t code)76  static const unsigned short *get_decomp_record(uint32_t code)
77  {
78      int index, offset;
79  
80      if (code >= 0x110000)
81          index = 0;
82      else {
83          index  = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)]
84              << DECOMP_SHIFT1;
85          offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1);
86          index  = decomp_index1[index + offset] << DECOMP_SHIFT2;
87          offset = code & ((1<<DECOMP_SHIFT2) - 1);
88          index  = decomp_index2[index + offset];
89      }
90  
91      return &decomp_data[index];
92  }
93  
get_comp_index(uint32_t code,const Reindex * idx)94  static int get_comp_index(uint32_t code, const Reindex *idx)
95  {
96      int i;
97  
98      for (i = 0; idx[i].start; i++) {
99          const Reindex *cur = &idx[i];
100          if (code < cur->start)
101              return -1;
102          if (code <= cur->start + cur->count) {
103              return cur->index + (code - cur->start);
104          }
105      }
106  
107      return -1;
108  }
109  
compare_mp(const void * a,const void * b)110  static int compare_mp(const void *a, const void *b)
111  {
112      MirrorPair *mpa = (MirrorPair *)a;
113      MirrorPair *mpb = (MirrorPair *)b;
114      return mpa->from - mpb->from;
115  }
116  
compare_bp(const void * a,const void * b)117  static int compare_bp(const void *a, const void *b)
118  {
119      BracketPair *bpa = (BracketPair *)a;
120      BracketPair *bpb = (BracketPair *)b;
121      return bpa->from - bpb->from;
122  }
123  
search_bp(uint32_t code)124  static BracketPair *search_bp(uint32_t code)
125  {
126      BracketPair bp = {0,0,2};
127      BracketPair *res;
128  
129      bp.from = code;
130      res = bsearch(&bp, bracket_pairs, BIDI_BRACKET_LEN, sizeof(BracketPair),
131              compare_bp);
132      return res;
133  }
134  
hangul_pair_decompose(uint32_t code,uint32_t * a,uint32_t * b)135  static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b)
136  {
137      int si = code - SBASE;
138  
139      if (si < 0 || si >= SCOUNT)
140          return 0;
141  
142      if (si % TCOUNT) {
143          /* LV,T */
144          *a = SBASE + (si / TCOUNT) * TCOUNT;
145          *b = TBASE + (si % TCOUNT);
146          return 3;
147      } else {
148          /* L,V */
149          *a = LBASE + (si / NCOUNT);
150          *b = VBASE + (si % NCOUNT) / TCOUNT;
151          return 2;
152      }
153  }
154  
hangul_pair_compose(uint32_t * code,uint32_t a,uint32_t b)155  static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b)
156  {
157      if (b < VBASE || b >= (TBASE + TCOUNT))
158          return 0;
159  
160      if ((a < LBASE || a >= (LBASE + LCOUNT))
161              && (a < SBASE || a >= (SBASE + SCOUNT)))
162          return 0;
163  
164      if (a >= SBASE) {
165          /* LV,T */
166          *code = a + (b - TBASE);
167          return 3;
168      } else {
169          /* L,V */
170          int li = a - LBASE;
171          int vi = b - VBASE;
172          *code = SBASE + li * NCOUNT + vi * TCOUNT;
173          return 2;
174      }
175  }
176  
/*
 * Decode one code point from a UTF-16 sequence and advance *code_ptr past
 * the units that were consumed.
 *
 * A unit in 0xD800..0xDBFF (high surrogate) is combined with the unit that
 * follows it and two units are consumed; the following unit is not
 * validated. Every other unit is returned as-is and one unit is consumed.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
    const unsigned short *code = *code_ptr;
    uint32_t lead = code[0];

    /*
     * Mask with 0xfc00 so that only 0xD800..0xDBFF matches. Testing against
     * 0xd800 alone ignores bit 13 and therefore also matches 0xF800..0xFFFF,
     * which are ordinary BMP code units.
     */
    if ((lead & 0xfc00) != 0xd800) {
        *code_ptr += 1;
        return lead;
    }

    *code_ptr += 2;
    return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
        ((lead - 0xd800) << 10);
}
190  
ucdn_get_unicode_version(void)191  const char *ucdn_get_unicode_version(void)
192  {
193      return UNIDATA_VERSION;
194  }
195  
ucdn_get_combining_class(uint32_t code)196  int ucdn_get_combining_class(uint32_t code)
197  {
198      return get_ucd_record(code)->combining;
199  }
200  
ucdn_get_east_asian_width(uint32_t code)201  int ucdn_get_east_asian_width(uint32_t code)
202  {
203      return get_ucd_record(code)->east_asian_width;
204  }
205  
ucdn_get_general_category(uint32_t code)206  int ucdn_get_general_category(uint32_t code)
207  {
208      return get_ucd_record(code)->category;
209  }
210  
ucdn_get_bidi_class(uint32_t code)211  int ucdn_get_bidi_class(uint32_t code)
212  {
213      return get_ucd_record(code)->bidi_class;
214  }
215  
ucdn_get_mirrored(uint32_t code)216  int ucdn_get_mirrored(uint32_t code)
217  {
218      return get_ucd_record(code)->mirrored;
219  }
220  
ucdn_get_script(uint32_t code)221  int ucdn_get_script(uint32_t code)
222  {
223      return get_ucd_record(code)->script;
224  }
225  
ucdn_get_linebreak_class(uint32_t code)226  int ucdn_get_linebreak_class(uint32_t code)
227  {
228      return get_ucd_record(code)->linebreak_class;
229  }
230  
ucdn_get_resolved_linebreak_class(uint32_t code)231  int ucdn_get_resolved_linebreak_class(uint32_t code)
232  {
233      const UCDRecord *record = get_ucd_record(code);
234  
235      switch (record->linebreak_class)
236      {
237      case UCDN_LINEBREAK_CLASS_AI:
238      case UCDN_LINEBREAK_CLASS_SG:
239      case UCDN_LINEBREAK_CLASS_XX:
240          return UCDN_LINEBREAK_CLASS_AL;
241  
242      case UCDN_LINEBREAK_CLASS_SA:
243          if (record->category == UCDN_GENERAL_CATEGORY_MC ||
244                  record->category == UCDN_GENERAL_CATEGORY_MN)
245              return UCDN_LINEBREAK_CLASS_CM;
246          return UCDN_LINEBREAK_CLASS_AL;
247  
248      case UCDN_LINEBREAK_CLASS_CJ:
249          return UCDN_LINEBREAK_CLASS_NS;
250  
251      case UCDN_LINEBREAK_CLASS_CB:
252          return UCDN_LINEBREAK_CLASS_B2;
253  
254      case UCDN_LINEBREAK_CLASS_NL:
255          return UCDN_LINEBREAK_CLASS_BK;
256  
257      default:
258          return record->linebreak_class;
259      }
260  }
261  
ucdn_mirror(uint32_t code)262  uint32_t ucdn_mirror(uint32_t code)
263  {
264      MirrorPair mp = {0};
265      MirrorPair *res;
266  
267      if (get_ucd_record(code)->mirrored == 0)
268          return code;
269  
270      mp.from = code;
271      res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair),
272              compare_mp);
273  
274      if (res == NULL)
275          return code;
276      else
277          return res->to;
278  }
279  
ucdn_paired_bracket(uint32_t code)280  uint32_t ucdn_paired_bracket(uint32_t code)
281  {
282      BracketPair *res = search_bp(code);
283      if (res == NULL)
284          return code;
285      else
286          return res->to;
287  }
288  
ucdn_paired_bracket_type(uint32_t code)289  int ucdn_paired_bracket_type(uint32_t code)
290  {
291      BracketPair *res = search_bp(code);
292      if (res == NULL)
293          return UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE;
294      else
295          return res->type;
296  }
297  
/*
 * Decompose code into at most two code points.
 *
 * Hangul syllables are handled algorithmically first. Otherwise the
 * decomposition record is consulted: its first unit holds the length in the
 * high byte and a tag in the low byte. Records with a zero length or a
 * non-zero tag are not decomposed here (the tag presumably marks
 * compatibility mappings -- confirm against the table generator).
 *
 * Returns 1 and fills *a and *b on success (*b is 0 for a single-element
 * decomposition), 0 when code does not decompose.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
    const unsigned short *entry;
    unsigned short header;
    int count;

    if (hangul_pair_decompose(code, a, b))
        return 1;

    entry = get_decomp_record(code);
    header = entry[0];
    count = header >> 8;

    if (count == 0 || (header & 0xff) != 0)
        return 0;

    /* skip the header unit; the code points follow as UTF-16 */
    entry++;
    *a = decode_utf16(&entry);
    *b = (count > 1) ? decode_utf16(&entry) : 0;

    return 1;
}
321  
ucdn_compose(uint32_t * code,uint32_t a,uint32_t b)322  int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
323  {
324      int l, r, index, indexi, offset;
325  
326      if (hangul_pair_compose(code, a, b))
327          return 1;
328  
329      l = get_comp_index(a, nfc_first);
330      r = get_comp_index(b, nfc_last);
331  
332      if (l < 0 || r < 0)
333          return 0;
334  
335      indexi = l * TOTAL_LAST + r;
336      index  = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1;
337      offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1);
338      index  = comp_index1[index + offset] << COMP_SHIFT2;
339      offset = indexi & ((1<<COMP_SHIFT2) - 1);
340      *code  = comp_data[index + offset];
341  
342      return *code != 0;
343  }
344  
/*
 * Write the full decomposition sequence stored for code into decomposed.
 *
 * The record's first unit carries the number of code points in its high
 * byte; the code points follow as UTF-16. Unlike ucdn_decompose(), the low
 * byte of that unit is not inspected here.
 *
 * Returns the number of code points written, or 0 when the record is empty
 * (decomposed is not touched in that case). The caller must supply a buffer
 * large enough for the returned count; the count comes from an 8-bit field.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
    const unsigned short *entry = get_decomp_record(code);
    int count = entry[0] >> 8;
    int written = 0;

    if (count == 0)
        return 0;

    /* skip the header unit */
    entry++;

    while (written < count) {
        decomposed[written] = decode_utf16(&entry);
        written++;
    }

    return count;
}
360