1 /* ------------------------------------------------------------------------
2 
3    unicodedata -- Provides access to the Unicode 5.2 data base.
4 
5    Data was extracted from the Unicode 5.2 UnicodeData.txt file.
6 
7    Written by Marc-Andre Lemburg (mal@lemburg.com).
8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)
10 
11    Copyright (c) Corporation for National Research Initiatives.
12 
13    ------------------------------------------------------------------------ */
14 
15 #include "Python.h"
16 #include "ucnhash.h"
17 #include "structmember.h"
18 
19 /* character properties */
20 
/* Per-code-point property record; _getrecord_ex() hands out pointers to
   entries of this type from the _PyUnicode_Database_Records table. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
32 
/* Differences between the current database and a previous database
   version for one code point, as returned by the getrecord callback of
   a PreviousDBVersion.  The readers in this file treat 0xFF in the
   unsigned char fields as "unchanged" and category_changed == 0 as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
41 
42 /* data file generated by Tools/unicode/makeunicodedata.py */
43 #include "unicodedata_db.h"
44 
45 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)46 _getrecord_ex(Py_UCS4 code)
47 {
48     int index;
49     if (code >= 0x110000)
50         index = 0;
51     else {
52         index = index1[(code>>SHIFT)];
53         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54     }
55 
56     return &_PyUnicode_Database_Records[index];
57 }
58 
59 /* ------------- Previous-version API ------------------------------------- */
/* Object giving access to a previous database version.  The functions in
   this file receive it as `self` and consult its callbacks for
   per-character differences from the current database. */
typedef struct previous_version {
    PyObject_HEAD
    /* exposed read-only to Python as "unidata_version" (see DB_members) */
    const char *name;
    /* returns the change_record for a code point */
    const change_record* (*getrecord)(Py_UCS4);
    /* a nonzero result replaces the character during decomposition
       (see nfd_nfkd) */
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
66 
/* Fetch the change_record for code point v through the getrecord
   callback of self; self is cast to PreviousDBVersion* without a check. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
68 
/* Member table exposing PreviousDBVersion.name to Python as the
   read-only string attribute "unidata_version". */
static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};
73 
74 /* forward declaration */
75 static PyTypeObject UCD_Type;
76 
77 static PyObject*
new_previous_version(const char * name,const change_record * (* getrecord)(Py_UCS4),Py_UCS4 (* normalization)(Py_UCS4))78 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
79                      Py_UCS4 (*normalization)(Py_UCS4))
80 {
81         PreviousDBVersion *self;
82         self = PyObject_New(PreviousDBVersion, &UCD_Type);
83         if (self == NULL)
84                 return NULL;
85         self->name = name;
86         self->getrecord = getrecord;
87         self->normalization = normalization;
88         return (PyObject*)self;
89 }
90 
91 
getuchar(PyUnicodeObject * obj)92 static Py_UCS4 getuchar(PyUnicodeObject *obj)
93 {
94     Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
95 
96     if (PyUnicode_GET_SIZE(obj) == 1)
97         return *v;
98 #ifndef Py_UNICODE_WIDE
99     else if ((PyUnicode_GET_SIZE(obj) == 2) &&
100              (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
101              (0xDC00 <= v[1] && v[1] <= 0xDFFF))
102         return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
103 #endif
104     PyErr_SetString(PyExc_TypeError,
105                     "need a single Unicode character as parameter");
106     return (Py_UCS4)-1;
107 }
108 
109 /* --- Module API --------------------------------------------------------- */
110 
111 PyDoc_STRVAR(unicodedata_decimal__doc__,
112 "decimal(unichr[, default])\n\
113 \n\
114 Returns the decimal value assigned to the Unicode character unichr\n\
115 as integer. If no such value is defined, default is returned, or, if\n\
116 not given, ValueError is raised.");
117 
118 static PyObject *
unicodedata_decimal(PyObject * self,PyObject * args)119 unicodedata_decimal(PyObject *self, PyObject *args)
120 {
121     PyUnicodeObject *v;
122     PyObject *defobj = NULL;
123     int have_old = 0;
124     long rc;
125     Py_UCS4 c;
126 
127     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
128         return NULL;
129     c = getuchar(v);
130     if (c == (Py_UCS4)-1)
131         return NULL;
132 
133     if (self) {
134         const change_record *old = get_old_record(self, c);
135         if (old->category_changed == 0) {
136             /* unassigned */
137             have_old = 1;
138             rc = -1;
139         }
140         else if (old->decimal_changed != 0xFF) {
141             have_old = 1;
142             rc = old->decimal_changed;
143         }
144     }
145 
146     if (!have_old)
147         rc = Py_UNICODE_TODECIMAL(c);
148     if (rc < 0) {
149         if (defobj == NULL) {
150             PyErr_SetString(PyExc_ValueError,
151                             "not a decimal");
152             return NULL;
153         }
154         else {
155             Py_INCREF(defobj);
156             return defobj;
157         }
158     }
159     return PyInt_FromLong(rc);
160 }
161 
162 PyDoc_STRVAR(unicodedata_digit__doc__,
163 "digit(unichr[, default])\n\
164 \n\
165 Returns the digit value assigned to the Unicode character unichr as\n\
166 integer. If no such value is defined, default is returned, or, if\n\
167 not given, ValueError is raised.");
168 
169 static PyObject *
unicodedata_digit(PyObject * self,PyObject * args)170 unicodedata_digit(PyObject *self, PyObject *args)
171 {
172     PyUnicodeObject *v;
173     PyObject *defobj = NULL;
174     long rc;
175     Py_UCS4 c;
176 
177     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
178         return NULL;
179     c = getuchar(v);
180     if (c == (Py_UCS4)-1)
181         return NULL;
182     rc = Py_UNICODE_TODIGIT(c);
183     if (rc < 0) {
184         if (defobj == NULL) {
185             PyErr_SetString(PyExc_ValueError, "not a digit");
186             return NULL;
187         }
188         else {
189             Py_INCREF(defobj);
190             return defobj;
191         }
192     }
193     return PyInt_FromLong(rc);
194 }
195 
196 PyDoc_STRVAR(unicodedata_numeric__doc__,
197 "numeric(unichr[, default])\n\
198 \n\
199 Returns the numeric value assigned to the Unicode character unichr\n\
200 as float. If no such value is defined, default is returned, or, if\n\
201 not given, ValueError is raised.");
202 
203 static PyObject *
unicodedata_numeric(PyObject * self,PyObject * args)204 unicodedata_numeric(PyObject *self, PyObject *args)
205 {
206     PyUnicodeObject *v;
207     PyObject *defobj = NULL;
208     int have_old = 0;
209     double rc;
210     Py_UCS4 c;
211 
212     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
213         return NULL;
214     c = getuchar(v);
215     if (c == (Py_UCS4)-1)
216         return NULL;
217 
218     if (self) {
219         const change_record *old = get_old_record(self, c);
220         if (old->category_changed == 0) {
221             /* unassigned */
222             have_old = 1;
223             rc = -1.0;
224         }
225         else if (old->decimal_changed != 0xFF) {
226             have_old = 1;
227             rc = old->decimal_changed;
228         }
229     }
230 
231     if (!have_old)
232         rc = Py_UNICODE_TONUMERIC(c);
233     if (rc == -1.0) {
234         if (defobj == NULL) {
235             PyErr_SetString(PyExc_ValueError, "not a numeric character");
236             return NULL;
237         }
238         else {
239             Py_INCREF(defobj);
240             return defobj;
241         }
242     }
243     return PyFloat_FromDouble(rc);
244 }
245 
246 PyDoc_STRVAR(unicodedata_category__doc__,
247 "category(unichr)\n\
248 \n\
249 Returns the general category assigned to the Unicode character\n\
250 unichr as string.");
251 
252 static PyObject *
unicodedata_category(PyObject * self,PyObject * args)253 unicodedata_category(PyObject *self, PyObject *args)
254 {
255     PyUnicodeObject *v;
256     int index;
257     Py_UCS4 c;
258 
259     if (!PyArg_ParseTuple(args, "O!:category",
260                           &PyUnicode_Type, &v))
261         return NULL;
262     c = getuchar(v);
263     if (c == (Py_UCS4)-1)
264         return NULL;
265     index = (int) _getrecord_ex(c)->category;
266     if (self) {
267         const change_record *old = get_old_record(self, c);
268         if (old->category_changed != 0xFF)
269             index = old->category_changed;
270     }
271     return PyString_FromString(_PyUnicode_CategoryNames[index]);
272 }
273 
274 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
275 "bidirectional(unichr)\n\
276 \n\
277 Returns the bidirectional class assigned to the Unicode character\n\
278 unichr as string. If no such value is defined, an empty string is\n\
279 returned.");
280 
281 static PyObject *
unicodedata_bidirectional(PyObject * self,PyObject * args)282 unicodedata_bidirectional(PyObject *self, PyObject *args)
283 {
284     PyUnicodeObject *v;
285     int index;
286     Py_UCS4 c;
287 
288     if (!PyArg_ParseTuple(args, "O!:bidirectional",
289                           &PyUnicode_Type, &v))
290         return NULL;
291     c = getuchar(v);
292     if (c == (Py_UCS4)-1)
293         return NULL;
294     index = (int) _getrecord_ex(c)->bidirectional;
295     if (self) {
296         const change_record *old = get_old_record(self, c);
297         if (old->category_changed == 0)
298             index = 0; /* unassigned */
299         else if (old->bidir_changed != 0xFF)
300             index = old->bidir_changed;
301     }
302     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
303 }
304 
305 PyDoc_STRVAR(unicodedata_combining__doc__,
306 "combining(unichr)\n\
307 \n\
308 Returns the canonical combining class assigned to the Unicode\n\
309 character unichr as integer. Returns 0 if no combining class is\n\
310 defined.");
311 
312 static PyObject *
unicodedata_combining(PyObject * self,PyObject * args)313 unicodedata_combining(PyObject *self, PyObject *args)
314 {
315     PyUnicodeObject *v;
316     int index;
317     Py_UCS4 c;
318 
319     if (!PyArg_ParseTuple(args, "O!:combining",
320                           &PyUnicode_Type, &v))
321         return NULL;
322     c = getuchar(v);
323     if (c == (Py_UCS4)-1)
324         return NULL;
325     index = (int) _getrecord_ex(c)->combining;
326     if (self) {
327         const change_record *old = get_old_record(self, c);
328         if (old->category_changed == 0)
329             index = 0; /* unassigned */
330     }
331     return PyInt_FromLong(index);
332 }
333 
334 PyDoc_STRVAR(unicodedata_mirrored__doc__,
335 "mirrored(unichr)\n\
336 \n\
337 Returns the mirrored property assigned to the Unicode character\n\
338 unichr as integer. Returns 1 if the character has been identified as\n\
339 a \"mirrored\" character in bidirectional text, 0 otherwise.");
340 
341 static PyObject *
unicodedata_mirrored(PyObject * self,PyObject * args)342 unicodedata_mirrored(PyObject *self, PyObject *args)
343 {
344     PyUnicodeObject *v;
345     int index;
346     Py_UCS4 c;
347 
348     if (!PyArg_ParseTuple(args, "O!:mirrored",
349                           &PyUnicode_Type, &v))
350         return NULL;
351     c = getuchar(v);
352     if (c == (Py_UCS4)-1)
353         return NULL;
354     index = (int) _getrecord_ex(c)->mirrored;
355     if (self) {
356         const change_record *old = get_old_record(self, c);
357         if (old->category_changed == 0)
358             index = 0; /* unassigned */
359         else if (old->mirrored_changed != 0xFF)
360             index = old->mirrored_changed;
361     }
362     return PyInt_FromLong(index);
363 }
364 
365 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
366 "east_asian_width(unichr)\n\
367 \n\
368 Returns the east asian width assigned to the Unicode character\n\
369 unichr as string.");
370 
371 static PyObject *
unicodedata_east_asian_width(PyObject * self,PyObject * args)372 unicodedata_east_asian_width(PyObject *self, PyObject *args)
373 {
374     PyUnicodeObject *v;
375     int index;
376     Py_UCS4 c;
377 
378     if (!PyArg_ParseTuple(args, "O!:east_asian_width",
379                           &PyUnicode_Type, &v))
380         return NULL;
381     c = getuchar(v);
382     if (c == (Py_UCS4)-1)
383         return NULL;
384     index = (int) _getrecord_ex(c)->east_asian_width;
385     if (self) {
386         const change_record *old = get_old_record(self, c);
387         if (old->category_changed == 0)
388             index = 0; /* unassigned */
389     }
390     return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
391 }
392 
393 PyDoc_STRVAR(unicodedata_decomposition__doc__,
394 "decomposition(unichr)\n\
395 \n\
396 Returns the character decomposition mapping assigned to the Unicode\n\
397 character unichr as string. An empty string is returned in case no\n\
398 such mapping is defined.");
399 
400 static PyObject *
unicodedata_decomposition(PyObject * self,PyObject * args)401 unicodedata_decomposition(PyObject *self, PyObject *args)
402 {
403     PyUnicodeObject *v;
404     char decomp[256];
405     int code, index, count, i;
406     unsigned int prefix_index;
407     Py_UCS4 c;
408 
409     if (!PyArg_ParseTuple(args, "O!:decomposition",
410                           &PyUnicode_Type, &v))
411         return NULL;
412     c = getuchar(v);
413     if (c == (Py_UCS4)-1)
414         return NULL;
415 
416     code = (int)c;
417 
418     if (self) {
419         const change_record *old = get_old_record(self, c);
420         if (old->category_changed == 0)
421             return PyString_FromString(""); /* unassigned */
422     }
423 
424     if (code < 0 || code >= 0x110000)
425         index = 0;
426     else {
427         index = decomp_index1[(code>>DECOMP_SHIFT)];
428         index = decomp_index2[(index<<DECOMP_SHIFT)+
429                              (code&((1<<DECOMP_SHIFT)-1))];
430     }
431 
432     /* high byte is number of hex bytes (usually one or two), low byte
433        is prefix code (from*/
434     count = decomp_data[index] >> 8;
435 
436     /* XXX: could allocate the PyString up front instead
437        (strlen(prefix) + 5 * count + 1 bytes) */
438 
439     /* Based on how index is calculated above and decomp_data is generated
440        from Tools/unicode/makeunicodedata.py, it should not be possible
441        to overflow decomp_prefix. */
442     prefix_index = decomp_data[index] & 255;
443     assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
444 
445     /* copy prefix */
446     i = strlen(decomp_prefix[prefix_index]);
447     memcpy(decomp, decomp_prefix[prefix_index], i);
448 
449     while (count-- > 0) {
450         if (i)
451             decomp[i++] = ' ';
452         assert((size_t)i < sizeof(decomp));
453         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454                       decomp_data[++index]);
455         i += strlen(decomp + i);
456     }
457 
458     decomp[i] = '\0';
459 
460     return PyString_FromString(decomp);
461 }
462 
463 static void
get_decomp_record(PyObject * self,Py_UCS4 code,int * index,int * prefix,int * count)464 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
465 {
466     if (code >= 0x110000) {
467         *index = 0;
468     } else if (self && get_old_record(self, code)->category_changed==0) {
469         /* unassigned in old version */
470         *index = 0;
471     }
472     else {
473         *index = decomp_index1[(code>>DECOMP_SHIFT)];
474         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475                                (code&((1<<DECOMP_SHIFT)-1))];
476     }
477 
478     /* high byte is number of hex bytes (usually one or two), low byte
479        is prefix code (from*/
480     *count = decomp_data[*index] >> 8;
481     *prefix = decomp_data[*index] & 255;
482 
483     (*index)++;
484 }
485 
/* Constants for the arithmetic Hangul syllable handling used by
   nfd_nfkd, nfc_nfkc and _getucname below: a syllable code point is
   SBase plus an index that splits into leading (L), vowel (V) and
   trailing (T) parts. */
#define SBase   0xAC00  /* first syllable code point */
#define LBase   0x1100  /* base of the leading part */
#define VBase   0x1161  /* base of the vowel part */
#define TBase   0x11A7  /* base of the trailing part; a result equal to
                           TBase means "no trailing part" in nfd_nfkd */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)  /* syllables per leading part */
#define SCount  (LCount*NCount)  /* total number of syllables */
495 
496 static PyObject*
nfd_nfkd(PyObject * self,PyObject * input,int k)497 nfd_nfkd(PyObject *self, PyObject *input, int k)
498 {
499     PyObject *result;
500     Py_UNICODE *i, *end, *o;
501     /* Longest decomposition in Unicode 3.2: U+FDFA */
502     Py_UNICODE stack[20];
503     Py_ssize_t space, isize;
504     int index, prefix, count, stackptr;
505     unsigned char prev, cur;
506 
507     stackptr = 0;
508     isize = PyUnicode_GET_SIZE(input);
509     space = isize;
510     /* Overallocate at most 10 characters. */
511     if (space > 10) {
512         if (space <= PY_SSIZE_T_MAX - 10)
513             space += 10;
514     }
515     else {
516         space *= 2;
517     }
518     result = PyUnicode_FromUnicode(NULL, space);
519     if (!result)
520         return NULL;
521     i = PyUnicode_AS_UNICODE(input);
522     end = i + isize;
523     o = PyUnicode_AS_UNICODE(result);
524 
525     while (i < end) {
526         stack[stackptr++] = *i++;
527         while(stackptr) {
528             Py_UNICODE code = stack[--stackptr];
529             /* Hangul Decomposition adds three characters in
530                a single step, so we need at least that much room. */
531             if (space < 3) {
532                 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
533                 space += 10;
534                 if (PyUnicode_Resize(&result, newsize) == -1)
535                     return NULL;
536                 o = PyUnicode_AS_UNICODE(result) + newsize - space;
537             }
538             /* Hangul Decomposition. */
539             if (SBase <= code && code < (SBase+SCount)) {
540                 int SIndex = code - SBase;
541                 int L = LBase + SIndex / NCount;
542                 int V = VBase + (SIndex % NCount) / TCount;
543                 int T = TBase + SIndex % TCount;
544                 *o++ = L;
545                 *o++ = V;
546                 space -= 2;
547                 if (T != TBase) {
548                     *o++ = T;
549                     space --;
550                 }
551                 continue;
552             }
553             /* normalization changes */
554             if (self) {
555                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
556                 if (value != 0) {
557                     stack[stackptr++] = value;
558                     continue;
559                 }
560             }
561 
562             /* Other decompositions. */
563             get_decomp_record(self, code, &index, &prefix, &count);
564 
565             /* Copy character if it is not decomposable, or has a
566                compatibility decomposition, but we do NFD. */
567             if (!count || (prefix && !k)) {
568                 *o++ = code;
569                 space--;
570                 continue;
571             }
572             /* Copy decomposition onto the stack, in reverse
573                order.  */
574             while(count) {
575                 code = decomp_data[index + (--count)];
576                 stack[stackptr++] = code;
577             }
578         }
579     }
580 
581     /* Drop overallocation. Cannot fail. */
582     PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
583 
584     /* Sort canonically. */
585     i = PyUnicode_AS_UNICODE(result);
586     prev = _getrecord_ex(*i)->combining;
587     end = i + PyUnicode_GET_SIZE(result);
588     for (i++; i < end; i++) {
589         cur = _getrecord_ex(*i)->combining;
590         if (prev == 0 || cur == 0 || prev <= cur) {
591             prev = cur;
592             continue;
593         }
594         /* Non-canonical order. Need to switch *i with previous. */
595         o = i - 1;
596         while (1) {
597             Py_UNICODE tmp = o[1];
598             o[1] = o[0];
599             o[0] = tmp;
600             o--;
601             if (o < PyUnicode_AS_UNICODE(result))
602                 break;
603             prev = _getrecord_ex(*o)->combining;
604             if (prev == 0 || prev <= cur)
605                 break;
606         }
607         prev = _getrecord_ex(*i)->combining;
608     }
609     return result;
610 }
611 
612 static int
find_nfc_index(PyObject * self,struct reindex * nfc,Py_UNICODE code)613 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
614 {
615     int index;
616     for (index = 0; nfc[index].start; index++) {
617         int start = nfc[index].start;
618         if (code < start)
619             return -1;
620         if (code <= start + nfc[index].count) {
621             int delta = code - start;
622             return nfc[index].index + delta;
623         }
624     }
625     return -1;
626 }
627 
/* Compose a unicode string.

   The input is first decomposed with nfd_nfkd(self, input, k); the
   decomposed copy is then composed in place and shrunk to the composed
   length.  Returns a new reference, or NULL when nfd_nfkd fails.

   Pointers used below: i reads the decomposed text, o writes the
   composed text into the same buffer (never ahead of i), and skipped[]
   remembers positions ahead of i whose character has already been merged
   into an earlier one and must be dropped when i reaches them. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    /* NOTE(review): capacity is only guarded by the assert further down,
       which is compiled out when NDEBUG is defined -- confirm 20 is a
       hard upper bound. */
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
      for (index = 0; index < cskipped; index++) {
          if (skipped[index] == i) {
              /* *i character is skipped.
                 Remove from list. */
              skipped[index] = skipped[cskipped-1];
              cskipped--;
              i++;
              goto again; /* continue while */
          }
      }
      /* Hangul Composition. We don't need to check for <LV,T>
         pairs, since we always have decomposed data. */
      if (LBase <= *i && *i < (LBase+LCount) &&
          i + 1 < end &&
          VBase <= i[1] && i[1] < (VBase+VCount)) {
          /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
             and V character is a modern vowel (0x1161 ~ 0x1175). */
          int LIndex, VIndex;
          LIndex = i[0] - LBase;
          VIndex = i[1] - VBase;
          code = SBase + (LIndex*VCount+VIndex)*TCount;
          i+=2;
          if (i < end &&
              TBase < *i && *i < (TBase+TCount)) {
              /* check T character is a modern trailing consonant
                 (0x11A8 ~ 0x11C2). */
              code += *i-TBase;
              i++;
          }
          *o++ = code;
          continue;
      }

      /* f is the row of *i in the composition table, or -1 when *i
         cannot be the first character of a composition. */
      f = find_nfc_index(self, nfc_first, *i);
      if (f == -1) {
          *o++ = *i++;
          continue;
      }
      /* Find next unblocked character. */
      i1 = i+1;
      /* comb is the combining class of the last character that was
         passed over without being combined (0 if none yet). */
      comb = 0;
      while (i1 < end) {
          int comb1 = _getrecord_ex(*i1)->combining;
          if (comb) {
              if (comb1 == 0)
                  break;
              if (comb >= comb1) {
                  /* Character is blocked. */
                  i1++;
                  continue;
              }
          }
          l = find_nfc_index(self, nfc_last, *i1);
          /* *i1 cannot be combined with *i. If *i1
             is a starter, we don't need to look further.
             Otherwise, record the combining class. */
          if (l == -1) {
            not_combinable:
              if (comb1 == 0)
                  break;
              comb = comb1;
              i1++;
              continue;
          }
          /* Look up the composition of row f and column l through the
             two-level comp_index/comp_data tables; 0 means the pair
             does not compose. */
          index = f*TOTAL_LAST + l;
          index1 = comp_index[index >> COMP_SHIFT];
          code = comp_data[(index1<<COMP_SHIFT)+
                           (index&((1<<COMP_SHIFT)-1))];
          if (code == 0)
              goto not_combinable;

          /* Replace the original character. */
          *i = code;
          /* Mark the second character unused. */
          assert(cskipped < 20);
          skipped[cskipped++] = i1;
          i1++;
          /* The composed character may itself start a further
             composition; stop looking if it cannot. */
          f = find_nfc_index(self, nfc_first, *i);
          if (f == -1)
              break;
      }
      *o++ = *i++;
    }
    /* Shrink the result when composition removed characters.  The
       return value of the resize is not checked. */
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}
740 
741 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
742 static int
is_normalized(PyObject * self,PyObject * input,int nfc,int k)743 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
744 {
745     Py_UNICODE *i, *end;
746     unsigned char prev_combining = 0, quickcheck_mask;
747 
748     /* An older version of the database is requested, quickchecks must be
749        disabled. */
750     if (self != NULL)
751         return 0;
752 
753     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
754        as described in http://unicode.org/reports/tr15/#Annex8. */
755     quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
756 
757     i = PyUnicode_AS_UNICODE(input);
758     end = i + PyUnicode_GET_SIZE(input);
759     while (i < end) {
760         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
761         unsigned char combining = record->combining;
762         unsigned char quickcheck = record->normalization_quick_check;
763 
764         if (quickcheck & quickcheck_mask)
765             return 0; /* this string might need normalization */
766         if (combining && prev_combining > combining)
767             return 0; /* non-canonical sort order, not normalized */
768         prev_combining = combining;
769     }
770     return 1; /* certainly normalized */
771 }
772 
773 PyDoc_STRVAR(unicodedata_normalize__doc__,
774 "normalize(form, unistr)\n\
775 \n\
776 Return the normal form 'form' for the Unicode string unistr.  Valid\n\
777 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
778 
779 static PyObject*
unicodedata_normalize(PyObject * self,PyObject * args)780 unicodedata_normalize(PyObject *self, PyObject *args)
781 {
782     char *form;
783     PyObject *input;
784 
785     if(!PyArg_ParseTuple(args, "sO!:normalize",
786                          &form, &PyUnicode_Type, &input))
787         return NULL;
788 
789     if (PyUnicode_GetSize(input) == 0) {
790         /* Special case empty input strings, since resizing
791            them  later would cause internal errors. */
792         Py_INCREF(input);
793         return input;
794     }
795 
796     if (strcmp(form, "NFC") == 0) {
797         if (is_normalized(self, input, 1, 0)) {
798             Py_INCREF(input);
799             return input;
800         }
801         return nfc_nfkc(self, input, 0);
802     }
803     if (strcmp(form, "NFKC") == 0) {
804         if (is_normalized(self, input, 1, 1)) {
805             Py_INCREF(input);
806             return input;
807         }
808         return nfc_nfkc(self, input, 1);
809     }
810     if (strcmp(form, "NFD") == 0) {
811         if (is_normalized(self, input, 0, 0)) {
812             Py_INCREF(input);
813             return input;
814         }
815         return nfd_nfkd(self, input, 0);
816     }
817     if (strcmp(form, "NFKD") == 0) {
818         if (is_normalized(self, input, 0, 1)) {
819             Py_INCREF(input);
820             return input;
821         }
822         return nfd_nfkd(self, input, 1);
823     }
824     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
825     return NULL;
826 }
827 
828 /* -------------------------------------------------------------------- */
829 /* unicode character name tables */
830 
831 /* data file generated by Tools/unicode/makeunicodedata.py */
832 #include "unicodename_db.h"
833 
834 /* -------------------------------------------------------------------- */
835 /* database code (cut and pasted from the unidb package) */
836 
/* Hash the first len bytes of s for the character-name lookup.

   Each byte is passed through Py_TOUPPER before being mixed in with the
   multiplier scale.  Whenever bits 24-31 of the running hash become
   nonzero they are folded back into the low byte and the hash is cut
   down to its low 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
851 
/* Name fragments for Hangul syllables.  _getucname() concatenates
   column 0 of row L, column 1 of row V and column 2 of row T, where
   L, V and T are the leading, vowel and trailing indices of the
   syllable.  The columns have different lengths; rows past the end of
   a column hold 0 there. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
882 
883 static int
is_unified_ideograph(Py_UCS4 code)884 is_unified_ideograph(Py_UCS4 code)
885 {
886     return (
887         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
888         (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
889         (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
890         (0x2A700 <= code && code <= 0x2B734));  /* CJK Ideograph Extension C */
891 }
892 
893 static int
_getucname(PyObject * self,Py_UCS4 code,char * buffer,int buflen)894 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
895 {
896     int offset;
897     int i;
898     int word;
899     unsigned char* w;
900 
901     if (code >= 0x110000)
902         return 0;
903 
904     if (self) {
905         const change_record *old = get_old_record(self, code);
906         if (old->category_changed == 0) {
907             /* unassigned */
908             return 0;
909         }
910     }
911 
912     if (SBase <= code && code < SBase+SCount) {
913         /* Hangul syllable. */
914         int SIndex = code - SBase;
915         int L = SIndex / NCount;
916         int V = (SIndex % NCount) / TCount;
917         int T = SIndex % TCount;
918 
919         if (buflen < 27)
920             /* Worst case: HANGUL SYLLABLE <10chars>. */
921             return 0;
922         strcpy(buffer, "HANGUL SYLLABLE ");
923         buffer += 16;
924         strcpy(buffer, hangul_syllables[L][0]);
925         buffer += strlen(hangul_syllables[L][0]);
926         strcpy(buffer, hangul_syllables[V][1]);
927         buffer += strlen(hangul_syllables[V][1]);
928         strcpy(buffer, hangul_syllables[T][2]);
929         buffer += strlen(hangul_syllables[T][2]);
930         *buffer = '\0';
931         return 1;
932     }
933 
934     if (is_unified_ideograph(code)) {
935         if (buflen < 28)
936             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
937             return 0;
938         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
939         return 1;
940     }
941 
942     /* get offset into phrasebook */
943     offset = phrasebook_offset1[(code>>phrasebook_shift)];
944     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
945                                (code&((1<<phrasebook_shift)-1))];
946     if (!offset)
947         return 0;
948 
949     i = 0;
950 
951     for (;;) {
952         /* get word index */
953         word = phrasebook[offset] - phrasebook_short;
954         if (word >= 0) {
955             word = (word << 8) + phrasebook[offset+1];
956             offset += 2;
957         } else
958             word = phrasebook[offset++];
959         if (i) {
960             if (i > buflen)
961                 return 0; /* buffer overflow */
962             buffer[i++] = ' ';
963         }
964         /* copy word string from lexicon.  the last character in the
965            word has bit 7 set.  the last word in a string ends with
966            0x80 */
967         w = lexicon + lexicon_offset[word];
968         while (*w < 128) {
969             if (i >= buflen)
970                 return 0; /* buffer overflow */
971             buffer[i++] = *w++;
972         }
973         if (i >= buflen)
974             return 0; /* buffer overflow */
975         buffer[i++] = *w & 127;
976         if (*w == 128)
977             break; /* end of word */
978     }
979 
980     return 1;
981 }
982 
983 static int
_cmpname(PyObject * self,int code,const char * name,int namelen)984 _cmpname(PyObject *self, int code, const char* name, int namelen)
985 {
986     /* check if code corresponds to the given name */
987     int i;
988     char buffer[NAME_MAXLEN];
989     if (!_getucname(self, code, buffer, sizeof(buffer)))
990         return 0;
991     for (i = 0; i < namelen; i++) {
992         if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
993             return 0;
994     }
995     return buffer[namelen] == '\0';
996 }
997 
998 static void
find_syllable(const char * str,int * len,int * pos,int count,int column)999 find_syllable(const char *str, int *len, int *pos, int count, int column)
1000 {
1001     int i, len1;
1002     *len = -1;
1003     for (i = 0; i < count; i++) {
1004         char *s = hangul_syllables[i][column];
1005         len1 = strlen(s);
1006         if (len1 <= *len)
1007             continue;
1008         if (strncmp(str, s, len1) == 0) {
1009             *len = len1;
1010             *pos = i;
1011         }
1012     }
1013     if (*len == -1) {
1014         *len = 0;
1015     }
1016 }
1017 
1018 static int
_getcode(PyObject * self,const char * name,int namelen,Py_UCS4 * code)1019 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
1020 {
1021     unsigned int h, v;
1022     unsigned int mask = code_size-1;
1023     unsigned int i, incr;
1024 
1025     /* Check for hangul syllables. */
1026     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1027         int len, L = -1, V = -1, T = -1;
1028         const char *pos = name + 16;
1029         find_syllable(pos, &len, &L, LCount, 0);
1030         pos += len;
1031         find_syllable(pos, &len, &V, VCount, 1);
1032         pos += len;
1033         find_syllable(pos, &len, &T, TCount, 2);
1034         pos += len;
1035         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1036             *code = SBase + (L*VCount+V)*TCount + T;
1037             return 1;
1038         }
1039         /* Otherwise, it's an illegal syllable name. */
1040         return 0;
1041     }
1042 
1043     /* Check for unified ideographs. */
1044     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1045         /* Four or five hexdigits must follow. */
1046         v = 0;
1047         name += 22;
1048         namelen -= 22;
1049         if (namelen != 4 && namelen != 5)
1050             return 0;
1051         while (namelen--) {
1052             v *= 16;
1053             if (*name >= '0' && *name <= '9')
1054                 v += *name - '0';
1055             else if (*name >= 'A' && *name <= 'F')
1056                 v += *name - 'A' + 10;
1057             else
1058                 return 0;
1059             name++;
1060         }
1061         if (!is_unified_ideograph(v))
1062             return 0;
1063         *code = v;
1064         return 1;
1065     }
1066 
1067     /* the following is the same as python's dictionary lookup, with
1068        only minor changes.  see the makeunicodedata script for more
1069        details */
1070 
1071     h = (unsigned int) _gethash(name, namelen, code_magic);
1072     i = (~h) & mask;
1073     v = code_hash[i];
1074     if (!v)
1075         return 0;
1076     if (_cmpname(self, v, name, namelen)) {
1077         *code = v;
1078         return 1;
1079     }
1080     incr = (h ^ (h >> 3)) & mask;
1081     if (!incr)
1082         incr = mask;
1083     for (;;) {
1084         i = (i + incr) & mask;
1085         v = code_hash[i];
1086         if (!v)
1087             return 0;
1088         if (_cmpname(self, v, name, namelen)) {
1089             *code = v;
1090             return 1;
1091         }
1092         incr = incr << 1;
1093         if (incr > mask)
1094             incr = incr ^ code_poly;
1095     }
1096 }
1097 
1098 static const _PyUnicode_Name_CAPI hashAPI =
1099 {
1100     sizeof(_PyUnicode_Name_CAPI),
1101     _getucname,
1102     _getcode
1103 };
1104 
1105 /* -------------------------------------------------------------------- */
1106 /* Python bindings */
1107 
1108 PyDoc_STRVAR(unicodedata_name__doc__,
1109 "name(unichr[, default])\n\
1110 Returns the name assigned to the Unicode character unichr as a\n\
1111 string. If no name is defined, default is returned, or, if not\n\
1112 given, ValueError is raised.");
1113 
1114 static PyObject *
unicodedata_name(PyObject * self,PyObject * args)1115 unicodedata_name(PyObject* self, PyObject* args)
1116 {
1117     char name[NAME_MAXLEN];
1118     Py_UCS4 c;
1119 
1120     PyUnicodeObject* v;
1121     PyObject* defobj = NULL;
1122     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1123         return NULL;
1124 
1125     c = getuchar(v);
1126     if (c == (Py_UCS4)-1)
1127         return NULL;
1128 
1129     if (!_getucname(self, c, name, sizeof(name))) {
1130         if (defobj == NULL) {
1131             PyErr_SetString(PyExc_ValueError, "no such name");
1132             return NULL;
1133         }
1134         else {
1135             Py_INCREF(defobj);
1136             return defobj;
1137         }
1138     }
1139 
1140     return Py_BuildValue("s", name);
1141 }
1142 
1143 PyDoc_STRVAR(unicodedata_lookup__doc__,
1144 "lookup(name)\n\
1145 \n\
1146 Look up character by name.  If a character with the\n\
1147 given name is found, return the corresponding Unicode\n\
1148 character.  If not found, KeyError is raised.");
1149 
1150 static PyObject *
unicodedata_lookup(PyObject * self,PyObject * args)1151 unicodedata_lookup(PyObject* self, PyObject* args)
1152 {
1153     Py_UCS4 code;
1154     Py_UNICODE str[2];
1155 
1156     char* name;
1157     int namelen;
1158     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1159         return NULL;
1160 
1161     if (!_getcode(self, name, namelen, &code)) {
1162         PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1163                      name);
1164         return NULL;
1165     }
1166 
1167 #ifndef Py_UNICODE_WIDE
1168     if (code >= 0x10000) {
1169         str[0] = 0xd800 + ((code - 0x10000) >> 10);
1170         str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1171         return PyUnicode_FromUnicode(str, 2);
1172     }
1173 #endif
1174     str[0] = (Py_UNICODE) code;
1175     return PyUnicode_FromUnicode(str, 1);
1176 }
1177 
1178 /* XXX Add doc strings. */
1179 
1180 static PyMethodDef unicodedata_functions[] = {
1181     {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1182     {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1183     {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1184     {"category", unicodedata_category, METH_VARARGS,
1185                  unicodedata_category__doc__},
1186     {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1187                       unicodedata_bidirectional__doc__},
1188     {"combining", unicodedata_combining, METH_VARARGS,
1189                   unicodedata_combining__doc__},
1190     {"mirrored", unicodedata_mirrored, METH_VARARGS,
1191                  unicodedata_mirrored__doc__},
1192     {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1193                          unicodedata_east_asian_width__doc__},
1194     {"decomposition", unicodedata_decomposition, METH_VARARGS,
1195                       unicodedata_decomposition__doc__},
1196     {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1197     {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1198     {"normalize", unicodedata_normalize, METH_VARARGS,
1199                   unicodedata_normalize__doc__},
1200     {NULL, NULL}                /* sentinel */
1201 };
1202 
1203 static PyTypeObject UCD_Type = {
1204         /* The ob_type field must be initialized in the module init function
1205          * to be portable to Windows without using C++. */
1206         PyVarObject_HEAD_INIT(NULL, 0)
1207         "unicodedata.UCD",              /*tp_name*/
1208         sizeof(PreviousDBVersion),      /*tp_basicsize*/
1209         0,                      /*tp_itemsize*/
1210         /* methods */
1211         (destructor)PyObject_Del, /*tp_dealloc*/
1212         0,                      /*tp_print*/
1213         0,                      /*tp_getattr*/
1214         0,                      /*tp_setattr*/
1215         0,                      /*tp_compare*/
1216         0,                      /*tp_repr*/
1217         0,                      /*tp_as_number*/
1218         0,                      /*tp_as_sequence*/
1219         0,                      /*tp_as_mapping*/
1220         0,                      /*tp_hash*/
1221         0,                      /*tp_call*/
1222         0,                      /*tp_str*/
1223         PyObject_GenericGetAttr,/*tp_getattro*/
1224         0,                      /*tp_setattro*/
1225         0,                      /*tp_as_buffer*/
1226         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
1227         0,                      /*tp_doc*/
1228         0,                      /*tp_traverse*/
1229         0,                      /*tp_clear*/
1230         0,                      /*tp_richcompare*/
1231         0,                      /*tp_weaklistoffset*/
1232         0,                      /*tp_iter*/
1233         0,                      /*tp_iternext*/
1234         unicodedata_functions,  /*tp_methods*/
1235         DB_members,             /*tp_members*/
1236         0,                      /*tp_getset*/
1237         0,                      /*tp_base*/
1238         0,                      /*tp_dict*/
1239         0,                      /*tp_descr_get*/
1240         0,                      /*tp_descr_set*/
1241         0,                      /*tp_dictoffset*/
1242         0,                      /*tp_init*/
1243         0,                      /*tp_alloc*/
1244         0,                      /*tp_new*/
1245         0,                      /*tp_free*/
1246         0,                      /*tp_is_gc*/
1247 };
1248 
1249 PyDoc_STRVAR(unicodedata_docstring,
1250 "This module provides access to the Unicode Character Database which\n\
1251 defines character properties for all Unicode characters. The data in\n\
1252 this database is based on the UnicodeData.txt file version\n\
1253 5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
1254 \n\
1255 The module uses the same names and symbols as defined by the\n\
1256 UnicodeData File Format 5.2.0 (see\n\
1257 http://www.unicode.org/reports/tr44/tr44-4.html).");
1258 
1259 PyMODINIT_FUNC
initunicodedata(void)1260 initunicodedata(void)
1261 {
1262     PyObject *m, *v;
1263 
1264     Py_TYPE(&UCD_Type) = &PyType_Type;
1265 
1266     m = Py_InitModule3(
1267         "unicodedata", unicodedata_functions, unicodedata_docstring);
1268     if (!m)
1269         return;
1270 
1271     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1272     Py_INCREF(&UCD_Type);
1273     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1274 
1275     /* Previous versions */
1276     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1277     if (v != NULL)
1278         PyModule_AddObject(m, "ucd_3_2_0", v);
1279 
1280     /* Export C API */
1281     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1282     if (v != NULL)
1283         PyModule_AddObject(m, "ucnhash_CAPI", v);
1284 }
1285 
1286 /*
1287 Local variables:
1288 c-basic-offset: 4
1289 indent-tabs-mode: nil
1290 End:
1291 */
1292