/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.2 data base.

   Data was extracted from the Unicode 5.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;

typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

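/* The property records are stored as a two-level trie: index1 maps the
   high bits of a code point to a block, and index2 maps that block plus
   the low SHIFT bits to a record number. Code points outside the Unicode
   range fall back to record 0 (the "unassigned" record). */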
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}

/* ------------- Previous-version API ------------------------------------- */
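/* Most functions below take "self": it is NULL when a function is called
   at module level (current Unicode database) and a PreviousDBVersion
   object when called as a method of e.g. ucd_3_2_0.  In the latter case
   the change_record deltas are applied on top of the current data to
   reconstruct the older property values. */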
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;

static PyObject*
new_previous_version(const char *name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}


static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}

/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}

PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}

PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional class assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}

PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->mirrored_changed != 0xFF)
            index = old->mirrored_changed;
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}

static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}

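/* Hangul syllable constants from the Unicode standard ("Conjoining Jamo
   Behavior"): syllables U+AC00..U+D7A3 are not stored in the tables but
   are composed algorithmically from a leading consonant (L), a vowel (V)
   and an optional trailing consonant (T). */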
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)

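/* Decompose a string to NFD (k=0) or NFKD (k=1): each input character is
   pushed on a small stack and repeatedly replaced by its canonical (and,
   for NFKD, compatibility) decomposition until only undecomposable
   characters remain; afterwards combining marks are put into canonical
   order. */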
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}

static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}

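/* Compose a string to NFC (k=0) or NFKC (k=1): first decompose via
   nfd_nfkd(), then, for every character that can start a canonical
   composition (nfc_first), scan the following unblocked characters and
   replace eligible pairs by their primary composite from comp_data. */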
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] < (VBase+VCount)) {
            /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
               and V character is a modern vowel (0x1161 ~ 0x1175). */
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < end &&
                TBase < *i && *i < (TBase+TCount)) {
                /* check T character is a modern trailing consonant
                   (0x11A8 ~ 0x11C2). */
                code += *i-TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}

/* Return 1 if the input is certainly normalized, 0 if it might not be. */
static int
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
    Py_UNICODE *i, *end;
    unsigned char prev_combining = 0, quickcheck_mask;

    /* An older version of the database is requested, quickchecks must be
       disabled. */
    if (self != NULL)
        return 0;

    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
       as described in http://unicode.org/reports/tr15/#Annex8. */
    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));

    i = PyUnicode_AS_UNICODE(input);
    end = i + PyUnicode_GET_SIZE(input);
    while (i < end) {
        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
        unsigned char combining = record->combining;
        unsigned char quickcheck = record->normalization_quick_check;

        if (quickcheck & quickcheck_mask)
            return 0; /* this string might need normalization */
        if (combining && prev_combining > combining)
            return 0; /* non-canonical sort order, not normalized */
        prev_combining = combining;
    }
    return 1; /* certainly normalized */
}

PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if (!PyArg_ParseTuple(args, "sO!:normalize",
                          &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0) {
        if (is_normalized(self, input, 1, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 0);
    }
    if (strcmp(form, "NFKC") == 0) {
        if (is_normalized(self, input, 1, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 1);
    }
    if (strcmp(form, "NFD") == 0) {
        if (is_normalized(self, input, 0, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 0);
    }
    if (strcmp(form, "NFKD") == 0) {
        if (is_normalized(self, input, 0, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 1);
    }
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

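/* Hash function for the name -> code point lookup below; it must stay in
   sync with the hash used by Tools/unicode/makeunicodedata.py when it
   generates the code_hash table in unicodename_db.h. */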
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

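/* Jamo short names used to build and parse Hangul syllable names:
   column 0 holds the leading consonants, column 1 the vowels and
   column 2 the (optional) trailing consonants. */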
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};

static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) ||   /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FCB) ||   /* CJK Ideograph, Unicode 5.2 */
        (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
        (0x2A700 <= code && code <= 0x2B734));  /* CJK Ideograph Extension C */
}

static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

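/* Find the longest Jamo short name from the given hangul_syllables column
   that is a prefix of str; store its length in *len (0 if nothing matches)
   and, when found, its index in *pos. */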
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

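/* The table above is exported through the ucnhash_CAPI capsule in
   initunicodedata() so that the interpreter's \N{...} escape handling
   can resolve character names through this C-level interface. */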
/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}

PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name. If a character with the\n\
given name is found, return the corresponding Unicode\n\
character. If not found, KeyError is raised.");

static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    if (code >= 0x10000) {
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}

/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
     unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
     unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
     unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
     unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
     unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
     unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
     unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};

static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};

PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.2.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.2.0 (see\n\
http://www.unicode.org/reports/tr44/tr44-4.html).");
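
/* Illustrative Python-level usage of the module initialized below; this
   is only a sketch for orientation, not part of the C implementation:

       import unicodedata
       unicodedata.name(u'/')                 # 'SOLIDUS'
       unicodedata.decimal(u'9')              # 9
       unicodedata.normalize('NFC', u'text')  # normalized unicode string
       unicodedata.ucd_3_2_0.category(u'A')   # properties as of Unicode 3.2
*/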

PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}

/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/