1 /*
2  * wchar_t helpers
3  */
4 
/* Fixed-width unsigned integer types used by the helpers in this file
   to hold 16-bit and 32-bit character values. */
typedef uint16_t  cffi_char16_t;
typedef uint32_t  cffi_char32_t;
7 
8 
9 #if Py_UNICODE_SIZE == 2
10 
11 /* Before Python 2.7, PyUnicode_FromWideChar is not able to convert
12    wchar_t values greater than 65535 into two-unicode-characters surrogates.
13    But even the Python 2.7 version doesn't detect wchar_t values that are
14    out of range(1114112), and just returns nonsense.
15 
16    From cffi 1.11 we can't use it anyway, because we need a version
17    with char32_t input types.
18 */
19 static PyObject *
_my_PyUnicode_FromChar32(const cffi_char32_t * w,Py_ssize_t size)20 _my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
21 {
22     PyObject *unicode;
23     register Py_ssize_t i;
24     Py_ssize_t alloc;
25     const cffi_char32_t *orig_w;
26 
27     alloc = size;
28     orig_w = w;
29     for (i = size; i > 0; i--) {
30         if (*w > 0xFFFF)
31             alloc++;
32         w++;
33     }
34     w = orig_w;
35     unicode = PyUnicode_FromUnicode(NULL, alloc);
36     if (!unicode)
37         return NULL;
38 
39     /* Copy the wchar_t data into the new object */
40     {
41         register Py_UNICODE *u;
42         u = PyUnicode_AS_UNICODE(unicode);
43         for (i = size; i > 0; i--) {
44             if (*w > 0xFFFF) {
45                 cffi_char32_t ordinal;
46                 if (*w > 0x10FFFF) {
47                     PyErr_Format(PyExc_ValueError,
48                                  "char32_t out of range for "
49                                  "conversion to unicode: 0x%x", (int)*w);
50                     Py_DECREF(unicode);
51                     return NULL;
52                 }
53                 ordinal = *w++;
54                 ordinal -= 0x10000;
55                 *u++ = 0xD800 | (ordinal >> 10);
56                 *u++ = 0xDC00 | (ordinal & 0x3FF);
57             }
58             else
59                 *u++ = *w++;
60         }
61     }
62     return unicode;
63 }
64 
65 static PyObject *
_my_PyUnicode_FromChar16(const cffi_char16_t * w,Py_ssize_t size)66 _my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
67 {
68     return PyUnicode_FromUnicode((const Py_UNICODE *)w, size);
69 }
70 
71 #else   /* Py_UNICODE_SIZE == 4 */
72 
73 static PyObject *
_my_PyUnicode_FromChar32(const cffi_char32_t * w,Py_ssize_t size)74 _my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
75 {
76     return PyUnicode_FromUnicode((const Py_UNICODE *)w, size);
77 }
78 
79 static PyObject *
_my_PyUnicode_FromChar16(const cffi_char16_t * w,Py_ssize_t size)80 _my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
81 {
82     /* 'size' is the length of the 'w' array */
83     PyObject *result = PyUnicode_FromUnicode(NULL, size);
84 
85     if (result != NULL) {
86         Py_UNICODE *u_base = PyUnicode_AS_UNICODE(result);
87         Py_UNICODE *u = u_base;
88 
89         if (size == 1) {      /* performance only */
90             *u = (cffi_char32_t)*w;
91         }
92         else {
93             while (size > 0) {
94                 cffi_char32_t ch = *w++;
95                 size--;
96                 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
97                     cffi_char32_t ch2 = *w;
98                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
99                         ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
100                         w++;
101                         size--;
102                     }
103                 }
104                 *u++ = ch;
105             }
106             if (PyUnicode_Resize(&result, u - u_base) < 0) {
107                 Py_DECREF(result);
108                 return NULL;
109             }
110         }
111     }
112     return result;
113 }
114 
115 #endif
116 
117 
/* IS_SURROGATE(u): true if u[0] is in 0xD800-0xDBFF and u[1] is in
   0xDC00-0xDFFF, i.e. the two units at 'u' form a surrogate pair.
   AS_SURROGATE(u): the value above 0xFFFF encoded by the pair at 'u'.
   Both macros evaluate their argument more than once. */
#define IS_SURROGATE(u)                                     \
    ((u)[0] >= 0xD800 && (u)[0] <= 0xDBFF &&                \
     (u)[1] >= 0xDC00 && (u)[1] <= 0xDFFF)
#define AS_SURROGATE(u)                                     \
    (0x10000 + (((u)[0] - 0xD800) << 10) + ((u)[1] - 0xDC00))
122 
123 static int
_my_PyUnicode_AsSingleChar16(PyObject * unicode,cffi_char16_t * result,char * err_got)124 _my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
125                              char *err_got)
126 {
127     Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
128     if (PyUnicode_GET_SIZE(unicode) != 1) {
129         sprintf(err_got, "unicode string of length %zd",
130                 PyUnicode_GET_SIZE(unicode));
131         return -1;
132     }
133 #if Py_UNICODE_SIZE == 4
134     if (((unsigned int)u[0]) > 0xFFFF)
135     {
136         sprintf(err_got, "larger-than-0xFFFF character");
137         return -1;
138     }
139 #endif
140     *result = (cffi_char16_t)u[0];
141     return 0;
142 }
143 
144 static int
_my_PyUnicode_AsSingleChar32(PyObject * unicode,cffi_char32_t * result,char * err_got)145 _my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
146                              char *err_got)
147 {
148     Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
149     if (PyUnicode_GET_SIZE(unicode) == 1) {
150         *result = (cffi_char32_t)u[0];
151         return 0;
152     }
153 #if Py_UNICODE_SIZE == 2
154     if (PyUnicode_GET_SIZE(unicode) == 2 && IS_SURROGATE(u)) {
155         *result = AS_SURROGATE(u);
156         return 0;
157     }
158 #endif
159     sprintf(err_got, "unicode string of length %zd",
160             PyUnicode_GET_SIZE(unicode));
161     return -1;
162 }
163 
_my_PyUnicode_SizeAsChar16(PyObject * unicode)164 static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
165 {
166     Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
167     Py_ssize_t result = length;
168 
169 #if Py_UNICODE_SIZE == 4
170     Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
171     Py_ssize_t i;
172 
173     for (i=0; i<length; i++) {
174         if (u[i] > 0xFFFF)
175             result++;
176     }
177 #endif
178     return result;
179 }
180 
_my_PyUnicode_SizeAsChar32(PyObject * unicode)181 static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
182 {
183     Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
184     Py_ssize_t result = length;
185 
186 #if Py_UNICODE_SIZE == 2
187     Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
188     Py_ssize_t i;
189 
190     for (i=0; i<length-1; i++) {
191         if (IS_SURROGATE(u+i))
192             result--;
193     }
194 #endif
195     return result;
196 }
197 
/* Copy the characters of 'unicode' into the char16_t buffer 'result'.
   When Py_UNICODE_SIZE == 4, a character above 0xFFFF is written as a
   surrogate pair (two units), and a character above 0x10FFFF sets
   ValueError and returns -1.  Returns 0 on success.
   'resultlen' is not consulted by this function. */
static int _my_PyUnicode_AsChar16(PyObject *unicode,
                                  cffi_char16_t *result,
                                  Py_ssize_t resultlen)
{
    Py_ssize_t remaining = PyUnicode_GET_SIZE(unicode);
    Py_UNICODE *src = PyUnicode_AS_UNICODE(unicode);
    cffi_char16_t *dst = result;

    for (; remaining > 0; remaining--, src++) {
#if Py_UNICODE_SIZE == 2
        *dst++ = (cffi_char16_t)*src;
#else
        cffi_char32_t ordinal = *src;

        if (ordinal > 0x10FFFF) {
            PyErr_Format(PyExc_ValueError,
                         "unicode character out of range for "
                         "conversion to char16_t: 0x%x", (int)ordinal);
            return -1;
        }
        if (ordinal > 0xFFFF) {
            ordinal -= 0x10000;
            *dst++ = 0xD800 | (ordinal >> 10);
            *dst++ = 0xDC00 | (ordinal & 0x3FF);
        }
        else {
            *dst++ = ordinal;
        }
#endif
    }
    return 0;
}
227 
/* Fill 'result' with 'resultlen' char32_t values read from 'unicode'.
   When Py_UNICODE_SIZE == 2, a surrogate pair in the source is decoded
   into a single value and consumes two source units.  Always returns
   0. */
static int _my_PyUnicode_AsChar32(PyObject *unicode,
                                  cffi_char32_t *result,
                                  Py_ssize_t resultlen)
{
    Py_UNICODE *src = PyUnicode_AS_UNICODE(unicode);
    cffi_char32_t *dst = result;
    Py_ssize_t left;

    for (left = resultlen; left > 0; left--) {
        cffi_char32_t ordinal = *src;
#if Py_UNICODE_SIZE == 2
        if (IS_SURROGATE(src)) {
            ordinal = AS_SURROGATE(src);
            src++;      /* skip the first half of the pair */
        }
#endif
        src++;
        *dst++ = ordinal;
    }
    return 0;
}
247