1 /*
2  * _codecs_kr.c: Codecs collection for Korean encodings
3  *
4  * Written by Hye-Shik Chang <perky@FreeBSD.org>
5  */
6 
7 #include "cjkcodecs.h"
8 #include "mappings_kr.h"
9 
10 /*
11  * EUC-KR codec
12  */
13 
/* Lead byte that precedes every jamo byte of a make-up sequence. */
#define EUCKR_JAMO_FIRSTBYTE    0xA4
/* Second byte of the pair that opens a make-up sequence; also the
 * trailing-consonant byte used when a syllable has no final. */
#define EUCKR_JAMO_FILLER       0xD4

/*
 * Second bytes emitted by the EUC-KR encoder after EUCKR_JAMO_FIRSTBYTE.
 * With s = code point - 0xac00, the tables are indexed by s / 588,
 * (s / 28) % 21 and s % 28 respectively.
 */
static const unsigned char u2cgk_choseong[19] = {
    /*  0.. 4 */ 0xa1, 0xa2, 0xa4, 0xa7, 0xa8,
    /*  5.. 9 */ 0xa9, 0xb1, 0xb2, 0xb3, 0xb5,
    /* 10..14 */ 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
    /* 15..18 */ 0xbb, 0xbc, 0xbd, 0xbe
};

static const unsigned char u2cgk_jungseong[21] = {
    /*  0.. 6 */ 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
    /*  7..13 */ 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc,
    /* 14..20 */ 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3
};

/* Entry 0 is the filler byte: a syllable with no trailing consonant. */
static const unsigned char u2cgk_jongseong[28] = {
    /*  0.. 6 */ 0xd4, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6,
    /*  7..13 */ 0xa7, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae,
    /* 14..20 */ 0xaf, 0xb0, 0xb1, 0xb2, 0xb4, 0xb5, 0xb6,
    /* 21..27 */ 0xb7, 0xb8, 0xba, 0xbb, 0xbc, 0xbd, 0xbe
};
33 
ENCODER(euc_kr)34 ENCODER(euc_kr)
35 {
36     while (*inpos < inlen) {
37         Py_UCS4 c = INCHAR1;
38         DBCHAR code;
39 
40         if (c < 0x80) {
41             WRITEBYTE1((unsigned char)c);
42             NEXT(1, 1);
43             continue;
44         }
45 
46         if (c > 0xFFFF)
47             return 1;
48 
49         REQUIRE_OUTBUF(2);
50         if (TRYMAP_ENC(cp949, code, c))
51             ;
52         else
53             return 1;
54 
55         if ((code & 0x8000) == 0) {
56             /* KS X 1001 coded character */
57             OUTBYTE1((code >> 8) | 0x80);
58             OUTBYTE2((code & 0xFF) | 0x80);
59             NEXT(1, 2);
60         }
61         else {
62             /* Mapping is found in CP949 extension,
63                but we encode it in KS X 1001:1998 Annex 3,
64                make-up sequence for EUC-KR. */
65 
66             REQUIRE_OUTBUF(8);
67 
68             /* syllable composition precedence */
69             OUTBYTE1(EUCKR_JAMO_FIRSTBYTE);
70             OUTBYTE2(EUCKR_JAMO_FILLER);
71 
72             /* All code points in CP949 extension are in unicode
73              * Hangul Syllable area. */
74             assert(0xac00 <= c && c <= 0xd7a3);
75             c -= 0xac00;
76 
77             OUTBYTE3(EUCKR_JAMO_FIRSTBYTE);
78             OUTBYTE4(u2cgk_choseong[c / 588]);
79             NEXT_OUT(4);
80 
81             OUTBYTE1(EUCKR_JAMO_FIRSTBYTE);
82             OUTBYTE2(u2cgk_jungseong[(c / 28) % 21]);
83             OUTBYTE3(EUCKR_JAMO_FIRSTBYTE);
84             OUTBYTE4(u2cgk_jongseong[c % 28]);
85             NEXT(1, 4);
86         }
87     }
88 
89     return 0;
90 }
91 
/* Table marker: this jamo byte is not valid in the given position. */
#define NONE    127

/*
 * Reverse lookups used by the EUC-KR decoder for make-up sequences.
 * Both tables are indexed by (jamo byte - 0xa1) and cover [A1, BE].
 */
static const unsigned char cgk2u_choseong[] = {
    /* a1.. */    0,    1, NONE,    2, NONE, NONE,
    /* a7.. */    3,    4,    5, NONE, NONE, NONE,
    /* ad.. */ NONE, NONE, NONE, NONE,    6,    7,
    /* b3.. */    8, NONE,    9,   10,   11,   12,
    /* b9.. */   13,   14,   15,   16,   17,   18
};

static const unsigned char cgk2u_jongseong[] = {
    /* a1.. */    1,    2,    3,    4,    5,    6,
    /* a7.. */    7, NONE,    8,    9,   10,   11,
    /* ad.. */   12,   13,   14,   15,   16,   17,
    /* b3.. */ NONE,   18,   19,   20,   21,   22,
    /* b9.. */ NONE,   23,   24,   25,   26,   27
};
106 
DECODER(euc_kr)107 DECODER(euc_kr)
108 {
109     while (inleft > 0) {
110         unsigned char c = INBYTE1;
111         Py_UCS4 decoded;
112 
113         if (c < 0x80) {
114             OUTCHAR(c);
115             NEXT_IN(1);
116             continue;
117         }
118 
119         REQUIRE_INBUF(2);
120 
121         if (c == EUCKR_JAMO_FIRSTBYTE &&
122             INBYTE2 == EUCKR_JAMO_FILLER) {
123             /* KS X 1001:1998 Annex 3 make-up sequence */
124             DBCHAR cho, jung, jong;
125 
126             REQUIRE_INBUF(8);
127             if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
128                 (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
129                 (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
130                 return 1;
131 
132             c = (*inbuf)[3];
133             if (0xa1 <= c && c <= 0xbe)
134                 cho = cgk2u_choseong[c - 0xa1];
135             else
136                 cho = NONE;
137 
138             c = (*inbuf)[5];
139             jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
140 
141             c = (*inbuf)[7];
142             if (c == EUCKR_JAMO_FILLER)
143                 jong = 0;
144             else if (0xa1 <= c && c <= 0xbe)
145                 jong = cgk2u_jongseong[c - 0xa1];
146             else
147                 jong = NONE;
148 
149             if (cho == NONE || jung == NONE || jong == NONE)
150                 return 1;
151 
152             OUTCHAR(0xac00 + cho*588 + jung*28 + jong);
153             NEXT_IN(8);
154         }
155         else if (TRYMAP_DEC(ksx1001, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
156             OUTCHAR(decoded);
157             NEXT_IN(2);
158         }
159         else
160             return 1;
161     }
162 
163     return 0;
164 }
165 #undef NONE
166 
167 
168 /*
169  * CP949 codec
170  */
171 
ENCODER(cp949)172 ENCODER(cp949)
173 {
174     while (*inpos < inlen) {
175         Py_UCS4 c = INCHAR1;
176         DBCHAR code;
177 
178         if (c < 0x80) {
179             WRITEBYTE1((unsigned char)c);
180             NEXT(1, 1);
181             continue;
182         }
183 
184         if (c > 0xFFFF)
185             return 1;
186 
187         REQUIRE_OUTBUF(2);
188         if (TRYMAP_ENC(cp949, code, c))
189             ;
190         else
191             return 1;
192 
193         OUTBYTE1((code >> 8) | 0x80);
194         if (code & 0x8000)
195             OUTBYTE2(code & 0xFF); /* MSB set: CP949 */
196         else
197             OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: ks x 1001 */
198         NEXT(1, 2);
199     }
200 
201     return 0;
202 }
203 
DECODER(cp949)204 DECODER(cp949)
205 {
206     while (inleft > 0) {
207         unsigned char c = INBYTE1;
208         Py_UCS4 decoded;
209 
210         if (c < 0x80) {
211             OUTCHAR(c);
212             NEXT_IN(1);
213             continue;
214         }
215 
216         REQUIRE_INBUF(2);
217         if (TRYMAP_DEC(ksx1001, decoded, c ^ 0x80, INBYTE2 ^ 0x80))
218             OUTCHAR(decoded);
219         else if (TRYMAP_DEC(cp949ext, decoded, c, INBYTE2))
220             OUTCHAR(decoded);
221         else
222             return 1;
223 
224         NEXT_IN(2);
225     }
226 
227     return 0;
228 }
229 
230 
231 /*
232  * JOHAB codec
233  */
234 
/*
 * 5-bit field values the JOHAB encoder packs into a syllable code
 * (choseong << 10, jungseong << 5, jongseong in the low bits).  With
 * s = code point - 0xac00 the tables are indexed by s / 588,
 * (s / 28) % 21 and s % 28.  Each array is declared with 32 slots but
 * only the leading 19 / 21 / 28 are given values; the rest are zero.
 */
static const unsigned char u2johabidx_choseong[32] = {
    /*  0.. 4 */ 0x02, 0x03, 0x04, 0x05, 0x06,
    /*  5.. 9 */ 0x07, 0x08, 0x09, 0x0a, 0x0b,
    /* 10..14 */ 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
    /* 15..18 */ 0x11, 0x12, 0x13, 0x14,
};

/* Skips 0x08-0x09, 0x10-0x11 and 0x18-0x19. */
static const unsigned char u2johabidx_jungseong[32] = {
    /*  0.. 6 */ 0x03, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b,
    /*  7..13 */ 0x0c, 0x0d, 0x0e, 0x0f, 0x12, 0x13, 0x14,
    /* 14..20 */ 0x15, 0x16, 0x17, 0x1a, 0x1b, 0x1c, 0x1d,
};

/* Skips 0x12. */
static const unsigned char u2johabidx_jongseong[32] = {
    /*  0.. 6 */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    /*  7..13 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    /* 14..20 */ 0x0f, 0x10, 0x11, 0x13, 0x14, 0x15, 0x16,
    /* 21..27 */ 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
};
252 static const DBCHAR u2johabjamo[] = {
253             0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
254     0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
255     0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
256     0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
257     0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
258     0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
259     0x8741, 0x8761, 0x8781, 0x87a1,
260 };
261 
ENCODER(johab)262 ENCODER(johab)
263 {
264     while (*inpos < inlen) {
265         Py_UCS4 c = INCHAR1;
266         DBCHAR code;
267 
268         if (c < 0x80) {
269             WRITEBYTE1((unsigned char)c);
270             NEXT(1, 1);
271             continue;
272         }
273 
274         if (c > 0xFFFF)
275             return 1;
276 
277         REQUIRE_OUTBUF(2);
278 
279         if (c >= 0xac00 && c <= 0xd7a3) {
280             c -= 0xac00;
281             code = 0x8000 |
282                 (u2johabidx_choseong[c / 588] << 10) |
283                 (u2johabidx_jungseong[(c / 28) % 21] << 5) |
284                 u2johabidx_jongseong[c % 28];
285         }
286         else if (c >= 0x3131 && c <= 0x3163)
287             code = u2johabjamo[c - 0x3131];
288         else if (TRYMAP_ENC(cp949, code, c)) {
289             unsigned char c1, c2, t2;
290             unsigned short t1;
291 
292             assert((code & 0x8000) == 0);
293             c1 = code >> 8;
294             c2 = code & 0xff;
295             if (((c1 >= 0x21 && c1 <= 0x2c) ||
296                 (c1 >= 0x4a && c1 <= 0x7d)) &&
297                 (c2 >= 0x21 && c2 <= 0x7e)) {
298                 t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
299                           (c1 - 0x21 + 0x197));
300                 t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
301                 OUTBYTE1(t1 >> 1);
302                 OUTBYTE2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43);
303                 NEXT(1, 2);
304                 continue;
305             }
306             else
307                 return 1;
308         }
309         else
310             return 1;
311 
312         OUTBYTE1(code >> 8);
313         OUTBYTE2(code & 0xff);
314         NEXT(1, 2);
315     }
316 
317     return 0;
318 }
319 
/* Table markers for the JOHAB decoder: FILL is a field the decoder
 * treats as an absent component, NONE is a field value it rejects. */
#define FILL 0xfd
#define NONE 0xff

/*
 * johabidx_*: 5-bit field value -> component index used when the JOHAB
 * decoder builds a syllable as 0xac00 + cho * 588 + jung * 28 + jong.
 */
static const unsigned char johabidx_choseong[32] = {
    /* 0x00 */ NONE, FILL, 0x00, 0x01,
    /* 0x04 */ 0x02, 0x03, 0x04, 0x05,
    /* 0x08 */ 0x06, 0x07, 0x08, 0x09,
    /* 0x0c */ 0x0a, 0x0b, 0x0c, 0x0d,
    /* 0x10 */ 0x0e, 0x0f, 0x10, 0x11,
    /* 0x14 */ 0x12, NONE, NONE, NONE,
    /* 0x18 */ NONE, NONE, NONE, NONE,
    /* 0x1c */ NONE, NONE, NONE, NONE,
};

static const unsigned char johabidx_jungseong[32] = {
    /* 0x00 */ NONE, NONE, FILL, 0x00,
    /* 0x04 */ 0x01, 0x02, 0x03, 0x04,
    /* 0x08 */ NONE, NONE, 0x05, 0x06,
    /* 0x0c */ 0x07, 0x08, 0x09, 0x0a,
    /* 0x10 */ NONE, NONE, 0x0b, 0x0c,
    /* 0x14 */ 0x0d, 0x0e, 0x0f, 0x10,
    /* 0x18 */ NONE, NONE, 0x11, 0x12,
    /* 0x1c */ 0x13, 0x14, NONE, NONE,
};

static const unsigned char johabidx_jongseong[32] = {
    /* 0x00 */ NONE, FILL, 0x01, 0x02,
    /* 0x04 */ 0x03, 0x04, 0x05, 0x06,
    /* 0x08 */ 0x07, 0x08, 0x09, 0x0a,
    /* 0x0c */ 0x0b, 0x0c, 0x0d, 0x0e,
    /* 0x10 */ 0x0f, 0x10, NONE, 0x11,
    /* 0x14 */ 0x12, 0x13, 0x14, 0x15,
    /* 0x18 */ 0x16, 0x17, 0x18, 0x19,
    /* 0x1c */ 0x1a, 0x1b, NONE, NONE,
};

/*
 * johabjamo_*: 5-bit field value -> low byte that the JOHAB decoder ORs
 * into 0x3100 when only that one component is present.
 */
static const unsigned char johabjamo_choseong[32] = {
    /* 0x00 */ NONE, FILL, 0x31, 0x32,
    /* 0x04 */ 0x34, 0x37, 0x38, 0x39,
    /* 0x08 */ 0x41, 0x42, 0x43, 0x45,
    /* 0x0c */ 0x46, 0x47, 0x48, 0x49,
    /* 0x10 */ 0x4a, 0x4b, 0x4c, 0x4d,
    /* 0x14 */ 0x4e, NONE, NONE, NONE,
    /* 0x18 */ NONE, NONE, NONE, NONE,
    /* 0x1c */ NONE, NONE, NONE, NONE,
};

static const unsigned char johabjamo_jungseong[32] = {
    /* 0x00 */ NONE, NONE, FILL, 0x4f,
    /* 0x04 */ 0x50, 0x51, 0x52, 0x53,
    /* 0x08 */ NONE, NONE, 0x54, 0x55,
    /* 0x0c */ 0x56, 0x57, 0x58, 0x59,
    /* 0x10 */ NONE, NONE, 0x5a, 0x5b,
    /* 0x14 */ 0x5c, 0x5d, 0x5e, 0x5f,
    /* 0x18 */ NONE, NONE, 0x60, 0x61,
    /* 0x1c */ 0x62, 0x63, NONE, NONE,
};

static const unsigned char johabjamo_jongseong[32] = {
    /* 0x00 */ NONE, FILL, 0x31, 0x32,
    /* 0x04 */ 0x33, 0x34, 0x35, 0x36,
    /* 0x08 */ 0x37, 0x39, 0x3a, 0x3b,
    /* 0x0c */ 0x3c, 0x3d, 0x3e, 0x3f,
    /* 0x10 */ 0x40, 0x41, NONE, 0x42,
    /* 0x14 */ 0x44, 0x45, 0x46, 0x47,
    /* 0x18 */ 0x48, 0x4a, 0x4b, 0x4c,
    /* 0x1c */ 0x4d, 0x4e, NONE, NONE,
};
360 
DECODER(johab)361 DECODER(johab)
362 {
363     while (inleft > 0) {
364         unsigned char c = INBYTE1, c2;
365         Py_UCS4 decoded;
366 
367         if (c < 0x80) {
368             OUTCHAR(c);
369             NEXT_IN(1);
370             continue;
371         }
372 
373         REQUIRE_INBUF(2);
374         c2 = INBYTE2;
375 
376         if (c < 0xd8) {
377             /* johab hangul */
378             unsigned char c_cho, c_jung, c_jong;
379             unsigned char i_cho, i_jung, i_jong;
380 
381             c_cho = (c >> 2) & 0x1f;
382             c_jung = ((c << 3) | c2 >> 5) & 0x1f;
383             c_jong = c2 & 0x1f;
384 
385             i_cho = johabidx_choseong[c_cho];
386             i_jung = johabidx_jungseong[c_jung];
387             i_jong = johabidx_jongseong[c_jong];
388 
389             if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
390                 return 1;
391 
392             /* we don't use U+1100 hangul jamo yet. */
393             if (i_cho == FILL) {
394                 if (i_jung == FILL) {
395                     if (i_jong == FILL)
396                         OUTCHAR(0x3000);
397                     else
398                         OUTCHAR(0x3100 |
399                             johabjamo_jongseong[c_jong]);
400                 }
401                 else {
402                     if (i_jong == FILL)
403                         OUTCHAR(0x3100 |
404                             johabjamo_jungseong[c_jung]);
405                     else
406                         return 1;
407                 }
408             } else {
409                 if (i_jung == FILL) {
410                     if (i_jong == FILL)
411                         OUTCHAR(0x3100 |
412                             johabjamo_choseong[c_cho]);
413                     else
414                         return 1;
415                 }
416                 else
417                     OUTCHAR(0xac00 +
418                         i_cho * 588 +
419                         i_jung * 28 +
420                         (i_jong == FILL ? 0 : i_jong));
421             }
422             NEXT_IN(2);
423         } else {
424             /* KS X 1001 except hangul jamos and syllables */
425             if (c == 0xdf || c > 0xf9 ||
426                 c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
427                 (c2 & 0x7f) == 0x7f ||
428                 (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
429                 return 1;
430             else {
431                 unsigned char t1, t2;
432 
433                 t1 = (c < 0xe0 ? 2 * (c - 0xd9) :
434                          2 * c - 0x197);
435                 t2 = (c2 < 0x91 ? c2 - 0x31 : c2 - 0x43);
436                 t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
437                 t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
438 
439                 if (TRYMAP_DEC(ksx1001, decoded, t1, t2)) {
440                     OUTCHAR(decoded);
441                     NEXT_IN(2);
442                 }
443                 else {
444                     return 1;
445                 }
446             }
447         }
448     }
449 
450     return 0;
451 }
452 #undef NONE
453 #undef FILL
454 
455 
/*
 * Mapping tables referenced by the codecs in this file.  ksx1001 and
 * cp949ext are only consulted through TRYMAP_DEC, cp949 only through
 * TRYMAP_ENC.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(ksx1001)
  MAPPING_ENCONLY(cp949)
  MAPPING_DECONLY(cp949ext)
END_MAPPINGS_LIST

/* One entry per ENCODER()/DECODER() pair defined in this file. */
BEGIN_CODECS_LIST
  CODEC_STATELESS(euc_kr)
  CODEC_STATELESS(cp949)
  CODEC_STATELESS(johab)
END_CODECS_LIST

/* NOTE(review): presumably expands to the module boilerplate for the
 * "kr" codec collection; the macro comes from cjkcodecs.h -- confirm. */
I_AM_A_MODULE_FOR(kr)
469