1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2000-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv2022.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2000feb03
14 * created by: Markus W. Scherer
15 *
16 * Change history:
17 *
18 * 06/29/2000 helena Major rewrite of the callback APIs.
19 * 08/08/2000 Ram Included support for ISO-2022-JP-2
20 * Changed implementation of toUnicode
21 * function
22 * 08/21/2000 Ram Added support for ISO-2022-KR
* 08/29/2000 Ram Separated implementation of EBCDIC to
* ucnvebdc.c
25 * 09/20/2000 Ram Added support for ISO-2022-CN
26 * Added implementations for getNextUChar()
27 * for specific 2022 country variants.
28 * 10/31/2000 Ram Implemented offsets logic functions
29 */
30
31 #include "unicode/utypes.h"
32
33 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34
35 #include "unicode/ucnv.h"
36 #include "unicode/uset.h"
37 #include "unicode/ucnv_err.h"
38 #include "unicode/ucnv_cb.h"
39 #include "unicode/utf16.h"
40 #include "ucnv_imp.h"
41 #include "ucnv_bld.h"
42 #include "ucnv_cnv.h"
43 #include "ucnvmbcs.h"
44 #include "cstring.h"
45 #include "cmemory.h"
46 #include "uassert.h"
47
48 #ifdef U_ENABLE_GENERIC_ISO_2022
49 /*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 * its designation sequences, single shifts with return to the previous state,
56 * switch-with-no-return to UTF-16BE or similar, etc.
57 * This is unlike the language-specific variants like ISO-2022-JP which
58 * require a much smaller repertoire of ISO-2022 features.
59 * These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 * but rather always one of the language-specific variants.
62 * Note that ICU's generic ISO-2022 converter has always output one escape
63 * sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 * the previous converter is closed and a new one opened,
66 * without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 * reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 * This means, for example, that when ISO-8859-7 is designated, the following
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 * The ICU ISO-2022 converter does not handle this - and has no information
73 * about which subconverter would have to be shifted vs. which is designed
74 * for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78 #endif
79
#if !UCONFIG_ONLY_HTML_CONVERSION
/* One-byte string holding the Shift-In (SI, 0x0F) control character. */
static const char SHIFT_IN_STR[] = "\x0F";
/* A matching Shift-Out string ("\x0E") is currently not needed. */
#endif

/* C0 control and whitespace code points referenced by the converters. */
#define CR      0x0D    /* carriage return */
#define LF      0x0A    /* line feed */
#define H_TAB   0x09    /* horizontal tab */
#define V_TAB   0x0B    /* vertical tab */
#define SPACE   0x20    /* space */
90
/* Inclusive code point bounds HWKANA_START..HWKANA_END (U+FF61..U+FF9F). */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
95
/*
 * Native (8-bit, "GR") byte ranges of the graphic character sets.
 *
 * In ISO 2022 these bytes are transmitted with 0x80 subtracted:
 *   94-character sets: native A1..FE  ->  ISO 2022 bytes 21..7E
 *   96-character sets: native A0..FF  ->  ISO 2022 bytes 20..7F
 *
 * C1 control codes (native bytes 80..9F) must not be encoded
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START = 0xa1,
    GR94_END   = 0xfe,
    GR96_START = 0xa0,
    GR96_END   = 0xff
};
110
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 *
 * The test is a bit-set lookup: for c below 0x20, bit number c is checked in
 * a mask that has exactly the bits 0x0e (SO), 0x0f (SI) and 0x1b (ESC) set;
 * that mask equals 0x0800c000.
 * Note that the argument is evaluated twice.
 */
#define IS_2022_CONTROL(c) \
    (((c)<0x20) && \
     (((uint32_t)1<<(c)) & (((uint32_t)1<<0x0e)|((uint32_t)1<<0x0f)|((uint32_t)1<<0x1b)))!=0)
118
/*
 * Charset/state numbers for the ISO-2022-JP and ISO-2022-CN implementations.
 *
 * The JP and CN groups deliberately reuse the same small integers:
 * a value is only meaningful together with the variant it belongs to.
 */
typedef enum {
    /* ---- shared values ---- */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE,                  /* 0x11 */

    /* ---- JP ---- */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,          /* Halfwidth Katakana 7 bit */

    /* ---- CN ---- */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1,                /* 0x21 */
    CNS_11643_2,                /* 0x22 */
    CNS_11643_3,                /* 0x23 */
    CNS_11643_4,                /* 0x24 */
    CNS_11643_5,                /* 0x25 */
    CNS_11643_6,                /* 0x26 */
    CNS_11643_7                 /* 0x27 */
} StateEnum;
157
/*
 * IS_JP_DBCS(cs): is the StateEnum charset value cs a DBCS charset?
 * With HTML-only conversion only JISX208 qualifies; otherwise every value
 * in the range JISX208..KSC5601 does.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
#   define IS_JP_DBCS(cs) (JISX208==(cs))
#else
#   define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
#endif

/* CSM(cs): charset mask, i.e. a value with only bit number cs set. */
#define CSM(cs) ((uint16_t)1<<(cs))
166
167 /*
168 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170 *
171 * Note: The converter uses some leniency:
172 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173 * all versions, not just JIS7 and JIS8.
174 * - ICU does not distinguish between different versions of JIS X 0208.
175 */
176 #if UCONFIG_ONLY_HTML_CONVERSION
177 enum { MAX_JA_VERSION=0 };
178 #else
179 enum { MAX_JA_VERSION=4 };
180 #endif
181 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
182 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
183 #if !UCONFIG_ONLY_HTML_CONVERSION
184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
185 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
186 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
187 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
188 #endif
189 };
190
/* Kind of converter recorded in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1,         /* 1 */
    SBCS,           /* 2 */
    DBCS,           /* 3 */
    MBCS,           /* 4 */
    HWKANA          /* 5 */
} Cnv2022Type;
199
/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
205
206 #define UCNV_OPTIONS_VERSION_MASK 0xf
207 #define UCNV_2022_MAX_CONVERTERS 10
208
209 typedef struct{
210 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
211 UConverter *currentConverter;
212 Cnv2022Type currentType;
213 ISO2022State toU2022State, fromU2022State;
214 uint32_t key;
215 uint32_t version;
216 #ifdef U_ENABLE_GENERIC_ISO_2022
217 UBool isFirstBuffer;
218 #endif
219 UBool isEmptySegment;
220 char name[30];
221 char locale[3];
222 }UConverterDataISO2022;
223
224 /* Protos */
225 /* ISO-2022 ----------------------------------------------------------------- */
226
227 /*Forward declaration */
228 U_CFUNC void U_CALLCONV
229 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230 UErrorCode * err);
231 U_CFUNC void U_CALLCONV
232 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233 UErrorCode * err);
234
#define ESC_2022 0x1B /*ESC*/

/* Result of matching the bytes seen so far against the escape sequence tables. */
typedef enum
{
    /* Doesn't correspond to a valid iso 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
244
245 /*
246 * The way these state transition arrays work is:
247 * ex : ESC$B is the sequence for JISX208
248 * a) First Iteration: char is ESC
249 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250 * int x = normalize_esq_chars_2022[27] which is equal to 1
251 * ii) Search for this value in escSeqStateTable_Key_2022[]
252 * value of x is stored at escSeqStateTable_Key_2022[0]
253 * iii) Save this index as offset
254 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256 * b) Switch on this state and continue to next char
257 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258 * which is normalize_esq_chars_2022[36] == 4
259 * ii) x is currently 1(from above)
260 * x<<=5 -- x is now 32
261 * x+=normalize_esq_chars_2022[36]
262 * now x is 36
263 * iii) Search for this value in escSeqStateTable_Key_2022[]
264 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267 * c) Switch on this state and continue to next char
268 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269 * ii) x is currently 36 (from above)
270 * x<<=5 -- x is now 1152
271 * x+=normalize_esq_chars_2022[66]
272 * now x is 1161
* iii) Search for this value in escSeqStateTable_Key_2022[]
* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
278 */
279
280
/*Below are the 3 arrays depicting a state transition table*/

/*
 * Maps a byte to the small non-zero number (1..29) that stands for it in an
 * escape sequence key, or to 0 if the byte never occurs in an escape sequence.
 * Laid out 16 entries per row, so a row corresponds to one ASCII "column".
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F */
/* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  /* 0x1B ESC */
/* 0x20 */  0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,  /* $ % &   ( ) * +   - . / */
/* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x40 */  5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,  /* @ A..O */
/* 0x50 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,  /* R and Z */
/* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x80 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x90 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xA0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xB0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xC0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xD0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xE0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xF0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
312
313 #ifdef U_ENABLE_GENERIC_ISO_2022
314 /*
315 * When the generic ISO-2022 converter is completely removed, not just disabled
316 * per #ifdef, then the following state table and the associated tables that are
317 * dimensioned with MAX_STATES_2022 should be trimmed.
318 *
319 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320 * the associated escape sequences starting with ESC ( B should be removed.
321 * This includes the ones with key values 1097 and all of the ones above 1000000.
322 *
323 * For the latter, the tables can simply be truncated.
324 * For the former, since the tables must be kept parallel, it is probably best
325 * to simply duplicate an adjacent table cell, parallel in all tables.
326 *
327 * It may make sense to restructure the tables, especially by using small search
328 * tables for the variants instead of indexing them parallel to the table here.
329 */
330 #endif
331
/* Number of entries in each of the parallel escape sequence tables. */
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape sequences and escape sequence prefixes.
 * A key is built byte by byte as (previousKey << 5) + normalize_esq_chars_2022[byte].
 * The keys are in strictly ascending order because getKey_2022() binary-searches them.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0.. 9 */ 1,        34,       36,       39,       55,       57,       60,       61,       1093,     1096,
    /* 10..19 */ 1097,     1098,     1099,     1100,     1101,     1102,     1103,     1104,     1105,     1106,
    /* 20..29 */ 1109,     1154,     1157,     1160,     1161,     1176,     1178,     1179,     1254,     1257,
    /* 30..39 */ 1768,     1773,     1957,     35105,    36933,    36936,    36937,    36938,    36939,    36940,
    /* 40..49 */ 36942,    36943,    36944,    36945,    36946,    36947,    36948,    37640,    37642,    37644,
    /* 50..59 */ 37646,    37711,    37744,    37745,    37746,    37747,    37748,    40133,    40136,    40138,
    /* 60..69 */ 40139,    40140,    40141,    1123363,  35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70..73 */ 35947631, 35947635, 35947636, 35947638
};
345
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each entry of escSeqStateTable_Key_2022[] (same index);
 * NULL where no name is associated with the key.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0.. 9 */
    NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, "latin1", "latin1",
    /* 10..19 */
    "latin1", "ibm-865", "ibm-865", "ibm-865", "ibm-865",
    "ibm-865", "ibm-865", "JISX0201", "JISX0201", "latin1",
    /* 20..29 */
    "latin1", NULL, "JISX-208", "ibm-5478", "JISX-208",
    NULL, NULL, NULL, NULL, "UTF8",
    /* 30..39 */
    "ISO-8859-1", "ISO-8859-7", "JIS-X-208", NULL, "ibm-955",
    "ibm-367", "ibm-952", "ibm-949", "JISX-212", "ibm-1383",
    /* 40..49 */
    "ibm-952", "ibm-964", "ibm-964", "ibm-964", "ibm-964",
    "ibm-964", "ibm-964", "ibm-5478", "ibm-949", "ISO-IR-165",
    /* 50..59 */
    "CNS-11643-1992,1", "CNS-11643-1992,2", "CNS-11643-1992,3", "CNS-11643-1992,4", "CNS-11643-1992,5",
    "CNS-11643-1992,6", "CNS-11643-1992,7", "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60..69 */
    "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL, "latin1",
    "ibm-912", "ibm-913", "ibm-914", "ibm-813", "ibm-1089",
    /* 70..73 */
    "ibm-920", "ibm-915", "ibm-915", "latin1"
};

#endif
362
363 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364 /* 0 1 2 3 4 5 6 7 8 9 */
365 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
366 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
367 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
368 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
369 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
371 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
372 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
373 };
374
/*
 * Identifies the ISO-2022 variant for changeState_2022().
 * Each constant has a fixed value, whichever variants are compiled in.
 */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
386
387 /*********** ISO 2022 Converter Protos ***********/
388 static void U_CALLCONV
389 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
390
391 static void U_CALLCONV
392 _ISO2022Close(UConverter *converter);
393
394 static void U_CALLCONV
395 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396
397 U_CDECL_BEGIN
398 static const char * U_CALLCONV
399 _ISO2022getName(const UConverter* cnv);
400 U_CDECL_END
401
402 static void U_CALLCONV
403 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404
405 U_CDECL_BEGIN
406 static UConverter * U_CALLCONV
407 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408
409 U_CDECL_END
410
411 #ifdef U_ENABLE_GENERIC_ISO_2022
412 static void U_CALLCONV
413 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414 #endif
415
416 namespace {
417
418 /*const UConverterSharedData _ISO2022Data;*/
419 extern const UConverterSharedData _ISO2022JPData;
420
421 #if !UCONFIG_ONLY_HTML_CONVERSION
422 extern const UConverterSharedData _ISO2022KRData;
423 extern const UConverterSharedData _ISO2022CNData;
424 #endif
425
426 } // namespace
427
428 /*************** Converter implementations ******************/
429
430 /* The purpose of this function is to get around gcc compiler warnings. */
431 static inline void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)432 fromUWriteUInt8(UConverter *cnv,
433 const char *bytes, int32_t length,
434 uint8_t **target, const char *targetLimit,
435 int32_t **offsets,
436 int32_t sourceIndex,
437 UErrorCode *pErrorCode)
438 {
439 char *targetChars = (char *)*target;
440 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441 offsets, sourceIndex, pErrorCode);
442 *target = (uint8_t*)targetChars;
443
444 }
445
446 static inline void
setInitialStateToUnicodeKR(UConverter *,UConverterDataISO2022 * myConverterData)447 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
448 if(myConverterData->version == 1) {
449 UConverter *cnv = myConverterData->currentConverter;
450
451 cnv->toUnicodeStatus=0; /* offset */
452 cnv->mode=0; /* state */
453 cnv->toULength=0; /* byteIndex */
454 }
455 }
456
457 static inline void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)458 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459 /* in ISO-2022-KR the designator sequence appears only once
460 * in a file so we append it only once
461 */
462 if( converter->charErrorBufferLength==0){
463
464 converter->charErrorBufferLength = 4;
465 converter->charErrorBuffer[0] = 0x1b;
466 converter->charErrorBuffer[1] = 0x24;
467 converter->charErrorBuffer[2] = 0x29;
468 converter->charErrorBuffer[3] = 0x43;
469 }
470 if(myConverterData->version == 1) {
471 UConverter *cnv = myConverterData->currentConverter;
472
473 cnv->fromUChar32=0;
474 cnv->fromUnicodeStatus=1; /* prevLength */
475 }
476 }
477
478 static void U_CALLCONV
_ISO2022Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * errorCode)479 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
480
481 char myLocale[6]={' ',' ',' ',' ',' ',' '};
482
483 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484 if(cnv->extraInfo != NULL) {
485 UConverterNamePieces stackPieces;
486 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
488 uint32_t version;
489
490 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
492 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
493 myConverterData->currentType = ASCII1;
494 cnv->fromUnicodeStatus =FALSE;
495 if(pArgs->locale){
496 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
497 }
498 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499 myConverterData->version = version;
500 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
501 (myLocale[2]=='_' || myLocale[2]=='\0'))
502 {
503 /* open the required converters and cache them */
504 if(version>MAX_JA_VERSION) {
505 // ICU 55 fails to open a converter for an unsupported version.
506 // Previously, it fell back to version 0, but that would yield
507 // unexpected behavior.
508 *errorCode = U_MISSING_RESOURCE_ERROR;
509 return;
510 }
511 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512 myConverterData->myConverterArray[ISO8859_7] =
513 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514 }
515 myConverterData->myConverterArray[JISX208] =
516 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
517 if(jpCharsetMasks[version]&CSM(JISX212)) {
518 myConverterData->myConverterArray[JISX212] =
519 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520 }
521 if(jpCharsetMasks[version]&CSM(GB2312)) {
522 myConverterData->myConverterArray[GB2312] =
523 /* BEGIN android-changed */
524 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
525 /* END android-changed */
526 }
527 if(jpCharsetMasks[version]&CSM(KSC5601)) {
528 myConverterData->myConverterArray[KSC5601] =
529 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
530 }
531
532 /* set the function pointers to appropriate funtions */
533 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
534 uprv_strcpy(myConverterData->locale,"ja");
535
536 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
537 size_t len = uprv_strlen(myConverterData->name);
538 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
539 myConverterData->name[len+1]='\0';
540 }
541 #if !UCONFIG_ONLY_HTML_CONVERSION
542 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
543 (myLocale[2]=='_' || myLocale[2]=='\0'))
544 {
545 if(version>1) {
546 // ICU 55 fails to open a converter for an unsupported version.
547 // Previously, it fell back to version 0, but that would yield
548 // unexpected behavior.
549 *errorCode = U_MISSING_RESOURCE_ERROR;
550 return;
551 }
552 const char *cnvName;
553 if(version==1) {
554 cnvName="icu-internal-25546";
555 } else {
556 /* BEGIN android-changed */
557 cnvName="ksc_5601";
558 /* END android-changed */
559 myConverterData->version=version=0;
560 }
561 if(pArgs->onlyTestIsLoadable) {
562 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
563 uprv_free(cnv->extraInfo);
564 cnv->extraInfo=NULL;
565 return;
566 } else {
567 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
568 if (U_FAILURE(*errorCode)) {
569 _ISO2022Close(cnv);
570 return;
571 }
572
573 if(version==1) {
574 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
575 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
576 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
577 }else{
578 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
579 }
580
581 /* initialize the state variables */
582 setInitialStateToUnicodeKR(cnv, myConverterData);
583 setInitialStateFromUnicodeKR(cnv, myConverterData);
584
585 /* set the function pointers to appropriate funtions */
586 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
587 uprv_strcpy(myConverterData->locale,"ko");
588 }
589 }
590 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
591 (myLocale[2]=='_' || myLocale[2]=='\0'))
592 {
593 if(version>2) {
594 // ICU 55 fails to open a converter for an unsupported version.
595 // Previously, it fell back to version 0, but that would yield
596 // unexpected behavior.
597 *errorCode = U_MISSING_RESOURCE_ERROR;
598 return;
599 }
600
601 /* open the required converters and cache them */
602 /* BEGIN android-changed */
603 myConverterData->myConverterArray[GB2312_1] =
604 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
605 if(version==1) {
606 myConverterData->myConverterArray[ISO_IR_165] =
607 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
608 }
609 myConverterData->myConverterArray[CNS_11643] =
610 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
611 /* END android-changed */
612
613
614 /* set the function pointers to appropriate funtions */
615 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
616 uprv_strcpy(myConverterData->locale,"cn");
617
618 if (version==0){
619 myConverterData->version = 0;
620 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
621 }else if (version==1){
622 myConverterData->version = 1;
623 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
624 }else {
625 myConverterData->version = 2;
626 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
627 }
628 }
629 #endif // !UCONFIG_ONLY_HTML_CONVERSION
630 else{
631 #ifdef U_ENABLE_GENERIC_ISO_2022
632 myConverterData->isFirstBuffer = TRUE;
633
634 /* append the UTF-8 escape sequence */
635 cnv->charErrorBufferLength = 3;
636 cnv->charErrorBuffer[0] = 0x1b;
637 cnv->charErrorBuffer[1] = 0x25;
638 cnv->charErrorBuffer[2] = 0x42;
639
640 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
641 /* initialize the state variables */
642 uprv_strcpy(myConverterData->name,"ISO_2022");
643 #else
644 *errorCode = U_MISSING_RESOURCE_ERROR;
645 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
646 // data loading error code.
647 return;
648 #endif
649 }
650
651 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
652
653 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
654 _ISO2022Close(cnv);
655 }
656 } else {
657 *errorCode = U_MEMORY_ALLOCATION_ERROR;
658 }
659 }
660
661
662 static void U_CALLCONV
_ISO2022Close(UConverter * converter)663 _ISO2022Close(UConverter *converter) {
664 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
665 UConverterSharedData **array = myData->myConverterArray;
666 int32_t i;
667
668 if (converter->extraInfo != NULL) {
669 /*close the array of converter pointers and free the memory*/
670 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
671 if(array[i]!=NULL) {
672 ucnv_unloadSharedDataIfReady(array[i]);
673 }
674 }
675
676 ucnv_close(myData->currentConverter);
677
678 if(!converter->isExtraLocal){
679 uprv_free (converter->extraInfo);
680 converter->extraInfo = NULL;
681 }
682 }
683 }
684
685 static void U_CALLCONV
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)686 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
687 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
688 if(choice<=UCNV_RESET_TO_UNICODE) {
689 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
690 myConverterData->key = 0;
691 myConverterData->isEmptySegment = FALSE;
692 }
693 if(choice!=UCNV_RESET_TO_UNICODE) {
694 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
695 }
696 #ifdef U_ENABLE_GENERIC_ISO_2022
697 if(myConverterData->locale[0] == 0){
698 if(choice<=UCNV_RESET_TO_UNICODE) {
699 myConverterData->isFirstBuffer = TRUE;
700 myConverterData->key = 0;
701 if (converter->mode == UCNV_SO){
702 ucnv_close (myConverterData->currentConverter);
703 myConverterData->currentConverter=NULL;
704 }
705 converter->mode = UCNV_SI;
706 }
707 if(choice!=UCNV_RESET_TO_UNICODE) {
708 /* re-append UTF-8 escape sequence */
709 converter->charErrorBufferLength = 3;
710 converter->charErrorBuffer[0] = 0x1b;
711 converter->charErrorBuffer[1] = 0x28;
712 converter->charErrorBuffer[2] = 0x42;
713 }
714 }
715 else
716 #endif
717 {
718 /* reset the state variables */
719 if(myConverterData->locale[0] == 'k'){
720 if(choice<=UCNV_RESET_TO_UNICODE) {
721 setInitialStateToUnicodeKR(converter, myConverterData);
722 }
723 if(choice!=UCNV_RESET_TO_UNICODE) {
724 setInitialStateFromUnicodeKR(converter, myConverterData);
725 }
726 }
727 }
728 }
729
730 U_CDECL_BEGIN
731
732 static const char * U_CALLCONV
_ISO2022getName(const UConverter * cnv)733 _ISO2022getName(const UConverter* cnv){
734 if(cnv->extraInfo){
735 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
736 return myData->name;
737 }
738 return NULL;
739 }
740
741 U_CDECL_END
742
743
744 /*************** to unicode *******************/
745 /****************************************************************************
746 * Recognized escape sequences are
747 * <ESC>(B ASCII
748 * <ESC>.A ISO-8859-1
749 * <ESC>.F ISO-8859-7
750 * <ESC>(J JISX-201
751 * <ESC>(I JISX-201
752 * <ESC>$B JISX-208
753 * <ESC>$@ JISX-208
754 * <ESC>$(D JISX-212
755 * <ESC>$A GB2312
756 * <ESC>$(C KSC5601
757 */
758 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
759 /* 0 1 2 3 4 5 6 7 8 9 */
760 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
761 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
762 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
763 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
764 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
765 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
766 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
767 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
768 };
769
770 #if !UCONFIG_ONLY_HTML_CONVERSION
771 /*************** to unicode *******************/
772 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
773 /* 0 1 2 3 4 5 6 7 8 9 */
774 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
775 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
776 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
777 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
778 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
779 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
780 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
781 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
782 };
783 #endif
784
785
786 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)787 getKey_2022(char c,int32_t* key,int32_t* offset){
788 int32_t togo;
789 int32_t low = 0;
790 int32_t hi = MAX_STATES_2022;
791 int32_t oldmid=0;
792
793 togo = normalize_esq_chars_2022[(uint8_t)c];
794 if(togo == 0) {
795 /* not a valid character anywhere in an escape sequence */
796 *key = 0;
797 *offset = 0;
798 return INVALID_2022;
799 }
800 togo = (*key << 5) + togo;
801
802 while (hi != low) /*binary search*/{
803
804 int32_t mid = (hi+low) >> 1; /*Finds median*/
805
806 if (mid == oldmid)
807 break;
808
809 if (escSeqStateTable_Key_2022[mid] > togo){
810 hi = mid;
811 }
812 else if (escSeqStateTable_Key_2022[mid] < togo){
813 low = mid;
814 }
815 else /*we found it*/{
816 *key = togo;
817 *offset = mid;
818 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
819 }
820 oldmid = mid;
821
822 }
823
824 *key = 0;
825 *offset = 0;
826 return INVALID_2022;
827 }
828
/*
 * Runs the escape-sequence state machine to determine the escape sequence -
 * codepage correspondence.
 *
 * Bytes are consumed from *source (up to sourceLimit), appended to
 * _this->toUBytes[] and fed one at a time to getKey_2022(). The running key is
 * loaded from and stored back into the converter's UConverterDataISO2022, so
 * an escape sequence that is split across input buffers resumes on the next call.
 *
 * Outcomes:
 * - input ends while the sequence is still a valid prefix: returns with *err
 *   untouched and a non-zero stored key
 * - the bytes match no known sequence: *err = U_ILLEGAL_ESCAPE_SEQUENCE
 * - a complete sequence: interpreted according to var, which either updates
 *   toU2022State (charset designation or single shift) or sets
 *   U_UNSUPPORTED_ESCAPE_SEQUENCE / U_ILLEGAL_ESCAPE_SEQUENCE
 *
 * On success toULength is reset to 0. For U_ILLEGAL_ESCAPE_SEQUENCE only the
 * first byte stays in toUBytes[]; the remaining bytes are pushed back to the
 * source or into the replay buffer. For U_UNSUPPORTED_ESCAPE_SEQUENCE the
 * callback reason is set to UCNV_UNASSIGNED.
 *
 * @param _this       converter whose extraInfo points to a UConverterDataISO2022
 * @param source      in/out pointer to the current input position
 * @param sourceLimit end of the input
 * @param var         which ISO-2022 variant's table to use for a complete sequence
 * @param err         in/out error code
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* number of bytes already buffered in toUBytes[] before this call */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        /* remember every byte so that it can be reported or backed out later */
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
        value = getKey_2022(c,(int32_t *) &key, &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    /* persist the key: non-zero means that an escape sequence is still in progress */
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        case ISO_2022:
            {
                const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
                if(chosenConverterName == NULL) {
                    /* SS2 or SS3 */
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    _this->toUCallbackReason = UCNV_UNASSIGNED;
                    return;
                }

                _this->mode = UCNV_SI;
                ucnv_close(myData2022->currentConverter);
                /* NOTE(review): myUConverter is not declared in this function; this disabled branch may not compile as-is */
                myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
                if(U_SUCCESS(*err)) {
                    myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    _this->mode = UCNV_SO;
                }
                break;
            }
#endif
        case ISO_2022_JP:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the current G0/G1 selection, but not an earlier single shift */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    /* the charset must be enabled for this converter version */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    }
                    break;
                default:
                    /* the charset must be enabled for this converter version */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
#if !UCONFIG_ONLY_HTML_CONVERSION
        case ISO_2022_CN:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* ISO-IR-165 is rejected for version 0; otherwise it is designated like GB2312_1 */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    U_FALLTHROUGH;
                case GB2312_1:
                    U_FALLTHROUGH;
                case CNS_11643_1:
                    /* these charsets are stored in slot cs[1] */
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
                    break;
                case CNS_11643_2:
                    /* stored in slot cs[2], which SS2 checks above */
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;
#endif // !UCONFIG_ONLY_HTML_CONVERSION

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the escape sequence was consumed completely */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             * In escape sequences, all following bytes are "printable", that is,
             * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             * they are valid single/lead bytes.
             * For simplicity, we always only report the initial ESC byte as the
             * illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            _this->toULength=1;
        }
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
}
1060
1061 #if !UCONFIG_ONLY_HTML_CONVERSION
1062 /*Checks the characters of the buffer against valid 2022 escape sequences
1063 *if the match we return a pointer to the initial start of the sequence otherwise
1064 *we return sourceLimit
1065 */
1066 /*for 2022 looks ahead in the stream
1067 *to determine the longest possible convertible
1068 *data stream
1069 */
1070 static inline const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool)1071 getEndOfBuffer_2022(const char** source,
1072 const char* sourceLimit,
1073 UBool /*flush*/){
1074
1075 const char* mySource = *source;
1076
1077 #ifdef U_ENABLE_GENERIC_ISO_2022
1078 if (*source >= sourceLimit)
1079 return sourceLimit;
1080
1081 do{
1082
1083 if (*mySource == ESC_2022){
1084 int8_t i;
1085 int32_t key = 0;
1086 int32_t offset;
1087 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1088
1089 /* Kludge: I could not
1090 * figure out the reason for validating an escape sequence
1091 * twice - once here and once in changeState_2022().
1092 * is it possible to have an ESC character in a ISO2022
1093 * byte stream which is valid in a code page? Is it legal?
1094 */
1095 for (i=0;
1096 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1097 i++) {
1098 value = getKey_2022(*(mySource+i), &key, &offset);
1099 }
1100 if (value > 0 || *mySource==ESC_2022)
1101 return mySource;
1102
1103 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1104 return sourceLimit;
1105 }
1106 }while (++mySource < sourceLimit);
1107
1108 return sourceLimit;
1109 #else
1110 while(mySource < sourceLimit && *mySource != ESC_2022) {
1111 ++mySource;
1112 }
1113 return mySource;
1114 #endif
1115 }
1116 #endif
1117
1118 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1119 * any future change in _MBCSFromUChar32() function should be reflected here.
1120 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1121 */
1122 static inline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1123 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1124 UChar32 c,
1125 uint32_t* value,
1126 UBool useFallback,
1127 int outputType)
1128 {
1129 const int32_t *cx;
1130 const uint16_t *table;
1131 uint32_t stage2Entry;
1132 uint32_t myValue;
1133 int32_t length;
1134 const uint8_t *p;
1135 /*
1136 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1137 * Use internal version of ucnv_open() that verifies that the new structures are available,
1138 * else U_INTERNAL_PROGRAM_ERROR.
1139 */
1140 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1141 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1142 table=sharedData->mbcs.fromUnicodeTable;
1143 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1144 /* get the bytes and the length for the output */
1145 if(outputType==MBCS_OUTPUT_2){
1146 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1147 if(myValue<=0xff) {
1148 length=1;
1149 } else {
1150 length=2;
1151 }
1152 } else /* outputType==MBCS_OUTPUT_3 */ {
1153 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1154 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1155 if(myValue<=0xff) {
1156 length=1;
1157 } else if(myValue<=0xffff) {
1158 length=2;
1159 } else {
1160 length=3;
1161 }
1162 }
1163 /* is this code point assigned, or do we use fallbacks? */
1164 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1165 /* assigned */
1166 *value=myValue;
1167 return length;
1168 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1169 /*
1170 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1171 * There is no way with this data structure for fallback output
1172 * to be a zero byte.
1173 */
1174 *value=myValue;
1175 return -length;
1176 }
1177 }
1178
1179 cx=sharedData->mbcs.extIndexes;
1180 if(cx!=NULL) {
1181 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1182 }
1183
1184 /* unassigned */
1185 return 0;
1186 }
1187
1188 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1189 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1190 * @param retval pointer to output byte
1191 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1192 */
1193 static inline int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1194 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1195 UChar32 c,
1196 uint32_t* retval,
1197 UBool useFallback)
1198 {
1199 const uint16_t *table;
1200 int32_t value;
1201 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1202 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1203 return 0;
1204 }
1205 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1206 table=sharedData->mbcs.fromUnicodeTable;
1207 /* get the byte for the output */
1208 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1209 /* is this code point assigned, or do we use fallbacks? */
1210 *retval=(uint32_t)(value&0xff);
1211 if(value>=0xf00) {
1212 return 1; /* roundtrip */
1213 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1214 return -1; /* fallback taken */
1215 } else {
1216 return 0; /* no mapping */
1217 }
1218 }
1219
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 *
 * Values wider than two bytes are rejected explicitly. A check based only on
 * the low 16 bits would accept something like 0x1A1A1 as if it were A1 A1.
 *
 * @param value the codepage bytes, packed as (lead << 8) | trail
 * @return the pair shifted down to 21..7E, or 0 if out of range
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    if(value > 0xffff) {
        return 0; /* more than two bytes: not a DBCS pair */
    }

    const uint32_t lead = value >> 8;
    const uint32_t trail = value & 0xff;

    if(lead < 0xa1 || lead > 0xfe || trail < 0xa1 || trail > 0xfe) {
        return 0; /* not valid for ISO 2022 */
    }
    return value - 0x8080; /* shift down to 21..7e byte range */
}
1236
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Compiled out; kept for reference only.
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* add 0x80 to each byte, then keep the result only if both bytes are in A1..FE */
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
1254
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
* ISO-2022 Converter
*
*
*/

/*
 * To-Unicode conversion for the generic ISO-2022 converter. It is compiled
 * only when U_ENABLE_GENERIC_ISO_2022 is defined.
 *
 * The input is processed in alternating steps:
 * - while no escape sequence is in progress (myData->key == 0), the bytes up
 *   to the next ESC (as found by getEndOfBuffer_2022()) are converted with
 *   myData->currentConverter, which defaults to "ASCII"
 * - the escape sequence itself is then handed to changeState_2022()
 *
 * The function returns early on any error, when the input is exhausted, and
 * whenever offsets are requested and progress was made.
 *
 * @param args conversion arguments; source and target are advanced in place
 * @param err  in/out error code
 */
static void U_CALLCONV
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no charset was selected yet: open the default sub-converter */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily make the sub-converter the active one; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                               &args->target,
                               args->targetLimit,
                               &args->source,
                               mySourceLimit,
                               args->offsets,
                               (UBool)(args->flush && mySourceLimit == realSourceLimit),
                               err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /* NOTE(review): as parenthesized, the last test is OR-ed inside the third operand's group - confirm that this is intended */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing an escape sequence from the previous buffer */
        sourceStart = args->source;
        changeState_2022(args->converter,
                         &(args->source),
                         realSourceLimit,
                         ISO_2022,
                         err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
1368
1369 /*
1370 * To Unicode Callback helper function
1371 */
1372 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1373 toUnicodeCallback(UConverter *cnv,
1374 const uint32_t sourceChar, const uint32_t targetUniChar,
1375 UErrorCode* err){
1376 if(sourceChar>0xff){
1377 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1378 cnv->toUBytes[1] = (uint8_t)sourceChar;
1379 cnv->toULength = 2;
1380 }
1381 else{
1382 cnv->toUBytes[0] =(char) sourceChar;
1383 cnv->toULength = 1;
1384 }
1385
1386 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1387 *err = U_INVALID_CHAR_FOUND;
1388 }
1389 else{
1390 *err = U_ILLEGAL_CHAR_FOUND;
1391 }
1392 }
1393
1394 /**************************************ISO-2022-JP*************************************************/
1395
1396 /************************************** IMPORTANT **************************************************
1397 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1398 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1399 * The converter iterates over each Unicode codepoint
1400 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1401 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1402 * would do as far as possible.
1403 *
1404 * If the implementation of these macros or structure of sharedData struct change in the future, make
1405 * sure that ISO-2022 is also changed.
1406 ***************************************************************************************************
1407 */
1408
1409 /***************************************************************************************************
1410 * Rules for ISO-2022-jp encoding
 * (i) Escape sequences must be fully contained within a line; they should not
 * span new lines or CRs
1413 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1414 * JIS-Roman character escape sequence should follow before the line terminates
1415 * (iii) If the first character on the line is represented by two bytes then a two
1416 * byte character escape sequence should precede it
1417 * (iv) If no escape sequence is encountered then the characters are ASCII
1418 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1419 * and invoked with SS2 (ESC N).
1420 * (vi) If there is any G0 designation in text, there must be a switch to
1421 * ASCII or to JIS X 0201-Roman before a space character (but not
1422 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1423 * characters such as tab or CRLF.
 * (vii) Supported encodings:
1425 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1426 *
1427 * source : RFC-1554
1428 *
1429 * JISX201, JISX208,JISX212 : new .cnv data files created
1430 * KSC5601 : alias to ibm-949 mapping table
1431 * GB2312 : alias to ibm-1386 mapping table
 * ISO-8859-1 : Algorithmically implemented as LATIN1 case
 * ISO-8859-7 : alias to ibm-9409 mapping table
1434 */
1435
/*
 * Preference order of JP charsets.
 * The from-Unicode conversion walks this list after the current G0 and G2
 * charsets and adds each entry that is still enabled in the version's
 * charset mask to its list of charsets to try.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    JISX208,
    ISO8859_7,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1448
/*
 * Designation escape sequences, one per JP charset.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 * Each row is a NUL-terminated string of at most 5 bytes; the matching lengths
 * are listed in escSeqCharsLen[].
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42", /* <ESC>(B ASCII */
    "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
    "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
    "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
    "\x1B\x24\x42", /* <ESC>$B JISX-208 */
    "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
    "\x1B\x24\x41", /* <ESC>$A GB2312 */
    "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
    "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */

};
/*
 * Byte lengths of the escape sequences in escSeqChars[], in the same order.
 */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B ASCII */
    3, /* length of <ESC>.A ISO-8859-1 */
    3, /* length of <ESC>.F ISO-8859-7 */
    3, /* length of <ESC>(J JISX-201 */
    3, /* length of <ESC>$B JISX-208 */
    4, /* length of <ESC>$(D JISX-212 */
    3, /* length of <ESC>$A GB2312 */
    4, /* length of <ESC>$(C KSC5601 */
    3 /* length of <ESC>(I HWKANA_7BIT */
};
1476
1477 /*
1478 * The iteration over various code pages works this way:
1479 * i) Get the currentState from myConverterData->currentState
1480 * ii) Check if the character is mapped to a valid character in the currentState
1481 * Yes -> a) set the initIterState to currentState
1482 * b) remain in this state until an invalid character is found
1483 * No -> a) go to the next code page and find the character
 * iii) Before changing the state, increment the current state and check if the current state
 * is equal to the initIterState
1486 * Yes -> A character that cannot be represented in any of the supported encodings
1487 * break and return a U_INVALID_CHARACTER error
1488 * No -> Continue and find the character in next code page
1489 *
1490 *
1491 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1492 */
1493
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two positions differ from the identity mapping:
 * 5C maps to U+00A5 and 7E maps to U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        /* every other value maps to itself */
        return value;
    }
}
1507
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 and U+203E take the byte values 5C and 7E, so U+005C and U+007E
 * themselves are unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these byte values are taken by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
1522
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 *
 * Each Shift-JIS lead byte covers two JIS X 0208 rows: trail bytes up to 9E
 * belong to the first row of the pair, larger trail bytes to the second one.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    const uint8_t trailByte = (uint8_t)value;

    /* move the lead byte down to its JIS row pair; leads from E0 up use a different base */
    uint32_t row = value & 0xff00;
    row -= (row <= 0x9f00) ? 0x7000 : 0xb000;
    row <<= 1;

    if(trailByte > 0x9e) {
        /* second row of the pair; trail <= 0xfc */
        return row | (trailByte - 0x7e);
    }

    /* first row of the pair; the offset changes after trail byte 7E */
    row -= 0x100;
    return row | (trailByte - ((trailByte <= 0x7e) ? 0x1f : 0x20));
}
1558
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * @param c1    first JIS X 0208 byte
 * @param c2    second JIS X 0208 byte
 * @param bytes receives the Shift-JIS lead and trail bytes; a byte is 0 where
 *              the input was detected as invalid
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead = c1;
    uint8_t trail = c2;

    if((lead & 1) == 0) {
        /* even first byte: only 21..7E is accepted for the second byte */
        if((uint8_t)(trail - 0x21) <= ((0x7e) - 0x21)) {
            trail = (uint8_t)(trail + 0x7e);
        } else {
            trail = 0; /* invalid */
        }
    } else {
        /* odd first byte: the trail offset changes after 5F */
        ++lead;
        if(trail > 0x7e) {
            trail = 0; /* invalid */
        } else {
            trail = (uint8_t)(trail + ((trail <= 0x5f) ? 0x1f : 0x20));
        }
    }

    /* two JIS first-byte values share one Shift-JIS lead byte */
    lead >>= 1;
    if(lead > 0x3f) {
        lead = 0; /* invalid */
    } else {
        lead = (uint8_t)(lead + ((lead <= 0x2f) ? 0x70 : 0xb0));
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1595
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table is indexed by (code point - HWKANA_START); each entry is the
 * two-byte value that the from-Unicode conversion emits as a JISX208 fallback.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123, /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C, /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F, /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F, /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C /* U+FF9F */
};
1668
1669 static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1670 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1671 UConverter *cnv = args->converter;
1672 UConverterDataISO2022 *converterData;
1673 ISO2022State *pFromU2022State;
1674 uint8_t *target = (uint8_t *) args->target;
1675 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1676 const UChar* source = args->source;
1677 const UChar* sourceLimit = args->sourceLimit;
1678 int32_t* offsets = args->offsets;
1679 UChar32 sourceChar;
1680 char buffer[8];
1681 int32_t len, outLen;
1682 int8_t choices[10];
1683 int32_t choiceCount;
1684 uint32_t targetValue = 0;
1685 UBool useFallback;
1686
1687 int32_t i;
1688 int8_t cs, g;
1689
1690 /* set up the state */
1691 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1692 pFromU2022State = &converterData->fromU2022State;
1693
1694 choiceCount = 0;
1695
1696 /* check if the last codepoint of previous buffer was a lead surrogate*/
1697 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1698 goto getTrail;
1699 }
1700
1701 while(source < sourceLimit) {
1702 if(target < targetLimit) {
1703
1704 sourceChar = *(source++);
1705 /*check if the char is a First surrogate*/
1706 if(U16_IS_SURROGATE(sourceChar)) {
1707 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1708 getTrail:
1709 /*look ahead to find the trail surrogate*/
1710 if(source < sourceLimit) {
1711 /* test the following code unit */
1712 UChar trail=(UChar) *source;
1713 if(U16_IS_TRAIL(trail)) {
1714 source++;
1715 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1716 cnv->fromUChar32=0x00;
1717 /* convert this supplementary code point */
1718 /* exit this condition tree */
1719 } else {
1720 /* this is an unmatched lead code unit (1st surrogate) */
1721 /* callback(illegal) */
1722 *err=U_ILLEGAL_CHAR_FOUND;
1723 cnv->fromUChar32=sourceChar;
1724 break;
1725 }
1726 } else {
1727 /* no more input */
1728 cnv->fromUChar32=sourceChar;
1729 break;
1730 }
1731 } else {
1732 /* this is an unmatched trail code unit (2nd surrogate) */
1733 /* callback(illegal) */
1734 *err=U_ILLEGAL_CHAR_FOUND;
1735 cnv->fromUChar32=sourceChar;
1736 break;
1737 }
1738 }
1739
1740 /* do not convert SO/SI/ESC */
1741 if(IS_2022_CONTROL(sourceChar)) {
1742 /* callback(illegal) */
1743 *err=U_ILLEGAL_CHAR_FOUND;
1744 cnv->fromUChar32=sourceChar;
1745 break;
1746 }
1747
1748 /* do the conversion */
1749
1750 if(choiceCount == 0) {
1751 uint16_t csm;
1752
1753 /*
1754 * The csm variable keeps track of which charsets are allowed
1755 * and not used yet while building the choices[].
1756 */
1757 csm = jpCharsetMasks[converterData->version];
1758 choiceCount = 0;
1759
1760 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1761 if(converterData->version == 3 || converterData->version == 4) {
1762 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1763 }
1764 /* Do not try single-byte half-width Katakana for other versions. */
1765 csm &= ~CSM(HWKANA_7BIT);
1766
1767 /* try the current G0 charset */
1768 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1769 csm &= ~CSM(cs);
1770
1771 /* try the current G2 charset */
1772 if((cs = pFromU2022State->cs[2]) != 0) {
1773 choices[choiceCount++] = cs;
1774 csm &= ~CSM(cs);
1775 }
1776
1777 /* try all the other possible charsets */
1778 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1779 cs = (int8_t)jpCharsetPref[i];
1780 if(CSM(cs) & csm) {
1781 choices[choiceCount++] = cs;
1782 csm &= ~CSM(cs);
1783 }
1784 }
1785 }
1786
1787 cs = g = 0;
1788 /*
1789 * len==0: no mapping found yet
1790 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1791 * len>0: found a roundtrip result, done
1792 */
1793 len = 0;
1794 /*
1795 * We will turn off useFallback after finding a fallback,
1796 * but we still get fallbacks from PUA code points as usual.
1797 * Therefore, we will also need to check that we don't overwrite
1798 * an early fallback with a later one.
1799 */
1800 useFallback = cnv->useFallback;
1801
1802 for(i = 0; i < choiceCount && len <= 0; ++i) {
1803 uint32_t value;
1804 int32_t len2;
1805 int8_t cs0 = choices[i];
1806 switch(cs0) {
1807 case ASCII:
1808 if(sourceChar <= 0x7f) {
1809 targetValue = (uint32_t)sourceChar;
1810 len = 1;
1811 cs = cs0;
1812 g = 0;
1813 }
1814 break;
1815 case ISO8859_1:
1816 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1817 targetValue = (uint32_t)sourceChar - 0x80;
1818 len = 1;
1819 cs = cs0;
1820 g = 2;
1821 }
1822 break;
1823 case HWKANA_7BIT:
1824 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1825 if(converterData->version==3) {
1826 /* JIS7: use G1 (SO) */
1827 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1828 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1829 len = 1;
1830 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1831 g = 1;
1832 } else if(converterData->version==4) {
1833 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1834 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1835 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1836 len = 1;
1837
1838 cs = pFromU2022State->cs[0];
1839 if(IS_JP_DBCS(cs)) {
1840 /* switch from a DBCS charset to JISX201 */
1841 cs = (int8_t)JISX201;
1842 }
1843 /* else stay in the current G0 charset */
1844 g = 0;
1845 }
1846 /* else do not use HWKANA_7BIT with other versions */
1847 }
1848 break;
1849 case JISX201:
1850 /* G0 SBCS */
1851 value = jisx201FromU(sourceChar);
1852 if(value <= 0x7f) {
1853 targetValue = value;
1854 len = 1;
1855 cs = cs0;
1856 g = 0;
1857 useFallback = FALSE;
1858 }
1859 break;
1860 case JISX208:
1861 /* G0 DBCS from Shift-JIS table */
1862 len2 = MBCS_FROM_UCHAR32_ISO2022(
1863 converterData->myConverterArray[cs0],
1864 sourceChar, &value,
1865 useFallback, MBCS_OUTPUT_2);
1866 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1867 value = _2022FromSJIS(value);
1868 if(value != 0) {
1869 targetValue = value;
1870 len = len2;
1871 cs = cs0;
1872 g = 0;
1873 useFallback = FALSE;
1874 }
1875 } else if(len == 0 && useFallback &&
1876 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1877 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1878 len = -2;
1879 cs = cs0;
1880 g = 0;
1881 useFallback = FALSE;
1882 }
1883 break;
1884 case ISO8859_7:
1885 /* G0 SBCS forced to 7-bit output */
1886 len2 = MBCS_SINGLE_FROM_UCHAR32(
1887 converterData->myConverterArray[cs0],
1888 sourceChar, &value,
1889 useFallback);
1890 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1891 targetValue = value - 0x80;
1892 len = len2;
1893 cs = cs0;
1894 g = 2;
1895 useFallback = FALSE;
1896 }
1897 break;
1898 default:
1899 /* G0 DBCS */
1900 len2 = MBCS_FROM_UCHAR32_ISO2022(
1901 converterData->myConverterArray[cs0],
1902 sourceChar, &value,
1903 useFallback, MBCS_OUTPUT_2);
1904 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1905 if(cs0 == KSC5601) {
1906 /*
1907 * Check for valid bytes for the encoding scheme.
1908 * This is necessary because the sub-converter (windows-949)
1909 * has a broader encoding scheme than is valid for 2022.
1910 */
1911 value = _2022FromGR94DBCS(value);
1912 if(value == 0) {
1913 break;
1914 }
1915 }
1916 targetValue = value;
1917 len = len2;
1918 cs = cs0;
1919 g = 0;
1920 useFallback = FALSE;
1921 }
1922 break;
1923 }
1924 }
1925
1926 if(len != 0) {
1927 if(len < 0) {
1928 len = -len; /* fallback */
1929 }
1930 outLen = 0; /* count output bytes */
1931
1932 /* write SI if necessary (only for JIS7) */
1933 if(pFromU2022State->g == 1 && g == 0) {
1934 buffer[outLen++] = UCNV_SI;
1935 pFromU2022State->g = 0;
1936 }
1937
1938 /* write the designation sequence if necessary */
1939 if(cs != pFromU2022State->cs[g]) {
1940 int32_t escLen = escSeqCharsLen[cs];
1941 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1942 outLen += escLen;
1943 pFromU2022State->cs[g] = cs;
1944
1945 /* invalidate the choices[] */
1946 choiceCount = 0;
1947 }
1948
1949 /* write the shift sequence if necessary */
1950 if(g != pFromU2022State->g) {
1951 switch(g) {
1952 /* case 0 handled before writing escapes */
1953 case 1:
1954 buffer[outLen++] = UCNV_SO;
1955 pFromU2022State->g = 1;
1956 break;
1957 default: /* case 2 */
1958 buffer[outLen++] = 0x1b;
1959 buffer[outLen++] = 0x4e;
1960 break;
1961 /* no case 3: no SS3 in ISO-2022-JP-x */
1962 }
1963 }
1964
1965 /* write the output bytes */
1966 if(len == 1) {
1967 buffer[outLen++] = (char)targetValue;
1968 } else /* len == 2 */ {
1969 buffer[outLen++] = (char)(targetValue >> 8);
1970 buffer[outLen++] = (char)targetValue;
1971 }
1972 } else {
1973 /*
1974 * if we cannot find the character after checking all codepages
1975 * then this is an error
1976 */
1977 *err = U_INVALID_CHAR_FOUND;
1978 cnv->fromUChar32=sourceChar;
1979 break;
1980 }
1981
1982 if(sourceChar == CR || sourceChar == LF) {
1983 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1984 pFromU2022State->cs[2] = 0;
1985 choiceCount = 0;
1986 }
1987
1988 /* output outLen>0 bytes in buffer[] */
1989 if(outLen == 1) {
1990 *target++ = buffer[0];
1991 if(offsets) {
1992 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1993 }
1994 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1995 *target++ = buffer[0];
1996 *target++ = buffer[1];
1997 if(offsets) {
1998 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1999 *offsets++ = sourceIndex;
2000 *offsets++ = sourceIndex;
2001 }
2002 } else {
2003 fromUWriteUInt8(
2004 cnv,
2005 buffer, outLen,
2006 &target, (const char *)targetLimit,
2007 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2008 err);
2009 if(U_FAILURE(*err)) {
2010 break;
2011 }
2012 }
2013 } /* end if(myTargetIndex<myTargetLength) */
2014 else{
2015 *err =U_BUFFER_OVERFLOW_ERROR;
2016 break;
2017 }
2018
2019 }/* end while(mySourceIndex<mySourceLength) */
2020
2021 /*
2022 * the end of the input stream and detection of truncated input
2023 * are handled by the framework, but for ISO-2022-JP conversion
2024 * we need to be in ASCII mode at the very end
2025 *
2026 * conditions:
2027 * successful
2028 * in SO mode or not in ASCII mode
2029 * end of input and no truncated input
2030 */
2031 if( U_SUCCESS(*err) &&
2032 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2033 args->flush && source>=sourceLimit && cnv->fromUChar32==0
2034 ) {
2035 int32_t sourceIndex;
2036
2037 outLen = 0;
2038
2039 if(pFromU2022State->g != 0) {
2040 buffer[outLen++] = UCNV_SI;
2041 pFromU2022State->g = 0;
2042 }
2043
2044 if(pFromU2022State->cs[0] != ASCII) {
2045 int32_t escLen = escSeqCharsLen[ASCII];
2046 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2047 outLen += escLen;
2048 pFromU2022State->cs[0] = (int8_t)ASCII;
2049 }
2050
2051 /* get the source index of the last input character */
2052 /*
2053 * TODO this would be simpler and more reliable if we used a pair
2054 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2055 * so that we could simply use the prevSourceIndex here;
2056 * this code gives an incorrect result for the rare case of an unmatched
2057 * trail surrogate that is alone in the last buffer of the text stream
2058 */
2059 sourceIndex=(int32_t)(source-args->source);
2060 if(sourceIndex>0) {
2061 --sourceIndex;
2062 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2063 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2064 ) {
2065 --sourceIndex;
2066 }
2067 } else {
2068 sourceIndex=-1;
2069 }
2070
2071 fromUWriteUInt8(
2072 cnv,
2073 buffer, outLen,
2074 &target, (const char *)targetLimit,
2075 &offsets, sourceIndex,
2076 err);
2077 }
2078
2079 /*save the state and return */
2080 args->source = source;
2081 args->target = (char*)target;
2082 }
2083
2084 /*************** to unicode *******************/
2085
2086 static void U_CALLCONV
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2087 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2088 UErrorCode* err){
2089 char tempBuf[2];
2090 const char *mySource = (char *) args->source;
2091 UChar *myTarget = args->target;
2092 const char *mySourceLimit = args->sourceLimit;
2093 uint32_t targetUniChar = 0x0000;
2094 uint32_t mySourceChar = 0x0000;
2095 uint32_t tmpSourceChar = 0x0000;
2096 UConverterDataISO2022* myData;
2097 ISO2022State *pToU2022State;
2098 StateEnum cs;
2099
2100 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2101 pToU2022State = &myData->toU2022State;
2102
2103 if(myData->key != 0) {
2104 /* continue with a partial escape sequence */
2105 goto escape;
2106 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2107 /* continue with a partial double-byte character */
2108 mySourceChar = args->converter->toUBytes[0];
2109 args->converter->toULength = 0;
2110 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2111 targetUniChar = missingCharMarker;
2112 goto getTrailByte;
2113 }
2114
2115 while(mySource < mySourceLimit){
2116
2117 targetUniChar =missingCharMarker;
2118
2119 if(myTarget < args->targetLimit){
2120
2121 mySourceChar= (unsigned char) *mySource++;
2122
2123 switch(mySourceChar) {
2124 case UCNV_SI:
2125 if(myData->version==3) {
2126 pToU2022State->g=0;
2127 continue;
2128 } else {
2129 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2130 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2131 break;
2132 }
2133
2134 case UCNV_SO:
2135 if(myData->version==3) {
2136 /* JIS7: switch to G1 half-width Katakana */
2137 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2138 pToU2022State->g=1;
2139 continue;
2140 } else {
2141 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2142 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2143 break;
2144 }
2145
2146 case ESC_2022:
2147 mySource--;
2148 escape:
2149 {
2150 const char * mySourceBefore = mySource;
2151 int8_t toULengthBefore = args->converter->toULength;
2152
2153 changeState_2022(args->converter,&(mySource),
2154 mySourceLimit, ISO_2022_JP,err);
2155
2156 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2157 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2158 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2159 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2160 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2161 }
2162 }
2163
2164 /* invalid or illegal escape sequence */
2165 if(U_FAILURE(*err)){
2166 args->target = myTarget;
2167 args->source = mySource;
2168 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2169 return;
2170 }
2171 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2172 if(myData->key==0) {
2173 myData->isEmptySegment = TRUE;
2174 }
2175 continue;
2176
2177 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2178
2179 case CR:
2180 case LF:
2181 /* automatically reset to single-byte mode */
2182 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2183 pToU2022State->cs[0] = (int8_t)ASCII;
2184 }
2185 pToU2022State->cs[2] = 0;
2186 pToU2022State->g = 0;
2187 U_FALLTHROUGH;
2188 default:
2189 /* convert one or two bytes */
2190 myData->isEmptySegment = FALSE;
2191 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2192 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2193 !IS_JP_DBCS(cs)
2194 ) {
2195 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2196 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2197
2198 /* return from a single-shift state to the previous one */
2199 if(pToU2022State->g >= 2) {
2200 pToU2022State->g=pToU2022State->prevG;
2201 }
2202 } else switch(cs) {
2203 case ASCII:
2204 if(mySourceChar <= 0x7f) {
2205 targetUniChar = mySourceChar;
2206 }
2207 break;
2208 case ISO8859_1:
2209 if(mySourceChar <= 0x7f) {
2210 targetUniChar = mySourceChar + 0x80;
2211 }
2212 /* return from a single-shift state to the previous one */
2213 pToU2022State->g=pToU2022State->prevG;
2214 break;
2215 case ISO8859_7:
2216 if(mySourceChar <= 0x7f) {
2217 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2218 targetUniChar =
2219 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2220 myData->myConverterArray[cs],
2221 mySourceChar + 0x80);
2222 }
2223 /* return from a single-shift state to the previous one */
2224 pToU2022State->g=pToU2022State->prevG;
2225 break;
2226 case JISX201:
2227 if(mySourceChar <= 0x7f) {
2228 targetUniChar = jisx201ToU(mySourceChar);
2229 }
2230 break;
2231 case HWKANA_7BIT:
2232 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2233 /* 7-bit halfwidth Katakana */
2234 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2235 }
2236 break;
2237 default:
2238 /* G0 DBCS */
2239 if(mySource < mySourceLimit) {
2240 int leadIsOk, trailIsOk;
2241 uint8_t trailByte;
2242 getTrailByte:
2243 trailByte = (uint8_t)*mySource;
2244 /*
2245 * Ticket 5691: consistent illegal sequences:
2246 * - We include at least the first byte in the illegal sequence.
2247 * - If any of the non-initial bytes could be the start of a character,
2248 * we stop the illegal sequence before the first one of those.
2249 *
2250 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2251 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2252 * Otherwise we convert or report the pair of bytes.
2253 */
2254 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2255 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2256 if (leadIsOk && trailIsOk) {
2257 ++mySource;
2258 tmpSourceChar = (mySourceChar << 8) | trailByte;
2259 if(cs == JISX208) {
2260 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2261 mySourceChar = tmpSourceChar;
2262 } else {
2263 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2264 mySourceChar = tmpSourceChar;
2265 if (cs == KSC5601) {
2266 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2267 }
2268 tempBuf[0] = (char)(tmpSourceChar >> 8);
2269 tempBuf[1] = (char)(tmpSourceChar);
2270 }
2271 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2272 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2273 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2274 ++mySource;
2275 /* add another bit so that the code below writes 2 bytes in case of error */
2276 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2277 }
2278 } else {
2279 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2280 args->converter->toULength = 1;
2281 goto endloop;
2282 }
2283 } /* End of inner switch */
2284 break;
2285 } /* End of outer switch */
2286 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2287 if(args->offsets){
2288 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2289 }
2290 *(myTarget++)=(UChar)targetUniChar;
2291 }
2292 else if(targetUniChar > missingCharMarker){
2293 /* disassemble the surrogate pair and write to output*/
2294 targetUniChar-=0x0010000;
2295 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2296 if(args->offsets){
2297 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2298 }
2299 ++myTarget;
2300 if(myTarget< args->targetLimit){
2301 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2302 if(args->offsets){
2303 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2304 }
2305 ++myTarget;
2306 }else{
2307 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2308 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2309 }
2310
2311 }
2312 else{
2313 /* Call the callback function*/
2314 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2315 break;
2316 }
2317 }
2318 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2319 *err =U_BUFFER_OVERFLOW_ERROR;
2320 break;
2321 }
2322 }
2323 endloop:
2324 args->target = myTarget;
2325 args->source = mySource;
2326 }
2327
2328
2329 #if !UCONFIG_ONLY_HTML_CONVERSION
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*      and SI to shift into single-byte mode.
*/
2338 static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2339 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2340
2341 UConverter* saveConv = args->converter;
2342 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2343 args->converter=myConverterData->currentConverter;
2344
2345 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2346 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2347 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2348
2349 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2350 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2351 uprv_memcpy(
2352 saveConv->charErrorBuffer,
2353 myConverterData->currentConverter->charErrorBuffer,
2354 myConverterData->currentConverter->charErrorBufferLength);
2355 }
2356 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2357 myConverterData->currentConverter->charErrorBufferLength = 0;
2358 }
2359 args->converter=saveConv;
2360 }
2361
2362 static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2363 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2364
2365 const UChar *source = args->source;
2366 const UChar *sourceLimit = args->sourceLimit;
2367 unsigned char *target = (unsigned char *) args->target;
2368 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2369 int32_t* offsets = args->offsets;
2370 uint32_t targetByteUnit = 0x0000;
2371 UChar32 sourceChar = 0x0000;
2372 UBool isTargetByteDBCS;
2373 UBool oldIsTargetByteDBCS;
2374 UConverterDataISO2022 *converterData;
2375 UConverterSharedData* sharedData;
2376 UBool useFallback;
2377 int32_t length =0;
2378
2379 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2380 /* if the version is 1 then the user is requesting
2381 * conversion with ibm-25546 pass the arguments to
2382 * MBCS converter and return
2383 */
2384 if(converterData->version==1){
2385 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2386 return;
2387 }
2388
2389 /* initialize data */
2390 sharedData = converterData->currentConverter->sharedData;
2391 useFallback = args->converter->useFallback;
2392 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2393 oldIsTargetByteDBCS = isTargetByteDBCS;
2394
2395 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2396 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2397 goto getTrail;
2398 }
2399 while(source < sourceLimit){
2400
2401 targetByteUnit = missingCharMarker;
2402
2403 if(target < (unsigned char*) args->targetLimit){
2404 sourceChar = *source++;
2405
2406 /* do not convert SO/SI/ESC */
2407 if(IS_2022_CONTROL(sourceChar)) {
2408 /* callback(illegal) */
2409 *err=U_ILLEGAL_CHAR_FOUND;
2410 args->converter->fromUChar32=sourceChar;
2411 break;
2412 }
2413
2414 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2415 if(length < 0) {
2416 length = -length; /* fallback */
2417 }
2418 /* only DBCS or SBCS characters are expected*/
2419 /* DB characters with high bit set to 1 are expected */
2420 if( length > 2 || length==0 ||
2421 (length == 1 && targetByteUnit > 0x7f) ||
2422 (length == 2 &&
2423 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2424 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2425 ) {
2426 targetByteUnit=missingCharMarker;
2427 }
2428 if (targetByteUnit != missingCharMarker){
2429
2430 oldIsTargetByteDBCS = isTargetByteDBCS;
2431 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2432 /* append the shift sequence */
2433 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2434
2435 if (isTargetByteDBCS)
2436 *target++ = UCNV_SO;
2437 else
2438 *target++ = UCNV_SI;
2439 if(offsets)
2440 *(offsets++) = (int32_t)(source - args->source-1);
2441 }
2442 /* write the targetUniChar to target */
2443 if(targetByteUnit <= 0x00FF){
2444 if( target < targetLimit){
2445 *(target++) = (unsigned char) targetByteUnit;
2446 if(offsets){
2447 *(offsets++) = (int32_t)(source - args->source-1);
2448 }
2449
2450 }else{
2451 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2452 *err = U_BUFFER_OVERFLOW_ERROR;
2453 }
2454 }else{
2455 if(target < targetLimit){
2456 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2457 if(offsets){
2458 *(offsets++) = (int32_t)(source - args->source-1);
2459 }
2460 if(target < targetLimit){
2461 *(target++) =(unsigned char) (targetByteUnit -0x80);
2462 if(offsets){
2463 *(offsets++) = (int32_t)(source - args->source-1);
2464 }
2465 }else{
2466 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2467 *err = U_BUFFER_OVERFLOW_ERROR;
2468 }
2469 }else{
2470 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2471 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2472 *err = U_BUFFER_OVERFLOW_ERROR;
2473 }
2474 }
2475
2476 }
2477 else{
2478 /* oops.. the code point is unassingned
2479 * set the error and reason
2480 */
2481
2482 /*check if the char is a First surrogate*/
2483 if(U16_IS_SURROGATE(sourceChar)) {
2484 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2485 getTrail:
2486 /*look ahead to find the trail surrogate*/
2487 if(source < sourceLimit) {
2488 /* test the following code unit */
2489 UChar trail=(UChar) *source;
2490 if(U16_IS_TRAIL(trail)) {
2491 source++;
2492 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2493 *err = U_INVALID_CHAR_FOUND;
2494 /* convert this surrogate code point */
2495 /* exit this condition tree */
2496 } else {
2497 /* this is an unmatched lead code unit (1st surrogate) */
2498 /* callback(illegal) */
2499 *err=U_ILLEGAL_CHAR_FOUND;
2500 }
2501 } else {
2502 /* no more input */
2503 *err = U_ZERO_ERROR;
2504 }
2505 } else {
2506 /* this is an unmatched trail code unit (2nd surrogate) */
2507 /* callback(illegal) */
2508 *err=U_ILLEGAL_CHAR_FOUND;
2509 }
2510 } else {
2511 /* callback(unassigned) for a BMP code point */
2512 *err = U_INVALID_CHAR_FOUND;
2513 }
2514
2515 args->converter->fromUChar32=sourceChar;
2516 break;
2517 }
2518 } /* end if(myTargetIndex<myTargetLength) */
2519 else{
2520 *err =U_BUFFER_OVERFLOW_ERROR;
2521 break;
2522 }
2523
2524 }/* end while(mySourceIndex<mySourceLength) */
2525
2526 /*
2527 * the end of the input stream and detection of truncated input
2528 * are handled by the framework, but for ISO-2022-KR conversion
2529 * we need to be in ASCII mode at the very end
2530 *
2531 * conditions:
2532 * successful
2533 * not in ASCII mode
2534 * end of input and no truncated input
2535 */
2536 if( U_SUCCESS(*err) &&
2537 isTargetByteDBCS &&
2538 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2539 ) {
2540 int32_t sourceIndex;
2541
2542 /* we are switching to ASCII */
2543 isTargetByteDBCS=FALSE;
2544
2545 /* get the source index of the last input character */
2546 /*
2547 * TODO this would be simpler and more reliable if we used a pair
2548 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2549 * so that we could simply use the prevSourceIndex here;
2550 * this code gives an incorrect result for the rare case of an unmatched
2551 * trail surrogate that is alone in the last buffer of the text stream
2552 */
2553 sourceIndex=(int32_t)(source-args->source);
2554 if(sourceIndex>0) {
2555 --sourceIndex;
2556 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2557 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2558 ) {
2559 --sourceIndex;
2560 }
2561 } else {
2562 sourceIndex=-1;
2563 }
2564
2565 fromUWriteUInt8(
2566 args->converter,
2567 SHIFT_IN_STR, 1,
2568 &target, (const char *)targetLimit,
2569 &offsets, sourceIndex,
2570 err);
2571 }
2572
2573 /*save the state and return */
2574 args->source = source;
2575 args->target = (char*)target;
2576 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2577 }
2578
2579 /************************ To Unicode ***************************************/
2580
2581 static void U_CALLCONV
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2582 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2583 UErrorCode* err){
2584 char const* sourceStart;
2585 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2586
2587 UConverterToUnicodeArgs subArgs;
2588 int32_t minArgsSize;
2589
2590 /* set up the subconverter arguments */
2591 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2592 minArgsSize = args->size;
2593 } else {
2594 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2595 }
2596
2597 uprv_memcpy(&subArgs, args, minArgsSize);
2598 subArgs.size = (uint16_t)minArgsSize;
2599 subArgs.converter = myData->currentConverter;
2600
2601 /* remember the original start of the input for offsets */
2602 sourceStart = args->source;
2603
2604 if(myData->key != 0) {
2605 /* continue with a partial escape sequence */
2606 goto escape;
2607 }
2608
2609 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2610 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2611 subArgs.source = args->source;
2612 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2613 if(subArgs.source != subArgs.sourceLimit) {
2614 /*
2615 * get the current partial byte sequence
2616 *
2617 * it needs to be moved between the public and the subconverter
2618 * so that the conversion framework, which only sees the public
2619 * converter, can handle truncated and illegal input etc.
2620 */
2621 if(args->converter->toULength > 0) {
2622 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2623 }
2624 subArgs.converter->toULength = args->converter->toULength;
2625
2626 /*
2627 * Convert up to the end of the input, or to before the next escape character.
2628 * Does not handle conversion extensions because the preToU[] state etc.
2629 * is not copied.
2630 */
2631 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2632
2633 if(args->offsets != NULL && sourceStart != args->source) {
2634 /* update offsets to base them on the actual start of the input */
2635 int32_t *offsets = args->offsets;
2636 UChar *target = args->target;
2637 int32_t delta = (int32_t)(args->source - sourceStart);
2638 while(target < subArgs.target) {
2639 if(*offsets >= 0) {
2640 *offsets += delta;
2641 }
2642 ++offsets;
2643 ++target;
2644 }
2645 }
2646 args->source = subArgs.source;
2647 args->target = subArgs.target;
2648 args->offsets = subArgs.offsets;
2649
2650 /* copy input/error/overflow buffers */
2651 if(subArgs.converter->toULength > 0) {
2652 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2653 }
2654 args->converter->toULength = subArgs.converter->toULength;
2655
2656 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2657 if(subArgs.converter->UCharErrorBufferLength > 0) {
2658 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2659 subArgs.converter->UCharErrorBufferLength);
2660 }
2661 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2662 subArgs.converter->UCharErrorBufferLength = 0;
2663 }
2664 }
2665
2666 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2667 return;
2668 }
2669
2670 escape:
2671 changeState_2022(args->converter,
2672 &(args->source),
2673 args->sourceLimit,
2674 ISO_2022_KR,
2675 err);
2676 }
2677 }
2678
2679 static void U_CALLCONV
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2680 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2681 UErrorCode* err){
2682 char tempBuf[2];
2683 const char *mySource = ( char *) args->source;
2684 UChar *myTarget = args->target;
2685 const char *mySourceLimit = args->sourceLimit;
2686 UChar32 targetUniChar = 0x0000;
2687 UChar mySourceChar = 0x0000;
2688 UConverterDataISO2022* myData;
2689 UConverterSharedData* sharedData ;
2690 UBool useFallback;
2691
2692 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2693 if(myData->version==1){
2694 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2695 return;
2696 }
2697
2698 /* initialize state */
2699 sharedData = myData->currentConverter->sharedData;
2700 useFallback = args->converter->useFallback;
2701
2702 if(myData->key != 0) {
2703 /* continue with a partial escape sequence */
2704 goto escape;
2705 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2706 /* continue with a partial double-byte character */
2707 mySourceChar = args->converter->toUBytes[0];
2708 args->converter->toULength = 0;
2709 goto getTrailByte;
2710 }
2711
2712 while(mySource< mySourceLimit){
2713
2714 if(myTarget < args->targetLimit){
2715
2716 mySourceChar= (unsigned char) *mySource++;
2717
2718 if(mySourceChar==UCNV_SI){
2719 myData->toU2022State.g = 0;
2720 if (myData->isEmptySegment) {
2721 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2722 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2723 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2724 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2725 args->converter->toULength = 1;
2726 args->target = myTarget;
2727 args->source = mySource;
2728 return;
2729 }
2730 /*consume the source */
2731 continue;
2732 }else if(mySourceChar==UCNV_SO){
2733 myData->toU2022State.g = 1;
2734 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2735 /*consume the source */
2736 continue;
2737 }else if(mySourceChar==ESC_2022){
2738 mySource--;
2739 escape:
2740 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2741 changeState_2022(args->converter,&(mySource),
2742 mySourceLimit, ISO_2022_KR, err);
2743 if(U_FAILURE(*err)){
2744 args->target = myTarget;
2745 args->source = mySource;
2746 return;
2747 }
2748 continue;
2749 }
2750
2751 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2752 if(myData->toU2022State.g == 1) {
2753 if(mySource < mySourceLimit) {
2754 int leadIsOk, trailIsOk;
2755 uint8_t trailByte;
2756 getTrailByte:
2757 targetUniChar = missingCharMarker;
2758 trailByte = (uint8_t)*mySource;
2759 /*
2760 * Ticket 5691: consistent illegal sequences:
2761 * - We include at least the first byte in the illegal sequence.
2762 * - If any of the non-initial bytes could be the start of a character,
2763 * we stop the illegal sequence before the first one of those.
2764 *
2765 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2766 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2767 * Otherwise we convert or report the pair of bytes.
2768 */
2769 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2770 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2771 if (leadIsOk && trailIsOk) {
2772 ++mySource;
2773 tempBuf[0] = (char)(mySourceChar + 0x80);
2774 tempBuf[1] = (char)(trailByte + 0x80);
2775 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2776 mySourceChar = (mySourceChar << 8) | trailByte;
2777 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2778 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2779 ++mySource;
2780 /* add another bit so that the code below writes 2 bytes in case of error */
2781 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2782 }
2783 } else {
2784 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2785 args->converter->toULength = 1;
2786 break;
2787 }
2788 }
2789 else if(mySourceChar <= 0x7f) {
2790 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2791 } else {
2792 targetUniChar = 0xffff;
2793 }
2794 if(targetUniChar < 0xfffe){
2795 if(args->offsets) {
2796 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2797 }
2798 *(myTarget++)=(UChar)targetUniChar;
2799 }
2800 else {
2801 /* Call the callback function*/
2802 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2803 break;
2804 }
2805 }
2806 else{
2807 *err =U_BUFFER_OVERFLOW_ERROR;
2808 break;
2809 }
2810 }
2811 args->target = myTarget;
2812 args->source = mySource;
2813 }
2814
2815 /*************************** END ISO2022-KR *********************************/
2816
2817 /*************************** ISO-2022-CN *********************************
2818 *
2819 * Rules for ISO-2022-CN Encoding:
2820 * i) The designator sequence must appear once on a line before any instance
2821 * of character set it designates.
2822 * ii) If two lines contain characters from the same character set, both lines
2823 * must include the designator sequence.
2824 * iii) Once the designator sequence is known, a shifting sequence has to be found
2825 * to invoke the shifting
2826 * iv) All lines start in ASCII and end in ASCII.
2827 * v) Four shifting sequences are employed for this purpose:
2828 *
 *      Sequence    ASCII Eq    Charsets
2830 * ---------- ------- ---------
2831 * SI <SI> US-ASCII
2832 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2833 * SS2 <ESC>N CNS-11643-1992 Plane 2
2834 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2835 *
2836 * vi)
2837 * SOdesignator : ESC "$" ")" finalchar_for_SO
2838 * SS2designator : ESC "$" "*" finalchar_for_SS2
2839 * SS3designator : ESC "$" "+" finalchar_for_SS3
2840 *
2841 * ESC $ ) A Indicates the bytes following SO are Chinese
2842 * characters as defined in GB 2312-80, until
2843 * another SOdesignation appears
2844 *
2845 *
2846 * ESC $ ) E Indicates the bytes following SO are as defined
2847 * in ISO-IR-165 (for details, see section 2.1),
2848 * until another SOdesignation appears
2849 *
2850 * ESC $ ) G Indicates the bytes following SO are as defined
2851 * in CNS 11643-plane-1, until another
2852 * SOdesignation appears
2853 *
2854 * ESC $ * H Indicates the two bytes immediately following
2855 * SS2 is a Chinese character as defined in CNS
2856 * 11643-plane-2, until another SS2designation
2857 * appears
 *                (Meaning <ESC>N must precede every 2 byte
2859 * sequence.)
2860 *
2861 * ESC $ + I Indicates the immediate two bytes following SS3
2862 * is a Chinese character as defined in CNS
2863 * 11643-plane-3, until another SS3designation
2864 * appears
 *                (Meaning <ESC>O must precede every 2 byte
2866 * sequence.)
2867 *
2868 * ESC $ + J Indicates the immediate two bytes following SS3
2869 * is a Chinese character as defined in CNS
2870 * 11643-plane-4, until another SS3designation
2871 * appears
 *                (In English: <ESC>O must precede every 2 byte
2873 * sequence.)
2874 *
2875 * ESC $ + K Indicates the immediate two bytes following SS3
2876 * is a Chinese character as defined in CNS
2877 * 11643-plane-5, until another SS3designation
2878 * appears
2879 *
2880 * ESC $ + L Indicates the immediate two bytes following SS3
2881 * is a Chinese character as defined in CNS
2882 * 11643-plane-6, until another SS3designation
2883 * appears
2884 *
2885 * ESC $ + M Indicates the immediate two bytes following SS3
2886 * is a Chinese character as defined in CNS
2887 * 11643-plane-7, until another SS3designation
2888 * appears
2889 *
2890 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2891 * has its own designation information before any Chinese characters
2892 * appear
2893 *
2894 */
2895
/*
 * The following are defined this way to make the strings truly readonly.
 * Each string is a 4-byte designation sequence ESC '$' <intermediate> <final>,
 * spelled with hex escapes: 0x1B=ESC, 0x24='$', 0x29=')', 0x2A='*', 0x2B='+'.
 * ')' designates the SO charset, '*' the SS2 charset, '+' the SS3 charset.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";                /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";                /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";    /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";    /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";    /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";    /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";    /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";    /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";    /* ESC $ + M */
2906
/********************** ISO2022-CN Data **************************/
/*
 * Sequences written by the ISO-2022-CN fromUnicode code when it designates a charset.
 * The fromUnicode function indexes this table directly with the charset number for
 * values below CNS_11643, and with CNS_11643 + (cs - CNS_11643_1) for the CNS planes,
 * so the order of the entries must not change.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
};
2920
2921 static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2922 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2923 UConverter *cnv = args->converter;
2924 UConverterDataISO2022 *converterData;
2925 ISO2022State *pFromU2022State;
2926 uint8_t *target = (uint8_t *) args->target;
2927 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2928 const UChar* source = args->source;
2929 const UChar* sourceLimit = args->sourceLimit;
2930 int32_t* offsets = args->offsets;
2931 UChar32 sourceChar;
2932 char buffer[8];
2933 int32_t len;
2934 int8_t choices[3];
2935 int32_t choiceCount;
2936 uint32_t targetValue = 0;
2937 UBool useFallback;
2938
2939 /* set up the state */
2940 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2941 pFromU2022State = &converterData->fromU2022State;
2942
2943 choiceCount = 0;
2944
2945 /* check if the last codepoint of previous buffer was a lead surrogate*/
2946 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2947 goto getTrail;
2948 }
2949
2950 while( source < sourceLimit){
2951 if(target < targetLimit){
2952
2953 sourceChar = *(source++);
2954 /*check if the char is a First surrogate*/
2955 if(U16_IS_SURROGATE(sourceChar)) {
2956 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2957 getTrail:
2958 /*look ahead to find the trail surrogate*/
2959 if(source < sourceLimit) {
2960 /* test the following code unit */
2961 UChar trail=(UChar) *source;
2962 if(U16_IS_TRAIL(trail)) {
2963 source++;
2964 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2965 cnv->fromUChar32=0x00;
2966 /* convert this supplementary code point */
2967 /* exit this condition tree */
2968 } else {
2969 /* this is an unmatched lead code unit (1st surrogate) */
2970 /* callback(illegal) */
2971 *err=U_ILLEGAL_CHAR_FOUND;
2972 cnv->fromUChar32=sourceChar;
2973 break;
2974 }
2975 } else {
2976 /* no more input */
2977 cnv->fromUChar32=sourceChar;
2978 break;
2979 }
2980 } else {
2981 /* this is an unmatched trail code unit (2nd surrogate) */
2982 /* callback(illegal) */
2983 *err=U_ILLEGAL_CHAR_FOUND;
2984 cnv->fromUChar32=sourceChar;
2985 break;
2986 }
2987 }
2988
2989 /* do the conversion */
2990 if(sourceChar <= 0x007f ){
2991 /* do not convert SO/SI/ESC */
2992 if(IS_2022_CONTROL(sourceChar)) {
2993 /* callback(illegal) */
2994 *err=U_ILLEGAL_CHAR_FOUND;
2995 cnv->fromUChar32=sourceChar;
2996 break;
2997 }
2998
2999 /* US-ASCII */
3000 if(pFromU2022State->g == 0) {
3001 buffer[0] = (char)sourceChar;
3002 len = 1;
3003 } else {
3004 buffer[0] = UCNV_SI;
3005 buffer[1] = (char)sourceChar;
3006 len = 2;
3007 pFromU2022State->g = 0;
3008 choiceCount = 0;
3009 }
3010 if(sourceChar == CR || sourceChar == LF) {
3011 /* reset the state at the end of a line */
3012 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3013 choiceCount = 0;
3014 }
3015 }
3016 else{
3017 /* convert U+0080..U+10ffff */
3018 int32_t i;
3019 int8_t cs, g;
3020
3021 if(choiceCount == 0) {
3022 /* try the current SO/G1 converter first */
3023 choices[0] = pFromU2022State->cs[1];
3024
3025 /* default to GB2312_1 if none is designated yet */
3026 if(choices[0] == 0) {
3027 choices[0] = GB2312_1;
3028 }
3029
3030 if(converterData->version == 0) {
3031 /* ISO-2022-CN */
3032
3033 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3034 if(choices[0] == GB2312_1) {
3035 choices[1] = (int8_t)CNS_11643_1;
3036 } else {
3037 choices[1] = (int8_t)GB2312_1;
3038 }
3039
3040 choiceCount = 2;
3041 } else if (converterData->version == 1) {
3042 /* ISO-2022-CN-EXT */
3043
3044 /* try one of the other converters */
3045 switch(choices[0]) {
3046 case GB2312_1:
3047 choices[1] = (int8_t)CNS_11643_1;
3048 choices[2] = (int8_t)ISO_IR_165;
3049 break;
3050 case ISO_IR_165:
3051 choices[1] = (int8_t)GB2312_1;
3052 choices[2] = (int8_t)CNS_11643_1;
3053 break;
3054 default: /* CNS_11643_x */
3055 choices[1] = (int8_t)GB2312_1;
3056 choices[2] = (int8_t)ISO_IR_165;
3057 break;
3058 }
3059
3060 choiceCount = 3;
3061 } else {
3062 choices[0] = (int8_t)CNS_11643_1;
3063 choices[1] = (int8_t)GB2312_1;
3064 }
3065 }
3066
3067 cs = g = 0;
3068 /*
3069 * len==0: no mapping found yet
3070 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3071 * len>0: found a roundtrip result, done
3072 */
3073 len = 0;
3074 /*
3075 * We will turn off useFallback after finding a fallback,
3076 * but we still get fallbacks from PUA code points as usual.
3077 * Therefore, we will also need to check that we don't overwrite
3078 * an early fallback with a later one.
3079 */
3080 useFallback = cnv->useFallback;
3081
3082 for(i = 0; i < choiceCount && len <= 0; ++i) {
3083 int8_t cs0 = choices[i];
3084 if(cs0 > 0) {
3085 uint32_t value;
3086 int32_t len2;
3087 if(cs0 >= CNS_11643_0) {
3088 len2 = MBCS_FROM_UCHAR32_ISO2022(
3089 converterData->myConverterArray[CNS_11643],
3090 sourceChar,
3091 &value,
3092 useFallback,
3093 MBCS_OUTPUT_3);
3094 if(len2 == 3 || (len2 == -3 && len == 0)) {
3095 targetValue = value;
3096 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3097 if(len2 >= 0) {
3098 len = 2;
3099 } else {
3100 len = -2;
3101 useFallback = FALSE;
3102 }
3103 if(cs == CNS_11643_1) {
3104 g = 1;
3105 } else if(cs == CNS_11643_2) {
3106 g = 2;
3107 } else /* plane 3..7 */ if(converterData->version == 1) {
3108 g = 3;
3109 } else {
3110 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3111 len = 0;
3112 }
3113 }
3114 } else {
3115 /* GB2312_1 or ISO-IR-165 */
3116 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3117 len2 = MBCS_FROM_UCHAR32_ISO2022(
3118 converterData->myConverterArray[cs0],
3119 sourceChar,
3120 &value,
3121 useFallback,
3122 MBCS_OUTPUT_2);
3123 if(len2 == 2 || (len2 == -2 && len == 0)) {
3124 targetValue = value;
3125 len = len2;
3126 cs = cs0;
3127 g = 1;
3128 useFallback = FALSE;
3129 }
3130 }
3131 }
3132 }
3133
3134 if(len != 0) {
3135 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3136
3137 /* write the designation sequence if necessary */
3138 if(cs != pFromU2022State->cs[g]) {
3139 if(cs < CNS_11643) {
3140 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3141 } else {
3142 U_ASSERT(cs >= CNS_11643_1);
3143 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3144 }
3145 len = 4;
3146 pFromU2022State->cs[g] = cs;
3147 if(g == 1) {
3148 /* changing the SO/G1 charset invalidates the choices[] */
3149 choiceCount = 0;
3150 }
3151 }
3152
3153 /* write the shift sequence if necessary */
3154 if(g != pFromU2022State->g) {
3155 switch(g) {
3156 case 1:
3157 buffer[len++] = UCNV_SO;
3158
3159 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3160 pFromU2022State->g = 1;
3161 break;
3162 case 2:
3163 buffer[len++] = 0x1b;
3164 buffer[len++] = 0x4e;
3165 break;
3166 default: /* case 3 */
3167 buffer[len++] = 0x1b;
3168 buffer[len++] = 0x4f;
3169 break;
3170 }
3171 }
3172
3173 /* write the two output bytes */
3174 buffer[len++] = (char)(targetValue >> 8);
3175 buffer[len++] = (char)targetValue;
3176 } else {
3177 /* if we cannot find the character after checking all codepages
3178 * then this is an error
3179 */
3180 *err = U_INVALID_CHAR_FOUND;
3181 cnv->fromUChar32=sourceChar;
3182 break;
3183 }
3184 }
3185
3186 /* output len>0 bytes in buffer[] */
3187 if(len == 1) {
3188 *target++ = buffer[0];
3189 if(offsets) {
3190 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3191 }
3192 } else if(len == 2 && (target + 2) <= targetLimit) {
3193 *target++ = buffer[0];
3194 *target++ = buffer[1];
3195 if(offsets) {
3196 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3197 *offsets++ = sourceIndex;
3198 *offsets++ = sourceIndex;
3199 }
3200 } else {
3201 fromUWriteUInt8(
3202 cnv,
3203 buffer, len,
3204 &target, (const char *)targetLimit,
3205 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3206 err);
3207 if(U_FAILURE(*err)) {
3208 break;
3209 }
3210 }
3211 } /* end if(myTargetIndex<myTargetLength) */
3212 else{
3213 *err =U_BUFFER_OVERFLOW_ERROR;
3214 break;
3215 }
3216
3217 }/* end while(mySourceIndex<mySourceLength) */
3218
3219 /*
3220 * the end of the input stream and detection of truncated input
3221 * are handled by the framework, but for ISO-2022-CN conversion
3222 * we need to be in ASCII mode at the very end
3223 *
3224 * conditions:
3225 * successful
3226 * not in ASCII mode
3227 * end of input and no truncated input
3228 */
3229 if( U_SUCCESS(*err) &&
3230 pFromU2022State->g!=0 &&
3231 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3232 ) {
3233 int32_t sourceIndex;
3234
3235 /* we are switching to ASCII */
3236 pFromU2022State->g=0;
3237
3238 /* get the source index of the last input character */
3239 /*
3240 * TODO this would be simpler and more reliable if we used a pair
3241 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3242 * so that we could simply use the prevSourceIndex here;
3243 * this code gives an incorrect result for the rare case of an unmatched
3244 * trail surrogate that is alone in the last buffer of the text stream
3245 */
3246 sourceIndex=(int32_t)(source-args->source);
3247 if(sourceIndex>0) {
3248 --sourceIndex;
3249 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3250 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3251 ) {
3252 --sourceIndex;
3253 }
3254 } else {
3255 sourceIndex=-1;
3256 }
3257
3258 fromUWriteUInt8(
3259 cnv,
3260 SHIFT_IN_STR, 1,
3261 &target, (const char *)targetLimit,
3262 &offsets, sourceIndex,
3263 err);
3264 }
3265
3266 /*save the state and return */
3267 args->source = source;
3268 args->target = (char*)target;
3269 }
3270
3271
/*
 * Converts ISO-2022-CN family bytes to UTF-16, optionally recording offsets.
 *
 * args: source/sourceLimit delimit the byte input, target/targetLimit the UTF-16 output,
 *       offsets (may be NULL) receives one source index per output UChar.
 *       On return args->source and args->target are advanced past what was consumed/written.
 * err:  U_ILLEGAL_ESCAPE_SEQUENCE for an empty SO...SI / SO...designator segment,
 *       whatever changeState_2022() or toUnicodeCallback() report for bad sequences/bytes,
 *       U_BUFFER_OVERFLOW_ERROR when the target is full while input remains.
 *
 * State kept across calls:
 *   myData->key                        non-zero while an escape sequence is only partially read
 *   args->converter->toUBytes/toULength  a lead byte whose trail byte has not arrived yet
 *   myData->toU2022State               designated charsets cs[], invoked element g, and prevG
 *   myData->isEmptySegment             TRUE right after SO until a character is converted
 */
static void U_CALLCONV
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* stays at missingCharMarker unless a mapping is found below */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                /* shift in: back to single-byte mode */
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    /* SO immediately followed by SI: report the SI as an irregular sequence */
                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                    /* leaves the switch with targetUniChar==missingCharMarker, so the callback path below runs */
                    break;
                }

            case ESC_2022:
                /* un-read the ESC so that changeState_2022() sees the whole sequence */
                mySource--;
escape:
                {
                    /* remember where we started so the consumed bytes can be counted on error */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
            case LF:
                /* end of line: forget all designations and return to ASCII, then convert the byte itself */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                U_FALLTHROUGH;
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    /* double-byte mode (SO, SS2 or SS3) */
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS planes share one table: prefix the pair with a plane byte (0x80 + plane) */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            /* keep both bytes so that offsets and error reporting see a 2-byte character */
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* the trail byte is in the next buffer: save the lead byte and stop */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* single-byte mode: only 7-bit bytes map; others stay at missingCharMarker */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result; the offset points at the first byte of the 1- or 2-byte character */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's UChar error buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    args->target = myTarget;
    args->source = mySource;
}
3472 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3473
3474 static void U_CALLCONV
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3475 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3476 UConverter *cnv = args->converter;
3477 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3478 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3479 char *p, *subchar;
3480 char buffer[8];
3481 int32_t length;
3482
3483 subchar=(char *)cnv->subChars;
3484 length=cnv->subCharLen; /* assume length==1 for most variants */
3485
3486 p = buffer;
3487 switch(myConverterData->locale[0]){
3488 case 'j':
3489 {
3490 int8_t cs;
3491
3492 if(pFromU2022State->g == 1) {
3493 /* JIS7: switch from G1 to G0 */
3494 pFromU2022State->g = 0;
3495 *p++ = UCNV_SI;
3496 }
3497
3498 cs = pFromU2022State->cs[0];
3499 if(cs != ASCII && cs != JISX201) {
3500 /* not in ASCII or JIS X 0201: switch to ASCII */
3501 pFromU2022State->cs[0] = (int8_t)ASCII;
3502 *p++ = '\x1b';
3503 *p++ = '\x28';
3504 *p++ = '\x42';
3505 }
3506
3507 *p++ = subchar[0];
3508 break;
3509 }
3510 case 'c':
3511 if(pFromU2022State->g != 0) {
3512 /* not in ASCII mode: switch to ASCII */
3513 pFromU2022State->g = 0;
3514 *p++ = UCNV_SI;
3515 }
3516 *p++ = subchar[0];
3517 break;
3518 case 'k':
3519 if(myConverterData->version == 0) {
3520 if(length == 1) {
3521 if((UBool)args->converter->fromUnicodeStatus) {
3522 /* in DBCS mode: switch to SBCS */
3523 args->converter->fromUnicodeStatus = 0;
3524 *p++ = UCNV_SI;
3525 }
3526 *p++ = subchar[0];
3527 } else /* length == 2*/ {
3528 if(!(UBool)args->converter->fromUnicodeStatus) {
3529 /* in SBCS mode: switch to DBCS */
3530 args->converter->fromUnicodeStatus = 1;
3531 *p++ = UCNV_SO;
3532 }
3533 *p++ = subchar[0];
3534 *p++ = subchar[1];
3535 }
3536 break;
3537 } else {
3538 /* save the subconverter's substitution string */
3539 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3540 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3541
3542 /* set our substitution string into the subconverter */
3543 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3544 myConverterData->currentConverter->subCharLen = (int8_t)length;
3545
3546 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3547 args->converter = myConverterData->currentConverter;
3548 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3549 ucnv_cbFromUWriteSub(args, 0, err);
3550 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3551 args->converter = cnv;
3552
3553 /* restore the subconverter's substitution string */
3554 myConverterData->currentConverter->subChars = currentSubChars;
3555 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3556
3557 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3558 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3559 uprv_memcpy(
3560 cnv->charErrorBuffer,
3561 myConverterData->currentConverter->charErrorBuffer,
3562 myConverterData->currentConverter->charErrorBufferLength);
3563 }
3564 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3565 myConverterData->currentConverter->charErrorBufferLength = 0;
3566 }
3567 return;
3568 }
3569 default:
3570 /* not expected */
3571 break;
3572 }
3573 ucnv_cbFromUWriteBytes(args,
3574 buffer, (int32_t)(p - buffer),
3575 offsetIndex, err);
3576 }
3577
3578 /*
3579 * Structure for cloning an ISO 2022 converter into a single memory block.
3580 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3581 * and then ucnv_safeClone() of the sub-converter may additionally align
3582 * currentConverter inside the cloneStruct, for which we need the deadSpace
3583 * after currentConverter.
3584 * This is because UAlignedMemory may be larger than the actually
3585 * necessary alignment size for the platform.
3586 * The other cloneStruct fields will not be moved around,
3587 * and are aligned properly with cloneStruct's alignment.
3588 */
struct cloneStruct
{
    UConverter cnv;                 /* the cloned ISO-2022 converter itself; its address is what SafeClone returns */
    UConverter currentConverter;    /* storage handed to ucnv_safeClone() for the sub-converter clone */
    UAlignedMemory deadSpace;       /* padding in case the sub-converter clone gets re-aligned inside the block */
    UConverterDataISO2022 mydata;   /* copy of the ISO-2022 state; the clone's extraInfo points here */
};
3596
3597
3598 U_CDECL_BEGIN
3599
3600 static UConverter * U_CALLCONV
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3601 _ISO_2022_SafeClone(
3602 const UConverter *cnv,
3603 void *stackBuffer,
3604 int32_t *pBufferSize,
3605 UErrorCode *status)
3606 {
3607 struct cloneStruct * localClone;
3608 UConverterDataISO2022 *cnvData;
3609 int32_t i, size;
3610
3611 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3612 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3613 return NULL;
3614 }
3615
3616 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3617 localClone = (struct cloneStruct *)stackBuffer;
3618
3619 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3620
3621 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3622 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3623 localClone->cnv.isExtraLocal = TRUE;
3624
3625 /* share the subconverters */
3626
3627 if(cnvData->currentConverter != NULL) {
3628 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3629 localClone->mydata.currentConverter =
3630 ucnv_safeClone(cnvData->currentConverter,
3631 &localClone->currentConverter,
3632 &size, status);
3633 if(U_FAILURE(*status)) {
3634 return NULL;
3635 }
3636 }
3637
3638 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3639 if(cnvData->myConverterArray[i] != NULL) {
3640 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3641 }
3642 }
3643
3644 return &localClone->cnv;
3645 }
3646
3647 U_CDECL_END
3648
3649 static void U_CALLCONV
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3650 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3651 const USetAdder *sa,
3652 UConverterUnicodeSet which,
3653 UErrorCode *pErrorCode)
3654 {
3655 int32_t i;
3656 UConverterDataISO2022* cnvData;
3657
3658 if (U_FAILURE(*pErrorCode)) {
3659 return;
3660 }
3661 #ifdef U_ENABLE_GENERIC_ISO_2022
3662 if (cnv->sharedData == &_ISO2022Data) {
3663 /* We use UTF-8 in this case */
3664 sa->addRange(sa->set, 0, 0xd7FF);
3665 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3666 return;
3667 }
3668 #endif
3669
3670 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3671
3672 /* open a set and initialize it with code points that are algorithmically round-tripped */
3673 switch(cnvData->locale[0]){
3674 case 'j':
3675 /* include JIS X 0201 which is hardcoded */
3676 sa->add(sa->set, 0xa5);
3677 sa->add(sa->set, 0x203e);
3678 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3679 /* include Latin-1 for some variants of JP */
3680 sa->addRange(sa->set, 0, 0xff);
3681 } else {
3682 /* include ASCII for JP */
3683 sa->addRange(sa->set, 0, 0x7f);
3684 }
3685 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3686 /*
3687 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3688 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3689 * use half-width Katakana.
3690 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3691 * half-width Katakana via the ESC ( I sequence.
3692 * However, we only emit (fromUnicode) half-width Katakana according to the
3693 * definition of each variant.
3694 *
3695 * When including fallbacks,
3696 * we need to include half-width Katakana Unicode code points for all JP variants because
3697 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3698 */
3699 /* include half-width Katakana for JP */
3700 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3701 }
3702 break;
3703 #if !UCONFIG_ONLY_HTML_CONVERSION
3704 case 'c':
3705 case 'z':
3706 /* include ASCII for CN */
3707 sa->addRange(sa->set, 0, 0x7f);
3708 break;
3709 case 'k':
3710 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3711 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3712 cnvData->currentConverter, sa, which, pErrorCode);
3713 /* the loop over myConverterArray[] will simply not find another converter */
3714 break;
3715 #endif
3716 default:
3717 break;
3718 }
3719
3720 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3721 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3722 cnvData->version==0 && i==CNS_11643
3723 ) {
3724 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3725 ucnv_MBCSGetUnicodeSetForBytes(
3726 cnvData->myConverterArray[i],
3727 sa, UCNV_ROUNDTRIP_SET,
3728 0, 0x81, 0x82,
3729 pErrorCode);
3730 }
3731 #endif
3732
3733 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3734 UConverterSetFilter filter;
3735 if(cnvData->myConverterArray[i]!=NULL) {
3736 if(cnvData->locale[0]=='j' && i==JISX208) {
3737 /*
3738 * Only add code points that map to Shift-JIS codes
3739 * corresponding to JIS X 0208.
3740 */
3741 filter=UCNV_SET_FILTER_SJIS;
3742 #if !UCONFIG_ONLY_HTML_CONVERSION
3743 } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3744 cnvData->version==0 && i==CNS_11643) {
3745 /*
3746 * Version-specific for CN:
3747 * CN version 0 does not map CNS planes 3..7 although
3748 * they are all available in the CNS conversion table;
3749 * CN version 1 (-EXT) does map them all.
3750 * The two versions create different Unicode sets.
3751 */
3752 filter=UCNV_SET_FILTER_2022_CN;
3753 } else if(i==KSC5601) {
3754 /*
3755 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3756 * are broader than GR94.
3757 */
3758 filter=UCNV_SET_FILTER_GR94DBCS;
3759 #endif
3760 } else {
3761 filter=UCNV_SET_FILTER_NONE;
3762 }
3763 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3764 }
3765 }
3766
3767 /*
3768 * ISO 2022 converters must not convert SO/SI/ESC despite what
3769 * sub-converters do by themselves.
3770 * Remove these characters from the set.
3771 */
3772 sa->remove(sa->set, 0x0e);
3773 sa->remove(sa->set, 0x0f);
3774 sa->remove(sa->set, 0x1b);
3775
3776 /* ISO 2022 converters do not convert C1 controls either */
3777 sa->removeRange(sa->set, 0x80, 0x9f);
3778 }
3779
3780 static const UConverterImpl _ISO2022Impl={
3781 UCNV_ISO_2022,
3782
3783 NULL,
3784 NULL,
3785
3786 _ISO2022Open,
3787 _ISO2022Close,
3788 _ISO2022Reset,
3789
3790 #ifdef U_ENABLE_GENERIC_ISO_2022
3791 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3792 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3793 ucnv_fromUnicode_UTF8,
3794 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3795 #else
3796 NULL,
3797 NULL,
3798 NULL,
3799 NULL,
3800 #endif
3801 NULL,
3802
3803 NULL,
3804 _ISO2022getName,
3805 _ISO_2022_WriteSub,
3806 _ISO_2022_SafeClone,
3807 _ISO_2022_GetUnicodeSet,
3808
3809 NULL,
3810 NULL
3811 };
3812 static const UConverterStaticData _ISO2022StaticData={
3813 sizeof(UConverterStaticData),
3814 "ISO_2022",
3815 2022,
3816 UCNV_IBM,
3817 UCNV_ISO_2022,
3818 1,
3819 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3820 { 0x1a, 0, 0, 0 },
3821 1,
3822 FALSE,
3823 FALSE,
3824 0,
3825 0,
3826 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3827 };
3828 const UConverterSharedData _ISO2022Data=
3829 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3830
3831 /*************JP****************/
3832 static const UConverterImpl _ISO2022JPImpl={
3833 UCNV_ISO_2022,
3834
3835 NULL,
3836 NULL,
3837
3838 _ISO2022Open,
3839 _ISO2022Close,
3840 _ISO2022Reset,
3841
3842 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3843 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3844 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3845 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3846 NULL,
3847
3848 NULL,
3849 _ISO2022getName,
3850 _ISO_2022_WriteSub,
3851 _ISO_2022_SafeClone,
3852 _ISO_2022_GetUnicodeSet,
3853
3854 NULL,
3855 NULL
3856 };
3857 static const UConverterStaticData _ISO2022JPStaticData={
3858 sizeof(UConverterStaticData),
3859 "ISO_2022_JP",
3860 0,
3861 UCNV_IBM,
3862 UCNV_ISO_2022,
3863 1,
3864 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3865 { 0x1a, 0, 0, 0 },
3866 1,
3867 FALSE,
3868 FALSE,
3869 0,
3870 0,
3871 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3872 };
3873
3874 namespace {
3875
3876 const UConverterSharedData _ISO2022JPData=
3877 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3878
3879 } // namespace
3880
3881 #if !UCONFIG_ONLY_HTML_CONVERSION
3882 /************* KR ***************/
3883 static const UConverterImpl _ISO2022KRImpl={
3884 UCNV_ISO_2022,
3885
3886 NULL,
3887 NULL,
3888
3889 _ISO2022Open,
3890 _ISO2022Close,
3891 _ISO2022Reset,
3892
3893 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3894 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3895 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3896 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3897 NULL,
3898
3899 NULL,
3900 _ISO2022getName,
3901 _ISO_2022_WriteSub,
3902 _ISO_2022_SafeClone,
3903 _ISO_2022_GetUnicodeSet,
3904
3905 NULL,
3906 NULL
3907 };
3908 static const UConverterStaticData _ISO2022KRStaticData={
3909 sizeof(UConverterStaticData),
3910 "ISO_2022_KR",
3911 0,
3912 UCNV_IBM,
3913 UCNV_ISO_2022,
3914 1,
3915 8, /* max 8 bytes per UChar */
3916 { 0x1a, 0, 0, 0 },
3917 1,
3918 FALSE,
3919 FALSE,
3920 0,
3921 0,
3922 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3923 };
3924
3925 namespace {
3926
3927 const UConverterSharedData _ISO2022KRData=
3928 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3929
3930 } // namespace
3931
3932 /*************** CN ***************/
3933 static const UConverterImpl _ISO2022CNImpl={
3934
3935 UCNV_ISO_2022,
3936
3937 NULL,
3938 NULL,
3939
3940 _ISO2022Open,
3941 _ISO2022Close,
3942 _ISO2022Reset,
3943
3944 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3945 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3946 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3947 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3948 NULL,
3949
3950 NULL,
3951 _ISO2022getName,
3952 _ISO_2022_WriteSub,
3953 _ISO_2022_SafeClone,
3954 _ISO_2022_GetUnicodeSet,
3955
3956 NULL,
3957 NULL
3958 };
3959 static const UConverterStaticData _ISO2022CNStaticData={
3960 sizeof(UConverterStaticData),
3961 "ISO_2022_CN",
3962 0,
3963 UCNV_IBM,
3964 UCNV_ISO_2022,
3965 1,
3966 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3967 { 0x1a, 0, 0, 0 },
3968 1,
3969 FALSE,
3970 FALSE,
3971 0,
3972 0,
3973 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3974 };
3975
3976 namespace {
3977
3978 const UConverterSharedData _ISO2022CNData=
3979 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3980
3981 } // namespace
3982 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3983
#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */
3985