1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2000-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnvisci.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2001JUN26
14 * created by: Ram Viswanadha
15 *
16 * Date Name Description
17 * 24/7/2001 Ram Added support for EXT character handling
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
23
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/utf16.h"
27 #include "cmemory.h"
28 #include "ucnv_bld.h"
29 #include "ucnv_cnv.h"
30 #include "cstring.h"
31 #include "uassert.h"
32
/* Low bits of the converter options that carry the requested ISCII version. */
#define UCNV_OPTIONS_VERSION_MASK 0xf

/* Unicode code points that get special treatment. */
#define NUKTA               0x093c
#define HALANT              0x094d
#define ZWNJ                0x200c /* Zero Width Non Joiner */
#define ZWJ                 0x200d /* Zero Width Joiner */
#define DANDA               0x0964
#define DOUBLE_DANDA        0x0965
#define VOCALLIC_RR         0x0931
#define DEV_ABBR_SIGN       0x0970
#define DEV_ANUDATTA        0x0952
#define LF                  0x0A

/* Sentinel values. */
#define INVALID_CHAR        0xffff
#define NO_CHAR_MARKER      0xFFFE

/* ISCII byte values. */
#define ATR                 0xEF /* Attribute code */
#define EXT                 0xF0 /* Extension code */
#define ISCII_NUKTA         0xE9
#define ISCII_HALANT        0xE8
#define ISCII_DANDA         0xEA
#define ISCII_INV           0xD9
#define ISCII_VOWEL_SIGN_E  0xE0
#define ASCII_END           0xA0
#define EXT_RANGE_BEGIN     0xA1
#define EXT_RANGE_END       0xEE

/* Extent of the Unicode range handled by this converter. */
#define INDIC_BLOCK_BEGIN   0x0900
#define INDIC_BLOCK_END     0x0D7F
#define INDIC_RANGE         (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN)

/*
 * Delta of the Telugu script (script index times DELTA).
 * The replacement list is parenthesized so that the macro behaves as a
 * single operand in any surrounding expression (e.g. x - TELUGU_DELTA,
 * x / TELUGU_DELTA).
 */
#define TELUGU_DELTA        (DELTA * TELUGU)

/* Gurmukhi (PNJ) specific code points and the Gurmukhi delta. */
#define PNJ_DELTA           0x0100
#define PNJ_BINDI           0x0A02
#define PNJ_TIPPI           0x0A70
#define PNJ_SIGN_VIRAMA     0x0A4D
#define PNJ_ADHAK           0x0A71
#define PNJ_HA              0x0A39
#define PNJ_RRA             0x0A5C
68
/*
 * Index of each Unicode script block supported by the converter, in the
 * order of lookupInitialData. DELTA is not a script: it is the factor a
 * script index is multiplied by to obtain that script's delta
 * (see _ISCIIOpen: uniLang * DELTA).
 */
typedef enum {
    DEVANAGARI = 0,
    BENGALI    = 1,
    GURMUKHI   = 2,
    GUJARATI   = 3,
    ORIYA      = 4,
    TAMIL      = 5,
    TELUGU     = 6,
    KANNADA    = 7,
    MALAYALAM  = 8,
    DELTA      = 0x80
} UniLang;
81
/**
 * Script selector bytes. When the attribute code <ATR> is followed by one
 * of these values, the converter switches to the corresponding code page.
 *
 * DEV, BNG, PNJ, GJR, ORI, TML, TLG, KND and MLM are the selectors paired
 * with the Unicode scripts in lookupInitialData; ASM (Assamese) shares the
 * Bengali code points. The remaining abbreviations are presumably the
 * other scripts named by the ISCII standard -- confirm against the
 * standard before relying on them.
 */
typedef enum {
    /* default and roman */
    DEF = 0x40,
    RMN = 0x41,

    /* Indic scripts */
    DEV = 0x42,
    BNG = 0x43,
    TML = 0x44,
    TLG = 0x45,
    ASM = 0x46,
    ORI = 0x47,
    KND = 0x48,
    MLM = 0x49,
    GJR = 0x4A,
    PNJ = 0x4B,

    /* selectors in the 0x71..0x76 range */
    ARB = 0x71,
    PES = 0x72,
    URD = 0x73,
    SND = 0x74,
    KSM = 0x75,
    PST = 0x76
} ISCIILang;
106
/*
 * One bit per script (or script pair) used in validityTable entries.
 * Telugu and Kannada share KND_MASK (see lookupInitialData).
 */
typedef enum {
    ZERO     = 0,
    TML_MASK = 1 << 0,
    MLM_MASK = 1 << 1,
    KND_MASK = 1 << 2,
    BNG_MASK = 1 << 3,
    ORI_MASK = 1 << 4,
    GJR_MASK = 1 << 5,
    PNJ_MASK = 1 << 6,
    DEV_MASK = 1 << 7
} MaskEnum;
118
119 #define ISCII_CNV_PREFIX "ISCII,version="
120
121 typedef struct {
122 UChar contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */
123 UChar contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */
124 uint16_t defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */
125 uint16_t currentDeltaFromUnicode; /* current delta in Indic block */
126 uint16_t currentDeltaToUnicode; /* current delta in Indic block */
127 MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */
128 MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */
129 MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */
130 UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */
131 UBool resetToDefaultToUnicode; /* boolean for reseting to default delta and mask when a newline is encountered*/
132 char name[sizeof(ISCII_CNV_PREFIX) + 1];
133 UChar32 prevToUnicodeStatus; /* Hold the previous toUnicodeStatus. This is necessary because we may need to know the last two code points. */
134 } UConverterDataISCII;
135
136 typedef struct LookupDataStruct {
137 UniLang uniLang;
138 MaskEnum maskEnum;
139 ISCIILang isciiLang;
140 } LookupDataStruct;
141
142 static const LookupDataStruct lookupInitialData[]={
143 { DEVANAGARI, DEV_MASK, DEV },
144 { BENGALI, BNG_MASK, BNG },
145 { GURMUKHI, PNJ_MASK, PNJ },
146 { GUJARATI, GJR_MASK, GJR },
147 { ORIYA, ORI_MASK, ORI },
148 { TAMIL, TML_MASK, TML },
149 { TELUGU, KND_MASK, TLG },
150 { KANNADA, KND_MASK, KND },
151 { MALAYALAM, MLM_MASK, MLM }
152 };
153
/*
 * Flags for the special handling of certain Gurmukhi characters, indexed
 * by (code point - 0x0A00) for U+0A00..U+0A4F.
 *   bit 0 (value 1): PNJ consonant
 *   bit 1 (value 2): PNJ Bindi Tippi
 * An entry of 3 has both bits set.
 */
static const uint8_t pnjMap[80] = {
    0, 0, 0, 0, 0, 2, 0, 2,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0A00..0A0F */
    0, 0, 0, 0, 0, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,   /* 0A10..0A1F */
    3, 3, 3, 3, 3, 3, 3, 3,   3, 0, 3, 3, 3, 3, 3, 3,   /* 0A20..0A2F */
    3, 0, 0, 0, 0, 3, 3, 0,   3, 3, 0, 0, 0, 0, 0, 2,   /* 0A30..0A3F */
    0, 2, 2, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0    /* 0A40..0A4F */
};
171
172 static UBool
isPNJConsonant(UChar32 c)173 isPNJConsonant(UChar32 c) {
174 if (c < 0xa00 || 0xa50 <= c) {
175 return FALSE;
176 } else {
177 return (UBool)(pnjMap[c - 0xa00] & 1);
178 }
179 }
180
181 static UBool
isPNJBindiTippi(UChar32 c)182 isPNJBindiTippi(UChar32 c) {
183 if (c < 0xa00 || 0xa50 <= c) {
184 return FALSE;
185 } else {
186 return (UBool)(pnjMap[c - 0xa00] >> 1);
187 }
188 }
189
/*
 * Allocates and initializes the ISCII state of a converter.
 *
 * Does nothing when pArgs->onlyTestIsLoadable is set. Otherwise the state
 * is allocated into cnv->extraInfo and initialized for the script selected
 * by the version number in the low bits of pArgs->options (0..8, an index
 * into lookupInitialData).
 *
 * On failure cnv->extraInfo is NULL and *errorCode is set to
 *   U_MEMORY_ALLOCATION_ERROR  if the allocation failed, or
 *   U_ILLEGAL_ARGUMENT_ERROR   if the requested version is not supported.
 */
static void _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode) {
    UConverterDataISCII *state;
    int32_t version;
    int32_t prefixLength;

    if (pArgs->onlyTestIsLoadable) {
        return;
    }

    cnv->extraInfo = uprv_malloc(sizeof(UConverterDataISCII));
    if (cnv->extraInfo == NULL) {
        *errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    /* Assigned before the version check, so it is set even for a rejected version. */
    cnv->toUnicodeStatus = missingCharMarker;

    /* check if the version requested is supported */
    version = (int32_t)(pArgs->options & UCNV_OPTIONS_VERSION_MASK);
    if (version >= 9) {
        uprv_free(cnv->extraInfo);
        cnv->extraInfo = NULL;
        *errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    state = (UConverterDataISCII *) cnv->extraInfo;

    /* context tracking starts out empty */
    state->contextCharToUnicode = NO_CHAR_MARKER;
    state->contextCharFromUnicode = 0x0000;
    state->resetToDefaultToUnicode = FALSE;
    state->prevToUnicodeStatus = 0x0000;
    state->isFirstBuffer = TRUE;

    /* default and current script state both start at the requested script */
    state->defDeltaToUnicode = (uint16_t)(lookupInitialData[version].uniLang * DELTA);
    state->currentDeltaToUnicode = state->defDeltaToUnicode;
    state->currentDeltaFromUnicode = state->defDeltaToUnicode;

    state->defMaskToUnicode = lookupInitialData[version].maskEnum;
    state->currentMaskToUnicode = state->defMaskToUnicode;
    state->currentMaskFromUnicode = state->defMaskToUnicode;

    /* name = prefix + single version digit */
    (void)uprv_strcpy(state->name, ISCII_CNV_PREFIX);
    prefixLength = (int32_t)uprv_strlen(state->name);
    state->name[prefixLength] = (char)(version + '0');
    state->name[prefixLength + 1] = 0;
}
233
/*
 * Releases the ISCII state of a converter. The memory is freed only when
 * cnv->isExtraLocal is not set; the extraInfo pointer is cleared either way.
 */
static void _ISCIIClose(UConverter *cnv) {
    if (cnv->extraInfo == NULL) {
        return;
    }
    if (!cnv->isExtraLocal) {
        uprv_free(cnv->extraInfo);
    }
    cnv->extraInfo = NULL;
}
242
_ISCIIgetName(const UConverter * cnv)243 static const char* _ISCIIgetName(const UConverter* cnv) {
244 if (cnv->extraInfo) {
245 UConverterDataISCII* myData= (UConverterDataISCII*)cnv->extraInfo;
246 return myData->name;
247 }
248 return NULL;
249 }
250
/*
 * Resets the conversion state selected by choice.
 *
 * The toUnicode state is reset when choice <= UCNV_RESET_TO_UNICODE, the
 * fromUnicode state when choice != UCNV_RESET_TO_UNICODE; a choice that
 * satisfies both conditions resets both directions. Both directions fall
 * back to the default delta and mask recorded at open time.
 *
 * Dereferences cnv->extraInfo without a NULL check.
 */
static void _ISCIIReset(UConverter *cnv, UConverterResetChoice choice) {
    UConverterDataISCII *state = (UConverterDataISCII *) cnv->extraInfo;

    if (choice <= UCNV_RESET_TO_UNICODE) {
        /* toUnicode direction */
        cnv->mode = 0;
        cnv->toUnicodeStatus = missingCharMarker;
        state->prevToUnicodeStatus = 0x0000;
        state->contextCharToUnicode = NO_CHAR_MARKER;
        state->currentMaskToUnicode = state->defMaskToUnicode;
        state->currentDeltaToUnicode = state->defDeltaToUnicode;
    }

    if (choice != UCNV_RESET_TO_UNICODE) {
        /* fromUnicode direction */
        cnv->fromUChar32 = 0x0000;
        state->isFirstBuffer = TRUE;
        state->resetToDefaultToUnicode = FALSE;
        state->contextCharFromUnicode = 0x00;
        state->currentDeltaFromUnicode = state->defDeltaToUnicode;
        state->currentMaskFromUnicode = state->defMaskToUnicode;
    }
}
270
271 /**
272 * The values in validity table are indexed by the lower bits of Unicode
273 * range 0x0900 - 0x09ff. The values have a structure like:
274 * ---------------------------------------------------------------
275 * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML |
276 * | | | | | ASM | KND | | |
277 * ---------------------------------------------------------------
278 * If a code point is valid in a particular script
279 * then that bit is turned on
280 *
281 * Unicode does not distinguish between Bengali and Assamese so we use 1 bit for
282 * to represent these languages
283 *
284 * Telugu and Kannada have same codepoints except for Vocallic_RR which we special case
285 * and combine and use 1 bit to represent these languages.
286 *
287 * TODO: It is probably easier to understand and maintain to change this
288 * to use uint16_t and give each of the 9 Unicode/script blocks its own bit.
289 */
290
291 static const uint8_t validityTable[128] = {
292 /* This state table is tool generated please do not edit unless you know exactly what you are doing */
293 /* Note: This table was edited to mirror the Windows XP implementation */
294 /*ISCII:Valid:Unicode */
295 /*0xa0 : 0x00: 0x900 */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
296 /*0xa1 : 0xb8: 0x901 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
297 /*0xa2 : 0xfe: 0x902 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
298 /*0xa3 : 0xbf: 0x903 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
299 /*0x00 : 0x00: 0x904 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
300 /*0xa4 : 0xff: 0x905 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
301 /*0xa5 : 0xff: 0x906 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
302 /*0xa6 : 0xff: 0x907 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
303 /*0xa7 : 0xff: 0x908 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
304 /*0xa8 : 0xff: 0x909 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
305 /*0xa9 : 0xff: 0x90a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
306 /*0xaa : 0xfe: 0x90b */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
307 /*0x00 : 0x00: 0x90c */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
308 /*0xae : 0x80: 0x90d */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
309 /*0xab : 0x87: 0x90e */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
310 /*0xac : 0xff: 0x90f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
311 /*0xad : 0xff: 0x910 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
312 /*0xb2 : 0x80: 0x911 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
313 /*0xaf : 0x87: 0x912 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
314 /*0xb0 : 0xff: 0x913 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
315 /*0xb1 : 0xff: 0x914 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
316 /*0xb3 : 0xff: 0x915 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
317 /*0xb4 : 0xfe: 0x916 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
318 /*0xb5 : 0xfe: 0x917 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
319 /*0xb6 : 0xfe: 0x918 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
320 /*0xb7 : 0xff: 0x919 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
321 /*0xb8 : 0xff: 0x91a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
322 /*0xb9 : 0xfe: 0x91b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
323 /*0xba : 0xff: 0x91c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
324 /*0xbb : 0xfe: 0x91d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
325 /*0xbc : 0xff: 0x91e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
326 /*0xbd : 0xff: 0x91f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
327 /*0xbe : 0xfe: 0x920 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
328 /*0xbf : 0xfe: 0x921 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
329 /*0xc0 : 0xfe: 0x922 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
330 /*0xc1 : 0xff: 0x923 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
331 /*0xc2 : 0xff: 0x924 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
332 /*0xc3 : 0xfe: 0x925 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
333 /*0xc4 : 0xfe: 0x926 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
334 /*0xc5 : 0xfe: 0x927 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
335 /*0xc6 : 0xff: 0x928 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
336 /*0xc7 : 0x81: 0x929 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + TML_MASK ,
337 /*0xc8 : 0xff: 0x92a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
338 /*0xc9 : 0xfe: 0x92b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
339 /*0xca : 0xfe: 0x92c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
340 /*0xcb : 0xfe: 0x92d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
341 /*0xcc : 0xfe: 0x92e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
342 /*0xcd : 0xff: 0x92f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
343 /*0xcf : 0xff: 0x930 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
344 /*0xd0 : 0x87: 0x931 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK ,
345 /*0xd1 : 0xff: 0x932 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
346 /*0xd2 : 0xb7: 0x933 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
347 /*0xd3 : 0x83: 0x934 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK ,
348 /*0xd4 : 0xff: 0x935 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
349 /*0xd5 : 0xfe: 0x936 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
350 /*0xd6 : 0xbf: 0x937 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
351 /*0xd7 : 0xff: 0x938 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
352 /*0xd8 : 0xff: 0x939 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
353 /*0x00 : 0x00: 0x93A */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
354 /*0x00 : 0x00: 0x93B */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
355 /*0xe9 : 0xda: 0x93c */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
356 /*0x00 : 0x00: 0x93d */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
357 /*0xda : 0xff: 0x93e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
358 /*0xdb : 0xff: 0x93f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
359 /*0xdc : 0xff: 0x940 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
360 /*0xdd : 0xff: 0x941 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
361 /*0xde : 0xff: 0x942 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
362 /*0xdf : 0xbe: 0x943 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
363 /*0x00 : 0x00: 0x944 */ DEV_MASK + ZERO + GJR_MASK + ZERO + BNG_MASK + KND_MASK + ZERO + ZERO ,
364 /*0xe3 : 0x80: 0x945 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
365 /*0xe0 : 0x87: 0x946 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
366 /*0xe1 : 0xff: 0x947 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
367 /*0xe2 : 0xff: 0x948 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
368 /*0xe7 : 0x80: 0x949 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
369 /*0xe4 : 0x87: 0x94a */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
370 /*0xe5 : 0xff: 0x94b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
371 /*0xe6 : 0xff: 0x94c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
372 /*0xe8 : 0xff: 0x94d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
373 /*0xec : 0x00: 0x94e */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
374 /*0xed : 0x00: 0x94f */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
375 /*0x00 : 0x00: 0x950 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
376 /*0x00 : 0x00: 0x951 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
377 /*0x00 : 0x00: 0x952 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
378 /*0x00 : 0x00: 0x953 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
379 /*0x00 : 0x00: 0x954 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
380 /*0x00 : 0x00: 0x955 */ ZERO + ZERO + ZERO + ZERO + ZERO + KND_MASK + ZERO + ZERO ,
381 /*0x00 : 0x00: 0x956 */ ZERO + ZERO + ZERO + ORI_MASK + ZERO + KND_MASK + ZERO + ZERO ,
382 /*0x00 : 0x00: 0x957 */ ZERO + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + MLM_MASK + ZERO ,
383 /*0x00 : 0x00: 0x958 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
384 /*0x00 : 0x00: 0x959 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
385 /*0x00 : 0x00: 0x95a */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
386 /*0x00 : 0x00: 0x95b */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
387 /*0x00 : 0x00: 0x95c */ DEV_MASK + PNJ_MASK + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO ,
388 /*0x00 : 0x00: 0x95d */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
389 /*0x00 : 0x00: 0x95e */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
390 /*0xce : 0x98: 0x95f */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
391 /*0x00 : 0x00: 0x960 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
392 /*0x00 : 0x00: 0x961 */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
393 /*0x00 : 0x00: 0x962 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO ,
394 /*0x00 : 0x00: 0x963 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO ,
395 /*0xea : 0xf8: 0x964 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
396 /*0xeaea : 0x00: 0x965*/ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
397 /*0xf1 : 0xff: 0x966 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
398 /*0xf2 : 0xff: 0x967 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
399 /*0xf3 : 0xff: 0x968 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
400 /*0xf4 : 0xff: 0x969 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
401 /*0xf5 : 0xff: 0x96a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
402 /*0xf6 : 0xff: 0x96b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
403 /*0xf7 : 0xff: 0x96c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
404 /*0xf8 : 0xff: 0x96d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
405 /*0xf9 : 0xff: 0x96e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
406 /*0xfa : 0xff: 0x96f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
407 /*0x00 : 0x80: 0x970 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
408 /*
409 * The length of the array is 128 to provide values for 0x900..0x97f.
410 * The last 15 entries for 0x971..0x97f of the validity table are all zero
411 * because no Indic script uses such Unicode code points.
412 */
413 /*0x00 : 0x00: 0x9yz */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO
414 };
415
/*
 * Unicode to ISCII mapping, indexed by the low bits of a code point in the
 * range 0x0900..0x097f (eight entries per row, first code point of the
 * row in the comment).
 *
 * Values up to 0xFF are a single ISCII byte. Larger values hold two bytes,
 * high byte first: 0xXXe9 is byte 0xXX followed by ISCII_NUKTA, 0xa4e0 is
 * 0xA4 followed by ISCII_VOWEL_SIGN_E, 0xeaea is a doubled ISCII_DANDA and
 * 0xf0XX starts with the EXT code. 0xffff marks a code point that has no
 * entry in this table.
 */
static const uint16_t fromUnicodeTable[128]={
    /* 0x0900 */ 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0xa4e0, 0x00a4, 0x00a5, 0x00a6,
    /* 0x0908 */ 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0xa6e9, 0x00ae, 0x00ab, 0x00ac,
    /* 0x0910 */ 0x00ad, 0x00b2, 0x00af, 0x00b0, 0x00b1, 0x00b3, 0x00b4, 0x00b5,
    /* 0x0918 */ 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd,
    /* 0x0920 */ 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5,
    /* 0x0928 */ 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd,
    /* 0x0930 */ 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6,
    /* 0x0938 */ 0x00d7, 0x00d8, 0xffff, 0xffff, 0x00e9, 0xeae9, 0x00da, 0x00db,
    /* 0x0940 */ 0x00dc, 0x00dd, 0x00de, 0x00df, 0xdfe9, 0x00e3, 0x00e0, 0x00e1,
    /* 0x0948 */ 0x00e2, 0x00e7, 0x00e4, 0x00e5, 0x00e6, 0x00e8, 0x00ec, 0x00ed,
    /* 0x0950 (first entry: OM Symbol) */
                 0xa1e9, 0xffff, 0xf0b8, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    /* 0x0958 */ 0xb3e9, 0xb4e9, 0xb5e9, 0xbae9, 0xbfe9, 0xc0e9, 0xc9e9, 0x00ce,
    /* 0x0960 */ 0xaae9, 0xa7e9, 0xdbe9, 0xdce9, 0x00ea, 0xeaea, 0x00f1, 0x00f2,
    /* 0x0968 */ 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa,
    /* 0x0970 */ 0xf0bf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    /* 0x0978 */ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff
};
/*
 * ISCII to Unicode mapping, indexed by the ISCII byte (eight entries per
 * row, first byte of the row in the comment).
 *
 * Bytes 0x00..0xa0 map to the code point with the same value. Bytes from
 * 0xa1 on map into the 0x09xx range, except 0xd9 which maps to 0x200d.
 * 0xffff marks a byte without an entry in this table (this includes the
 * ATR and EXT codes 0xef and 0xf0).
 */
static const uint16_t toUnicodeTable[256]={
    /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    /* 0x08 */ 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
    /* 0x18 */ 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
    /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    /* 0x28 */ 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
    /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    /* 0x38 */ 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
    /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    /* 0x48 */ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
    /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    /* 0x58 */ 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
    /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    /* 0x68 */ 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
    /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    /* 0x78 */ 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
    /* 0x80 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
    /* 0x88 */ 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
    /* 0x90 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
    /* 0x98 */ 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
    /* 0xa0 */ 0x00a0, 0x0901, 0x0902, 0x0903, 0x0905, 0x0906, 0x0907, 0x0908,
    /* 0xa8 */ 0x0909, 0x090a, 0x090b, 0x090e, 0x090f, 0x0910, 0x090d, 0x0912,
    /* 0xb0 */ 0x0913, 0x0914, 0x0911, 0x0915, 0x0916, 0x0917, 0x0918, 0x0919,
    /* 0xb8 */ 0x091a, 0x091b, 0x091c, 0x091d, 0x091e, 0x091f, 0x0920, 0x0921,
    /* 0xc0 */ 0x0922, 0x0923, 0x0924, 0x0925, 0x0926, 0x0927, 0x0928, 0x0929,
    /* 0xc8 */ 0x092a, 0x092b, 0x092c, 0x092d, 0x092e, 0x092f, 0x095f, 0x0930,
    /* 0xd0 */ 0x0931, 0x0932, 0x0933, 0x0934, 0x0935, 0x0936, 0x0937, 0x0938,
    /* 0xd8 */ 0x0939, 0x200d, 0x093e, 0x093f, 0x0940, 0x0941, 0x0942, 0x0943,
    /* 0xe0 */ 0x0946, 0x0947, 0x0948, 0x0945, 0x094a, 0x094b, 0x094c, 0x0949,
    /* 0xe8 */ 0x094d, 0x093c, 0x0964, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    /* 0xf0 */ 0xffff, 0x0966, 0x0967, 0x0968, 0x0969, 0x096a, 0x096b, 0x096c,
    /* 0xf8 */ 0x096d, 0x096e, 0x096f, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff
};
804
/*
 * Special cases for an ISCII byte combined with ISCII_VOWEL_SIGN_E.
 * Row 0 is a header whose first element is the number of rows (header
 * included); every other row is { ISCII byte, Unicode code point }.
 */
static const uint16_t vowelSignESpecialCases[][2]={
    /* header */ { 2 /* number of rows */, 0x0000 },
    /* 0xa4e0 */ { 0xa4, 0x0904 },
};
809
/*
 * Special cases for an ISCII byte combined with ISCII_NUKTA.
 * Row 0 is a header whose first element is the number of rows (header
 * included); every other row is { ISCII byte, Unicode code point }.
 */
static const uint16_t nuktaSpecialCases[][2]={
    /* header */ { 16 /* number of rows */, 0x0000 },
    { 0xa6, 0x090c },
    { 0xea, 0x093d },
    { 0xdf, 0x0944 },
    { 0xa1, 0x0950 },
    { 0xb3, 0x0958 },
    { 0xb4, 0x0959 },
    { 0xb5, 0x095a },
    { 0xba, 0x095b },
    { 0xbf, 0x095c },
    { 0xc0, 0x095d },
    { 0xc9, 0x095e },
    { 0xaa, 0x0960 },
    { 0xa7, 0x0961 },
    { 0xdb, 0x0962 },
    { 0xdc, 0x0963 },
};
828
829
/*
 * Writes the 1 to 3 bytes packed in targetByteUnit (most significant byte
 * first) to target, advancing target and, when offsets is non-NULL,
 * recording one source offset per byte written. Bytes that do not fit
 * before targetLimit are appended to args->converter->charErrorBuffer and
 * *err is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * Notes for callers:
 *  - The recorded offset is (source - args->source - 1); for a 3-byte
 *    unit it is decremented once more, but only when offsets is non-NULL.
 *  - "source" is a macro parameter, so the member name in "args->source"
 *    is replaced by the argument as well: pass a variable that is itself
 *    named source.
 *  - Arguments are evaluated several times; pass plain variables.
 *  - When there is no room at all, a middle byte of 0x00 in a 3-byte unit
 *    is not queued (the test is on the byte's bits, not on the length).
 *  - The expansion is a brace block, not a do/while(0) statement.
 */
#define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err){ \
    int32_t offset = (int32_t)(source - args->source-1); \
    if(!(target < targetLimit)){ \
        /* no room at all: queue every byte for the next call */ \
        if(targetByteUnit & 0xFF0000){ \
            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
                (uint8_t)(targetByteUnit >> 16); \
        } \
        if(targetByteUnit & 0xFF00){ \
            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
                (uint8_t)(targetByteUnit >> 8); \
        } \
        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
            (uint8_t)(targetByteUnit); \
        *err = U_BUFFER_OVERFLOW_ERROR; \
    }else if(targetByteUnit <= 0xFF){ \
        /* single byte, and there is room for it */ \
        *(target)++ = (uint8_t)(targetByteUnit); \
        if(offsets){ \
            *(offsets++) = offset; \
        } \
    }else{ \
        if(targetByteUnit > 0xFFFF){ \
            /* lead byte of a 3-byte unit */ \
            *(target)++ = (uint8_t)(targetByteUnit >> 16); \
            if(offsets){ \
                --offset; \
                *(offsets++) = offset; \
            } \
        } \
        if(target < targetLimit){ \
            *(target)++ = (uint8_t)(targetByteUnit >> 8); \
            if(offsets){ \
                *(offsets++) = offset; \
            } \
            if(target < targetLimit){ \
                *(target)++ = (uint8_t)(targetByteUnit); \
                if(offsets){ \
                    *(offsets++) = offset; \
                } \
            }else{ \
                /* only the last byte is left over */ \
                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
                    (uint8_t)(targetByteUnit); \
                *err = U_BUFFER_OVERFLOW_ERROR; \
            } \
        }else{ \
            /* the last two bytes are left over */ \
            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
                (uint8_t)(targetByteUnit >> 8); \
            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
                (uint8_t)(targetByteUnit); \
            *err = U_BUFFER_OVERFLOW_ERROR; \
        } \
    } \
}
884
/* Rules:
 * Explicit Halant :
 *     <HALANT> + <ZWNJ>
 * Soft Halant :
 *     <HALANT> + <ZWJ>
 */
891
/*
 * Converts UTF-16 text to ISCII bytes.
 *
 * Consumes UChars from args->source up to args->sourceLimit and appends ISCII
 * bytes to args->target (overflow handling is done by
 * WRITE_TO_TARGET_FROM_U).  On return args->source and args->target are
 * advanced; the local copy of args->offsets is advanced by the write macro
 * but is not stored back into args.
 *
 * State kept between calls:
 *  - converterData->currentDeltaFromUnicode / currentMaskFromUnicode: the
 *    Indic block (script) of the most recent Indic code point.
 *  - converterData->contextCharFromUnicode: set to ISCII_HALANT after a
 *    halant was written, or to PNJ_ADHAK after a Gurmukhi Adhak was seen.
 *  - args->converter->fromUnicodeStatus: last code unit <= ASCII_END that was
 *    written; an LF here causes ATR + language code to be re-emitted before
 *    the next character.
 *  - args->converter->fromUChar32: code point that triggered an error, or a
 *    lead surrogate waiting for its trail unit in the next buffer.
 *
 * Errors reported through *err:
 *  - U_ILLEGAL_ARGUMENT_ERROR  NULL converter or limit pointer before start.
 *  - U_BUFFER_OVERFLOW_ERROR   set by the write macro.
 *  - U_INVALID_CHAR_FOUND      code point has no ISCII mapping (this includes
 *                              every supplementary code point).
 *  - U_ILLEGAL_CHAR_FOUND      unpaired surrogate.
 *
 * @param args  conversion arguments (source/target windows, converter).
 * @param err   in/out ICU error code.
 */
static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
        UConverterFromUnicodeArgs * args, UErrorCode * err) {
    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UChar32 tempContextFromUnicode = 0x0000; /* For special handling of the Gurmukhi script. */
    UConverterDataISCII *converterData;
    uint16_t newDelta=0;
    uint16_t range = 0;
    UBool deltaChanged = FALSE;

    if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)) {
        *err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    /* initialize data */
    converterData=(UConverterDataISCII*)args->converter->extraInfo;
    newDelta=converterData->currentDeltaFromUnicode;
    range = (uint16_t)(newDelta/DELTA);

    /* A pending code point from the previous call: resume at the trail-surrogate lookup. */
    if ((sourceChar = args->converter->fromUChar32)!=0) {
        goto getTrail;
    }

    /* writing the char to the output stream */
    while (source < sourceLimit) {
        /* Write the language code following LF only if LF is not the last character. */
        if (args->converter->fromUnicodeStatus == LF) {
            targetByteUnit = ATR<<8;
            targetByteUnit += (uint8_t) lookupInitialData[range].isciiLang;
            args->converter->fromUnicodeStatus = 0x0000;
            /* now append ATR and language code */
            WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
            if (U_FAILURE(*err)) {
                break;
            }
        }

        sourceChar = *source++;
        /* Remember the context before this character modifies it (needed for Adhak below). */
        tempContextFromUnicode = converterData->contextCharFromUnicode;

        targetByteUnit = missingCharMarker;

        /* check if input is in ASCII and C0 control codes range */
        if (sourceChar <= ASCII_END) {
            args->converter->fromUnicodeStatus = sourceChar;
            WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,sourceChar,err);
            if (U_FAILURE(*err)) {
                break;
            }
            continue;
        }
        switch (sourceChar) {
        case ZWNJ:
            /* contextChar has HALANT */
            if (converterData->contextCharFromUnicode) {
                /* HALANT + ZWNJ (explicit halant) is written as a second HALANT byte */
                converterData->contextCharFromUnicode = 0x00;
                targetByteUnit = ISCII_HALANT;
            } else {
                /* consume ZWNJ and continue */
                converterData->contextCharFromUnicode = 0x00;
                continue;
            }
            break;
        case ZWJ:
            /* contextChar has HALANT */
            if (converterData->contextCharFromUnicode) {
                /* HALANT + ZWJ (soft halant) is written as HALANT followed by NUKTA */
                targetByteUnit = ISCII_NUKTA;
            } else {
                targetByteUnit =ISCII_INV;
            }
            converterData->contextCharFromUnicode = 0x00;
            break;
        default:
            /* is the sourceChar in the INDIC_RANGE? */
            if ((uint16_t)(INDIC_BLOCK_END-sourceChar) <= INDIC_RANGE) {
                /* Danda and Double Danda are valid in Northern scripts.. since Unicode
                 * does not include these codepoints in all Northern scripts we need to
                 * filter them out
                 */
                if (sourceChar!= DANDA && sourceChar != DOUBLE_DANDA) {
                    /* find out to which block the sourceChar belongs */
                    range =(uint16_t)((sourceChar-INDIC_BLOCK_BEGIN)/DELTA);
                    newDelta =(uint16_t)(range*DELTA);

                    /* Now are we in the same block as the previous? */
                    if (newDelta!= converterData->currentDeltaFromUnicode || converterData->isFirstBuffer) {
                        converterData->currentDeltaFromUnicode = newDelta;
                        converterData->currentMaskFromUnicode = lookupInitialData[range].maskEnum;
                        deltaChanged =TRUE;
                        converterData->isFirstBuffer=FALSE;
                    }

                    if (converterData->currentDeltaFromUnicode == PNJ_DELTA) {
                        if (sourceChar == PNJ_TIPPI) {
                            /* Make sure Tippi is converted to Bindi. */
                            sourceChar = PNJ_BINDI;
                        } else if (sourceChar == PNJ_ADHAK) {
                            /* This is for consonant cluster handling. */
                            converterData->contextCharFromUnicode = PNJ_ADHAK;
                        }

                    }
                    /* Normalize all Indic codepoints to Devanagari and map them to ISCII */
                    /* now subtract the new delta from sourceChar */
                    sourceChar -= converterData->currentDeltaFromUnicode;
                }

                /* get the target byte unit */
                targetByteUnit=fromUnicodeTable[(uint8_t)sourceChar];

                /* is the code point valid in current script? */
                if ((validityTable[(uint8_t)sourceChar] & converterData->currentMaskFromUnicode)==0) {
                    /* Vocallic RR is assigned in ISCII Telugu and Unicode */
                    if (converterData->currentDeltaFromUnicode!=(TELUGU_DELTA) || sourceChar!=VOCALLIC_RR) {
                        targetByteUnit=missingCharMarker;
                    }
                }

                if (deltaChanged) {
                    /* we are in a script block which is different than
                     * previous sourceChar's script block write ATR and language codes
                     */
                    uint32_t temp=0;
                    temp =(uint16_t)(ATR<<8);
                    temp += (uint16_t)((uint8_t) lookupInitialData[range].isciiLang);
                    /* reset */
                    deltaChanged=FALSE;
                    /* now append ATR and language code */
                    WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,temp,err);
                    if (U_FAILURE(*err)) {
                        break;
                    }
                }

                /* Adhak itself produces no output; it only sets the context (see above). */
                if (converterData->currentDeltaFromUnicode == PNJ_DELTA && (sourceChar + PNJ_DELTA) == PNJ_ADHAK) {
                    continue;
                }
            }
            /* reset context char */
            converterData->contextCharFromUnicode = 0x00;
            break;
        }
        if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && isPNJConsonant((sourceChar + PNJ_DELTA))) {
            /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */
            /* reset context char */
            converterData->contextCharFromUnicode = 0x0000;
            targetByteUnit = targetByteUnit << 16 | ISCII_HALANT << 8 | targetByteUnit;
            /* write targetByteUnit to target */
            WRITE_TO_TARGET_FROM_U(args, offsets, source, target, targetLimit, targetByteUnit,err);
            if (U_FAILURE(*err)) {
                break;
            }
        } else if (targetByteUnit != missingCharMarker) {
            if (targetByteUnit==ISCII_HALANT) {
                converterData->contextCharFromUnicode = (UChar)targetByteUnit;
            }
            /* write targetByteUnit to target */
            WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
            if (U_FAILURE(*err)) {
                break;
            }
        } else {
            /* oops.. the code point is unassigned */
            /* check if the char is a First surrogate */
            if (U16_IS_SURROGATE(sourceChar)) {
                if (U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                    /* look ahead to find the trail surrogate */
                    if (source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail= (*source);
                        if (U16_IS_TRAIL(trail)) {
                            source++;
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                            *err =U_INVALID_CHAR_FOUND;
                            /* convert this surrogate code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                        }
                    } else {
                        /* no more input */
                        *err = U_ZERO_ERROR;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                }
            } else {
                /* callback(unassigned) for a BMP code point */
                *err = U_INVALID_CHAR_FOUND;
            }

            /* keep the offending (or pending) code point for the caller / next call */
            args->converter->fromUChar32=sourceChar;
            break;
        }
    }/* end while(mySourceIndex<mySourceLength) */

    /* save the state and return */
    args->source = source;
    args->target = (char*)target;
}
1102
1103 static const uint16_t lookupTable[][2]={
1104 { ZERO, ZERO }, /*DEFALT*/
1105 { ZERO, ZERO }, /*ROMAN*/
1106 { DEVANAGARI, DEV_MASK },
1107 { BENGALI, BNG_MASK },
1108 { TAMIL, TML_MASK },
1109 { TELUGU, KND_MASK },
1110 { BENGALI, BNG_MASK },
1111 { ORIYA, ORI_MASK },
1112 { KANNADA, KND_MASK },
1113 { MALAYALAM, MLM_MASK },
1114 { GUJARATI, GJR_MASK },
1115 { GURMUKHI, PNJ_MASK }
1116 };
1117
/*
 * WRITE_TO_TARGET_TO_U - append one UChar to the output buffer
 * (ISCII -> Unicode direction).
 *
 * delta is first added to targetUniChar unless the value is <= ASCII_END or
 * is one of ZWJ, ZWNJ, DANDA, DOUBLE_DANDA.  The result is then written to
 * target; if target has reached args->targetLimit it is instead appended to
 * args->converter->UCharErrorBuffer and *err is set to
 * U_BUFFER_OVERFLOW_ERROR.  When offsets is non-NULL, `offset` is stored for
 * a character that was written to target.
 *
 * Macro hazards:
 *  - targetUniChar must be a modifiable lvalue: it is updated in place
 *    (+= delta) and evaluated several times.
 *  - target and offsets are advanced, so they must be modifiable lvalues.
 *  - The `source` parameter is not used by the expansion.
 *  - The expansion is a brace block, not do { } while (0); do not use it as
 *    the unbraced body of an if that has an else.
 */
#define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err){\
    /* add offset to current Indic Block */ \
    if(targetUniChar>ASCII_END && \
       targetUniChar != ZWJ && \
       targetUniChar != ZWNJ && \
       targetUniChar != DANDA && \
       targetUniChar != DOUBLE_DANDA){ \
        \
        targetUniChar+=(uint16_t)(delta); \
    } \
    /* now write the targetUniChar */ \
    if(target<args->targetLimit){ \
        *(target)++ = (UChar)targetUniChar; \
        if(offsets){ \
            *(offsets)++ = (int32_t)(offset); \
        } \
    }else{ \
        args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++] = \
            (UChar)targetUniChar; \
        *err = U_BUFFER_OVERFLOW_ERROR; \
    } \
}
1140
/*
 * GET_MAPPING - look up the Unicode mapping of one ISCII byte.
 *
 * Stores toUnicodeTable[sourceChar] in targetUniChar.  For bytes above
 * ASCII_END the result is then checked against validityTable (indexed with
 * the low 7 bits of the mapped value) and data->currentMaskToUnicode; if the
 * character is not valid in the current script, targetUniChar is replaced by
 * missingCharMarker.  The one exception is VOCALLIC_RR while the current
 * delta is TELUGU_DELTA, which is kept.
 *
 * Macro hazards: sourceChar and targetUniChar are evaluated more than once
 * and are not parenthesized; the expansion is a brace block, not
 * do { } while (0).
 */
#define GET_MAPPING(sourceChar,targetUniChar,data){ \
    targetUniChar = toUnicodeTable[(sourceChar)] ; \
    /* is the code point valid in current script? */ \
    if(sourceChar> ASCII_END && \
       (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){ \
        /* Vocallic RR is assigned in ISCII Telugu and Unicode */ \
        if(data->currentDeltaToUnicode!=(TELUGU_DELTA) || \
           targetUniChar!=VOCALLIC_RR){ \
            targetUniChar=missingCharMarker; \
        } \
    } \
}
1153
/***********
 * Rules for ISCII to Unicode converter
 * ISCII is a stateful encoding. To convert ISCII bytes to Unicode,
 * which has both precomposed and decomposed forms of characters,
 * pre-context and post-context need to be considered.
 *
 * Post context
 * i) ATR : Attribute code is used to declare the font and script switching.
 *    Currently we only switch scripts; font codes are consumed without generating an error
 * ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure,
 *     obsolete characters
 * Pre context
 * i) Halant: if preceded by a halant then it is an explicit halant
 * ii) Nukta :
 *     a) if preceded by a halant then it is a soft halant
 *     b) if preceded by specific consonants and the ligatures have pre-composed
 *        characters in Unicode then convert to pre-composed characters
 * iii) Danda: If Danda is preceded by a Danda then convert to Double Danda
 *
 */
1174
/*
 * Converts ISCII bytes to UTF-16.
 *
 * Consumes bytes from args->source up to args->sourceLimit and appends UChars
 * to args->target (and, through WRITE_TO_TARGET_TO_U, entries to
 * args->offsets).  On return args->source and args->target are advanced.
 *
 * Output is delayed by one character so that pairs of ISCII bytes can be
 * combined:
 *  - args->converter->toUnicodeStatus holds the mapped but not yet written
 *    code point (missingCharMarker when empty).
 *  - data->contextCharToUnicode holds the previous ISCII byte
 *    (NO_CHAR_MARKER when cleared).
 *  - data->prevToUnicodeStatus holds a Gurmukhi consonant whose write was
 *    deferred so that C + HALANT + C can be turned into ADHAK + C.
 * The pending character is flushed at the end of input when args->flush is
 * set.  If the input ends right after ATR, EXT or ISCII_INV, that byte is
 * stored in toUBytes[0] with toULength 1 (no error code is set here).
 *
 * Errors reported through *err:
 *  - U_ILLEGAL_ARGUMENT_ERROR  NULL converter or limit pointer before start.
 *  - U_BUFFER_OVERFLOW_ERROR   target is full.
 *  - U_ILLEGAL_CHAR_FOUND      illegal byte after ATR or EXT.
 *  - U_INVALID_CHAR_FOUND      byte has no mapping in the current script.
 * For the last two, the offending byte is stored in toUBytes[0] with
 * toULength 1.
 *
 * @param args  conversion arguments (source/target windows, converter).
 * @param err   in/out ICU error code.
 */
static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCode* err) {
    const char *source = args->source;
    UChar *target = args->target;
    const char *sourceLimit = args->sourceLimit;
    const UChar* targetLimit = args->targetLimit;
    uint32_t targetUniChar = 0x0000;
    uint8_t sourceChar = 0x0000;
    UConverterDataISCII* data;
    UChar32* toUnicodeStatus=NULL;
    UChar32 tempTargetUniChar = 0x0000;
    UChar* contextCharToUnicode= NULL;
    UBool found;
    int i;
    int offset = 0;

    /* Reject a NULL converter and inverted buffers (same checks as the fromUnicode function). */
    if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)) {
        *err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    data = (UConverterDataISCII*)(args->converter->extraInfo);
    contextCharToUnicode = &data->contextCharToUnicode; /* contains previous ISCII codepoint visited */
    toUnicodeStatus = (UChar32*)&args->converter->toUnicodeStatus;/* contains the mapping to Unicode of the above codepoint*/

    while (U_SUCCESS(*err) && source<sourceLimit) {

        targetUniChar = missingCharMarker;

        if (target < targetLimit) {
            sourceChar = (unsigned char)*(source)++;

            /* look at the post-context and perform special processing */
            if (*contextCharToUnicode==ATR) {

                /* If we have ATR in *contextCharToUnicode then we need to change our
                 * state to the Indic Script specified by sourceChar
                 */

                /* check if the sourceChar is supported script range*/
                if ((uint8_t)(PNJ-sourceChar)<=PNJ-DEV) {
                    data->currentDeltaToUnicode = (uint16_t)(lookupTable[sourceChar & 0x0F][0] * DELTA);
                    data->currentMaskToUnicode = (MaskEnum)lookupTable[sourceChar & 0x0F][1];
                } else if (sourceChar==DEF) {
                    /* switch back to default */
                    data->currentDeltaToUnicode = data->defDeltaToUnicode;
                    data->currentMaskToUnicode = data->defMaskToUnicode;
                } else {
                    if ((sourceChar >= 0x21 && sourceChar <= 0x3F)) {
                        /* these are display codes consume and continue */
                    } else {
                        *err =U_ILLEGAL_CHAR_FOUND;
                        /* reset */
                        *contextCharToUnicode=NO_CHAR_MARKER;
                        goto CALLBACK;
                    }
                }

                /* reset */
                *contextCharToUnicode=NO_CHAR_MARKER;

                continue;

            } else if (*contextCharToUnicode==EXT) {
                /* check if sourceChar is in 0xA1-0xEE range */
                if ((uint8_t) (EXT_RANGE_END - sourceChar) <= (EXT_RANGE_END - EXT_RANGE_BEGIN)) {
                    /* We currently support only Anudatta and Devanagari abbreviation sign */
                    if (sourceChar==0xBF || sourceChar == 0xB8) {
                        targetUniChar = (sourceChar==0xBF) ? DEV_ABBR_SIGN : DEV_ANUDATTA;

                        /* find out if the mapping is valid in this state */
                        if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
                            *contextCharToUnicode= NO_CHAR_MARKER;

                            /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
                            if (data->prevToUnicodeStatus) {
                                WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
                                data->prevToUnicodeStatus = 0x0000;
                            }
                            /* write to target */
                            WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);

                            continue;
                        }
                    }
                    /* byte unit is unassigned */
                    targetUniChar = missingCharMarker;
                    *err= U_INVALID_CHAR_FOUND;
                } else {
                    /* only 0xA1 - 0xEE are legal after EXT char */
                    *contextCharToUnicode= NO_CHAR_MARKER;
                    *err = U_ILLEGAL_CHAR_FOUND;
                }
                goto CALLBACK;
            } else if (*contextCharToUnicode==ISCII_INV) {
                if (sourceChar==ISCII_HALANT) {
                    targetUniChar = 0x0020; /* replace with space according to Indic FAQ */
                } else {
                    targetUniChar = ZWJ;
                }

                /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
                if (data->prevToUnicodeStatus) {
                    WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
                    data->prevToUnicodeStatus = 0x0000;
                }
                /* write to target */
                WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);
                /* reset */
                *contextCharToUnicode=NO_CHAR_MARKER;
            }

            /* look at the pre-context and perform special processing */
            switch (sourceChar) {
            case ISCII_INV:
            case EXT:
            case ATR:
                /* These bytes only set the context; first flush the pending character. */
                *contextCharToUnicode = (UChar)sourceChar;

                if (*toUnicodeStatus != missingCharMarker) {
                    /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
                    if (data->prevToUnicodeStatus) {
                        WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
                        data->prevToUnicodeStatus = 0x0000;
                    }
                    WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err);
                    *toUnicodeStatus = missingCharMarker;
                }
                continue;
            case ISCII_DANDA:
                /* handle double danda*/
                if (*contextCharToUnicode== ISCII_DANDA) {
                    targetUniChar = DOUBLE_DANDA;
                    /* clear the context */
                    *contextCharToUnicode = NO_CHAR_MARKER;
                    *toUnicodeStatus = missingCharMarker;
                } else {
                    GET_MAPPING(sourceChar,targetUniChar,data);
                    *contextCharToUnicode = sourceChar;
                }
                break;
            case ISCII_HALANT:
                /* handle explicit halant */
                if (*contextCharToUnicode == ISCII_HALANT) {
                    targetUniChar = ZWNJ;
                    /* clear the context */
                    *contextCharToUnicode = NO_CHAR_MARKER;
                } else {
                    GET_MAPPING(sourceChar,targetUniChar,data);
                    *contextCharToUnicode = sourceChar;
                }
                break;
            case 0x0A:
            case 0x0D:
                /* a line end switches back to the default script once this character is stored */
                data->resetToDefaultToUnicode = TRUE;
                GET_MAPPING(sourceChar,targetUniChar,data);
                *contextCharToUnicode = sourceChar;
                break;

            case ISCII_VOWEL_SIGN_E:
                i=1;
                found=FALSE;
                for (; i<vowelSignESpecialCases[0][0]; i++) {
                    U_ASSERT(i<UPRV_LENGTHOF(vowelSignESpecialCases));
                    if (vowelSignESpecialCases[i][0]==(uint8_t)*contextCharToUnicode) {
                        targetUniChar=vowelSignESpecialCases[i][1];
                        found=TRUE;
                        break;
                    }
                }
                if (found) {
                    /* find out if the mapping is valid in this state */
                    if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
                        /*targetUniChar += data->currentDeltaToUnicode ;*/
                        *contextCharToUnicode= NO_CHAR_MARKER;
                        *toUnicodeStatus = missingCharMarker;
                        break;
                    }
                }
                GET_MAPPING(sourceChar,targetUniChar,data);
                *contextCharToUnicode = sourceChar;
                break;

            case ISCII_NUKTA:
                /* handle soft halant */
                if (*contextCharToUnicode == ISCII_HALANT) {
                    targetUniChar = ZWJ;
                    /* clear the context */
                    *contextCharToUnicode = NO_CHAR_MARKER;
                    break;
                } else if (data->currentDeltaToUnicode == PNJ_DELTA && data->contextCharToUnicode == 0xc0) {
                    /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
                    if (data->prevToUnicodeStatus) {
                        WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
                        data->prevToUnicodeStatus = 0x0000;
                    }
                    /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi.
                     * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39).
                     */
                    targetUniChar = PNJ_RRA;
                    WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
                    if (U_SUCCESS(*err)) {
                        targetUniChar = PNJ_SIGN_VIRAMA;
                        WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
                        if (U_SUCCESS(*err)) {
                            targetUniChar = PNJ_HA;
                            WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
                        } else {
                            /* the write above already overflowed: queue the rest behind it */
                            args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA;
                        }
                    } else {
                        /* the write above already overflowed: queue the rest behind it */
                        args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_SIGN_VIRAMA;
                        args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA;
                    }
                    *toUnicodeStatus = missingCharMarker;
                    data->contextCharToUnicode = NO_CHAR_MARKER;
                    continue;
                } else {
                    /* try to handle <CHAR> + ISCII_NUKTA special mappings */
                    i=1;
                    found =FALSE;
                    for (; i<nuktaSpecialCases[0][0]; i++) {
                        U_ASSERT(i<UPRV_LENGTHOF(nuktaSpecialCases));
                        if (nuktaSpecialCases[i][0]==(uint8_t)
                                *contextCharToUnicode) {
                            targetUniChar=nuktaSpecialCases[i][1];
                            found =TRUE;
                            break;
                        }
                    }
                    if (found) {
                        /* find out if the mapping is valid in this state */
                        if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
                            /*targetUniChar += data->currentDeltaToUnicode ;*/
                            *contextCharToUnicode= NO_CHAR_MARKER;
                            *toUnicodeStatus = missingCharMarker;
                            if (data->currentDeltaToUnicode == PNJ_DELTA) {
                                /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
                                if (data->prevToUnicodeStatus) {
                                    WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
                                    data->prevToUnicodeStatus = 0x0000;
                                }
                                WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);
                                continue;
                            }
                            break;
                        }
                        /* else fall through to default */
                    }
                    /* else fall through to default */
                    U_FALLTHROUGH;
                }
            default:
                GET_MAPPING(sourceChar,targetUniChar,data);
                *contextCharToUnicode = sourceChar;
                break;
            }

            if (*toUnicodeStatus != missingCharMarker) {
                /* Check to make sure that consonant clusters are handled correct for Gurmukhi script. */
                if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && isPNJConsonant(data->prevToUnicodeStatus) &&
                        (*toUnicodeStatus + PNJ_DELTA) == PNJ_SIGN_VIRAMA && (targetUniChar + PNJ_DELTA) == data->prevToUnicodeStatus) {
                    /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */
                    offset = (int)(source-args->source - 3);
                    tempTargetUniChar = PNJ_ADHAK; /* This is necessary to avoid some compiler warnings. */
                    WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,tempTargetUniChar,0,err);
                    WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,data->prevToUnicodeStatus,0,err);
                    data->prevToUnicodeStatus = 0x0000; /* reset the previous unicode code point */
                    *toUnicodeStatus = missingCharMarker;
                    continue;
                } else {
                    /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
                    if (data->prevToUnicodeStatus) {
                        WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
                        data->prevToUnicodeStatus = 0x0000;
                    }
                    /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script.
                     * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi.
                     */
                    if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && isPNJBindiTippi((*toUnicodeStatus + PNJ_DELTA))) {
                        targetUniChar = PNJ_TIPPI - PNJ_DELTA;
                        WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,PNJ_DELTA,err);
                    } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && isPNJConsonant((*toUnicodeStatus + PNJ_DELTA))) {
                        /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */
                        data->prevToUnicodeStatus = *toUnicodeStatus + PNJ_DELTA;
                    } else {
                        /* write the previously mapped codepoint */
                        WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err);
                    }
                }
                *toUnicodeStatus = missingCharMarker;
            }

            if (targetUniChar != missingCharMarker) {
                /* now save the targetUniChar for delayed write */
                *toUnicodeStatus = (UChar) targetUniChar;
                if (data->resetToDefaultToUnicode==TRUE) {
                    data->currentDeltaToUnicode = data->defDeltaToUnicode;
                    data->currentMaskToUnicode = data->defMaskToUnicode;
                    data->resetToDefaultToUnicode=FALSE;
                }
            } else {

                /* we reach here only if targetUniChar == missingCharMarker
                 * so assign codes to reason and err
                 */
                *err = U_INVALID_CHAR_FOUND;
CALLBACK:
                args->converter->toUBytes[0] = (uint8_t) sourceChar;
                args->converter->toULength = 1;
                break;
            }

        } else {
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    if (U_SUCCESS(*err) && args->flush && source == sourceLimit) {
        /* end of the input stream */
        UConverter *cnv = args->converter;

        if (*contextCharToUnicode==ATR || *contextCharToUnicode==EXT || *contextCharToUnicode==ISCII_INV) {
            /* set toUBytes[] */
            cnv->toUBytes[0] = (uint8_t)*contextCharToUnicode;
            cnv->toULength = 1;

            /* avoid looping on truncated sequences */
            *contextCharToUnicode = NO_CHAR_MARKER;
        } else {
            cnv->toULength = 0;
        }

        if (*toUnicodeStatus != missingCharMarker) {
            /* output a remaining target character */
            WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source - args->source -1),*toUnicodeStatus,data->currentDeltaToUnicode,err);
            *toUnicodeStatus = missingCharMarker;
        }
    }

    args->target = target;
    args->source = source;
}
1518
1519 /* structure for SafeClone calculations */
1520 struct cloneISCIIStruct {
1521 UConverter cnv;
1522 UConverterDataISCII mydata;
1523 };
1524
1525 static UConverter *
_ISCII_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)1526 _ISCII_SafeClone(const UConverter *cnv,
1527 void *stackBuffer,
1528 int32_t *pBufferSize,
1529 UErrorCode *status)
1530 {
1531 struct cloneISCIIStruct * localClone;
1532 int32_t bufferSizeNeeded = sizeof(struct cloneISCIIStruct);
1533
1534 if (U_FAILURE(*status)) {
1535 return 0;
1536 }
1537
1538 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
1539 *pBufferSize = bufferSizeNeeded;
1540 return 0;
1541 }
1542
1543 localClone = (struct cloneISCIIStruct *)stackBuffer;
1544 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1545
1546 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISCII));
1547 localClone->cnv.extraInfo = &localClone->mydata;
1548 localClone->cnv.isExtraLocal = TRUE;
1549
1550 return &localClone->cnv;
1551 }
1552
1553 static void
_ISCIIGetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)1554 _ISCIIGetUnicodeSet(const UConverter *cnv,
1555 const USetAdder *sa,
1556 UConverterUnicodeSet which,
1557 UErrorCode *pErrorCode)
1558 {
1559 int32_t idx, script;
1560 uint8_t mask;
1561
1562 /* Since all ISCII versions allow switching to other ISCII
1563 scripts, we add all roundtrippable characters to this set. */
1564 sa->addRange(sa->set, 0, ASCII_END);
1565 for (script = DEVANAGARI; script <= MALAYALAM; script++) {
1566 mask = (uint8_t)(lookupInitialData[script].maskEnum);
1567 for (idx = 0; idx < DELTA; idx++) {
1568 /* added check for TELUGU character */
1569 if ((validityTable[idx] & mask) || (script==TELUGU && idx==0x31)) {
1570 sa->add(sa->set, idx + (script * DELTA) + INDIC_BLOCK_BEGIN);
1571 }
1572 }
1573 }
1574 sa->add(sa->set, DANDA);
1575 sa->add(sa->set, DOUBLE_DANDA);
1576 sa->add(sa->set, ZWNJ);
1577 sa->add(sa->set, ZWJ);
1578 }
1579
1580 static const UConverterImpl _ISCIIImpl={
1581
1582 UCNV_ISCII,
1583
1584 NULL,
1585 NULL,
1586
1587 _ISCIIOpen,
1588 _ISCIIClose,
1589 _ISCIIReset,
1590
1591 UConverter_toUnicode_ISCII_OFFSETS_LOGIC,
1592 UConverter_toUnicode_ISCII_OFFSETS_LOGIC,
1593 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC,
1594 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC,
1595 NULL,
1596
1597 NULL,
1598 _ISCIIgetName,
1599 NULL,
1600 _ISCII_SafeClone,
1601 _ISCIIGetUnicodeSet
1602 };
1603
1604 static const UConverterStaticData _ISCIIStaticData={
1605 sizeof(UConverterStaticData),
1606 "ISCII",
1607 0,
1608 UCNV_IBM,
1609 UCNV_ISCII,
1610 1,
1611 4,
1612 { 0x1a, 0, 0, 0 },
1613 0x1,
1614 FALSE,
1615 FALSE,
1616 0x0,
1617 0x0,
1618 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
1619
1620 };
1621
1622 const UConverterSharedData _ISCIIData=
1623 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISCIIStaticData, &_ISCIIImpl);
1624
1625 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
1626