1 /*
2 * Copyright 2001-2004 Unicode, Inc.
3 *
4 * Disclaimer
5 *
6 * This source code is provided as is by Unicode, Inc. No claims are
7 * made as to fitness for any particular purpose. No warranties of any
8 * kind are expressed or implied. The recipient agrees to determine
9 * applicability of information provided. If this file has been
10 * purchased on magnetic or optical media from Unicode, Inc., the
11 * sole remedy for any claim will be exchange of defective media
12 * within 90 days of receipt.
13 *
14 * Limitations on Rights to Redistribute This Code
15 *
16 * Unicode, Inc. hereby grants the right to freely use the information
17 * supplied in this file in the creation of products supporting the
18 * Unicode Standard, and to make copies of this file in any form
19 * for internal or external distribution as long as this notice
20 * remains attached.
21 */
22
23 /* ---------------------------------------------------------------------
24
25 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26 Author: Mark E. Davis, 1994.
27 Rev History: Rick McGowan, fixes & updates May 2001.
28 Sept 2001: fixed const & error conditions per
29 mods suggested by S. Parent & A. Lillich.
30 June 2002: Tim Dodd added detection and handling of incomplete
31 source sequences, enhanced error detection, added casts
32 to eliminate compiler warnings.
33 July 2003: slight mods to back out aggressive FFFE detection.
34 Jan 2004: updated switches in from-UTF8 conversions.
35 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37 See the header file "ConvertUTF.h" for complete documentation.
38
39 ------------------------------------------------------------------------ */
40
41
42 #include "antlr3convertutf.h"
43
44 #ifdef CVTUTF_DEBUG
45 #include <stdio.h>
46 #endif
47
48
49
50 /* --------------------------------------------------------------------- */
51
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)52 ConversionResult ConvertUTF32toUTF16 (
53 const UTF32** sourceStart, const UTF32* sourceEnd,
54 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
55 ConversionResult result = conversionOK;
56 const UTF32* source = *sourceStart;
57 UTF16* target = *targetStart;
58 while (source < sourceEnd) {
59 UTF32 ch;
60 if (target >= targetEnd) {
61 result = targetExhausted; break;
62 }
63 ch = *source++;
64 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
65 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
66 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
67 if (flags == strictConversion) {
68 --source; /* return to the illegal value itself */
69 result = sourceIllegal;
70 break;
71 } else {
72 *target++ = UNI_REPLACEMENT_CHAR;
73 }
74 } else {
75 *target++ = (UTF16)ch; /* normal case */
76 }
77 } else if (ch > UNI_MAX_LEGAL_UTF32) {
78 if (flags == strictConversion) {
79 result = sourceIllegal;
80 } else {
81 *target++ = UNI_REPLACEMENT_CHAR;
82 }
83 } else {
84 /* target is a character in range 0xFFFF - 0x10FFFF. */
85 if (target + 1 >= targetEnd) {
86 --source; /* Back up source pointer! */
87 result = targetExhausted; break;
88 }
89 ch -= halfBase;
90 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
91 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
92 }
93 }
94 *sourceStart = source;
95 *targetStart = target;
96 return result;
97 }
98
99 /* --------------------------------------------------------------------- */
100
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)101 ConversionResult ConvertUTF16toUTF32 (
102 const UTF16** sourceStart, const UTF16* sourceEnd,
103 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
104 ConversionResult result = conversionOK;
105 const UTF16* source = *sourceStart;
106 UTF32* target = *targetStart;
107 UTF32 ch, ch2;
108 while (source < sourceEnd) {
109 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
110 ch = *source++;
111 /* If we have a surrogate pair, convert to UTF32 first. */
112 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
113 /* If the 16 bits following the high surrogate are in the source buffer... */
114 if (source < sourceEnd) {
115 ch2 = *source;
116 /* If it's a low surrogate, convert to UTF32. */
117 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
118 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
119 + (ch2 - UNI_SUR_LOW_START) + halfBase;
120 ++source;
121 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
122 --source; /* return to the illegal value itself */
123 result = sourceIllegal;
124 break;
125 }
126 } else { /* We don't have the 16 bits following the high surrogate. */
127 --source; /* return to the high surrogate */
128 result = sourceExhausted;
129 break;
130 }
131 } else if (flags == strictConversion) {
132 /* UTF-16 surrogate values are illegal in UTF-32 */
133 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
134 --source; /* return to the illegal value itself */
135 result = sourceIllegal;
136 break;
137 }
138 }
139 if (target >= targetEnd) {
140 source = oldSource; /* Back up source pointer! */
141 result = targetExhausted; break;
142 }
143 *target++ = ch;
144 }
145 *sourceStart = source;
146 *targetStart = target;
147 #ifdef CVTUTF_DEBUG
148 if (result == sourceIllegal) {
149 ANTLR3_FPRINTF(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
150 fflush(stderr);
151 }
152 #endif
153 return result;
154 }
155
156 /* --------------------------------------------------------------------- */
157
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
 * the number of trailing bytes that are supposed to follow that byte.
 * Entries of 4 and 5 (lead bytes 0xF8..0xFF) describe the 5- and 6-byte
 * forms permitted by earlier algorithms. *Legal* UTF-8 never has that many
 * trailing bytes; the entries are kept for anyone who wants to handle the
 * old forms.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 - 0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 - 0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0 - 0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
175
176 /*
177 * Magic values subtracted from a buffer value during UTF8 conversion.
178 * This table contains as many values as there might be trailing bytes
179 * in a UTF-8 sequence.
180 */
181 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
182 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
183
184 /*
185 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
186 * into the first byte, depending on how many bytes follow. There are
187 * as many entries in this table as there are UTF-8 sequence types.
188 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
189 * for *legal* UTF-8 will be 4 or fewer bytes total.
190 */
191 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
192
193 /* --------------------------------------------------------------------- */
194
195 /* The interface converts a whole buffer to avoid function-call overhead.
196 * Constants have been gathered. Loops & conditionals have been removed as
197 * much as possible for efficiency, in favor of drop-through switches.
198 * (See "Note A" at the bottom of the file for equivalent code.)
199 * If your compiler supports it, the "isLegalUTF8" call can be turned
200 * into an inline function.
201 */
202
203 /* --------------------------------------------------------------------- */
204
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)205 ConversionResult ConvertUTF16toUTF8 (
206 const UTF16** sourceStart, const UTF16* sourceEnd,
207 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
208 ConversionResult result = conversionOK;
209 const UTF16* source = *sourceStart;
210 UTF8* target = *targetStart;
211 while (source < sourceEnd) {
212 UTF32 ch;
213 unsigned short bytesToWrite = 0;
214 const UTF32 byteMask = 0xBF;
215 const UTF32 byteMark = 0x80;
216 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
217 ch = *source++;
218 /* If we have a surrogate pair, convert to UTF32 first. */
219 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
220 /* If the 16 bits following the high surrogate are in the source buffer... */
221 if (source < sourceEnd) {
222 UTF32 ch2 = *source;
223 /* If it's a low surrogate, convert to UTF32. */
224 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
225 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
226 + (ch2 - UNI_SUR_LOW_START) + halfBase;
227 ++source;
228 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
229 --source; /* return to the illegal value itself */
230 result = sourceIllegal;
231 break;
232 }
233 } else { /* We don't have the 16 bits following the high surrogate. */
234 --source; /* return to the high surrogate */
235 result = sourceExhausted;
236 break;
237 }
238 } else if (flags == strictConversion) {
239 /* UTF-16 surrogate values are illegal in UTF-32 */
240 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
241 --source; /* return to the illegal value itself */
242 result = sourceIllegal;
243 break;
244 }
245 }
246 /* Figure out how many bytes the result will require */
247 if (ch < (UTF32)0x80) { bytesToWrite = 1;
248 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
249 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
250 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
251 } else { bytesToWrite = 3;
252 ch = UNI_REPLACEMENT_CHAR;
253 }
254
255 target += bytesToWrite;
256 if (target > targetEnd) {
257 source = oldSource; /* Back up source pointer! */
258 target -= bytesToWrite; result = targetExhausted; break;
259 }
260 switch (bytesToWrite) { /* note: everything falls through. */
261 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
262 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
263 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
264 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
265 }
266 target += bytesToWrite;
267 }
268 *sourceStart = source;
269 *targetStart = target;
270 return result;
271 }
272
273 /* --------------------------------------------------------------------- */
274
275 /*
276 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
277 * This must be called with the length pre-determined by the first byte.
278 * If not calling this from ConvertUTF8to*, then the length can be set by:
279 * length = trailingBytesForUTF8[*source]+1;
280 * and the sequence is illegal right away if there aren't that many bytes
281 * available.
282 * If presented with a length > 4, this returns false. The Unicode
283 * definition of UTF-8 goes up to 4-byte sequences.
284 */
285
286 static ANTLR3_BOOLEAN
isLegalUTF8(const UTF8 * source,int length)287 isLegalUTF8(const UTF8 *source, int length) {
288 UTF8 a;
289 const UTF8 *srcptr = source+length;
290 switch (length) {
291 default: return false;
292 /* Everything else falls through when "true"... */
293 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
294 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
295 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
296
297 switch (*source) {
298 /* no fall-through in this inner switch */
299 case 0xE0: if (a < 0xA0) return false; break;
300 case 0xED: if (a > 0x9F) return false; break;
301 case 0xF0: if (a < 0x90) return false; break;
302 case 0xF4: if (a > 0x8F) return false; break;
303 default: if (a < 0x80) return false;
304 }
305
306 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
307 }
308 if (*source > 0xF4) return false;
309 return true;
310 }
311
312 /* --------------------------------------------------------------------- */
313
314 /*
315 * Exported function to return whether a UTF-8 sequence is legal or not.
316 * This is not used here; it's just exported.
317 */
318 ANTLR3_BOOLEAN
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)319 isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
320 int length = trailingBytesForUTF8[*source]+1;
321 if (source+length > sourceEnd) {
322 return false;
323 }
324 return isLegalUTF8(source, length);
325 }
326
327 /* --------------------------------------------------------------------- */
328
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)329 ConversionResult ConvertUTF8toUTF16 (
330 const UTF8** sourceStart, const UTF8* sourceEnd,
331 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
332 ConversionResult result = conversionOK;
333 const UTF8* source = *sourceStart;
334 UTF16* target = *targetStart;
335 while (source < sourceEnd) {
336 UTF32 ch = 0;
337 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
338 if (source + extraBytesToRead >= sourceEnd) {
339 result = sourceExhausted; break;
340 }
341 /* Do this check whether lenient or strict */
342 if (! isLegalUTF8(source, extraBytesToRead+1)) {
343 result = sourceIllegal;
344 break;
345 }
346 /*
347 * The cases all fall through. See "Note A" below.
348 */
349 switch (extraBytesToRead) {
350 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
351 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
352 case 3: ch += *source++; ch <<= 6;
353 case 2: ch += *source++; ch <<= 6;
354 case 1: ch += *source++; ch <<= 6;
355 case 0: ch += *source++;
356 }
357 ch -= offsetsFromUTF8[extraBytesToRead];
358
359 if (target >= targetEnd) {
360 source -= (extraBytesToRead+1); /* Back up source pointer! */
361 result = targetExhausted; break;
362 }
363 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
364 /* UTF-16 surrogate values are illegal in UTF-32 */
365 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
366 if (flags == strictConversion) {
367 source -= (extraBytesToRead+1); /* return to the illegal value itself */
368 result = sourceIllegal;
369 break;
370 } else {
371 *target++ = UNI_REPLACEMENT_CHAR;
372 }
373 } else {
374 *target++ = (UTF16)ch; /* normal case */
375 }
376 } else if (ch > UNI_MAX_UTF16) {
377 if (flags == strictConversion) {
378 result = sourceIllegal;
379 source -= (extraBytesToRead+1); /* return to the start */
380 break; /* Bail out; shouldn't continue */
381 } else {
382 *target++ = UNI_REPLACEMENT_CHAR;
383 }
384 } else {
385 /* target is a character in range 0xFFFF - 0x10FFFF. */
386 if (target + 1 >= targetEnd) {
387 source -= (extraBytesToRead+1); /* Back up source pointer! */
388 result = targetExhausted; break;
389 }
390 ch -= halfBase;
391 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
392 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
393 }
394 }
395 *sourceStart = source;
396 *targetStart = target;
397 return result;
398 }
399
400 /* --------------------------------------------------------------------- */
401
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)402 ConversionResult ConvertUTF32toUTF8 (
403 const UTF32** sourceStart, const UTF32* sourceEnd,
404 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
405 ConversionResult result = conversionOK;
406 const UTF32* source = *sourceStart;
407 UTF8* target = *targetStart;
408 while (source < sourceEnd) {
409 UTF32 ch;
410 unsigned short bytesToWrite = 0;
411 const UTF32 byteMask = 0xBF;
412 const UTF32 byteMark = 0x80;
413 ch = *source++;
414 if (flags == strictConversion ) {
415 /* UTF-16 surrogate values are illegal in UTF-32 */
416 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
417 --source; /* return to the illegal value itself */
418 result = sourceIllegal;
419 break;
420 }
421 }
422 /*
423 * Figure out how many bytes the result will require. Turn any
424 * illegally large UTF32 things (> Plane 17) into replacement chars.
425 */
426 if (ch < (UTF32)0x80) { bytesToWrite = 1;
427 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
428 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
429 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
430 } else { bytesToWrite = 3;
431 ch = UNI_REPLACEMENT_CHAR;
432 result = sourceIllegal;
433 }
434
435 target += bytesToWrite;
436 if (target > targetEnd) {
437 --source; /* Back up source pointer! */
438 target -= bytesToWrite; result = targetExhausted; break;
439 }
440 switch (bytesToWrite) { /* note: everything falls through. */
441 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
442 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
443 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
444 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
445 }
446 target += bytesToWrite;
447 }
448 *sourceStart = source;
449 *targetStart = target;
450 return result;
451 }
452
453 /* --------------------------------------------------------------------- */
454
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)455 ConversionResult ConvertUTF8toUTF32 (
456 const UTF8** sourceStart, const UTF8* sourceEnd,
457 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
458 ConversionResult result = conversionOK;
459 const UTF8* source = *sourceStart;
460 UTF32* target = *targetStart;
461 while (source < sourceEnd) {
462 UTF32 ch = 0;
463 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
464 if (source + extraBytesToRead >= sourceEnd) {
465 result = sourceExhausted; break;
466 }
467 /* Do this check whether lenient or strict */
468 if (! isLegalUTF8(source, extraBytesToRead+1)) {
469 result = sourceIllegal;
470 break;
471 }
472 /*
473 * The cases all fall through. See "Note A" below.
474 */
475 switch (extraBytesToRead) {
476 case 5: ch += *source++; ch <<= 6;
477 case 4: ch += *source++; ch <<= 6;
478 case 3: ch += *source++; ch <<= 6;
479 case 2: ch += *source++; ch <<= 6;
480 case 1: ch += *source++; ch <<= 6;
481 case 0: ch += *source++;
482 }
483 ch -= offsetsFromUTF8[extraBytesToRead];
484
485 if (target >= targetEnd) {
486 source -= (extraBytesToRead+1); /* Back up the source pointer! */
487 result = targetExhausted; break;
488 }
489 if (ch <= UNI_MAX_LEGAL_UTF32) {
490 /*
491 * UTF-16 surrogate values are illegal in UTF-32, and anything
492 * over Plane 17 (> 0x10FFFF) is illegal.
493 */
494 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
495 if (flags == strictConversion) {
496 source -= (extraBytesToRead+1); /* return to the illegal value itself */
497 result = sourceIllegal;
498 break;
499 } else {
500 *target++ = UNI_REPLACEMENT_CHAR;
501 }
502 } else {
503 *target++ = ch;
504 }
505 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
506 result = sourceIllegal;
507 *target++ = UNI_REPLACEMENT_CHAR;
508 }
509 }
510 *sourceStart = source;
511 *targetStart = target;
512 return result;
513 }
514
515 /* ---------------------------------------------------------------------
516
517 Note A.
518 The fall-through switches in UTF-8 reading code save a
519 temp variable, some decrements & conditionals. The switches
520 are equivalent to the following loop:
521 {
522 int tmpBytesToRead = extraBytesToRead+1;
523 do {
524 ch += *source++;
525 --tmpBytesToRead;
526 if (tmpBytesToRead) ch <<= 6;
527 } while (tmpBytesToRead > 0);
528 }
529 In UTF-8 writing code, the switches on "bytesToWrite" are
530 similarly unrolled loops.
531
532 --------------------------------------------------------------------- */
533