1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4 *******************************************************************************
5 * Copyright (C) 2006-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9 
10 package com.ibm.icu.charset;
11 
12 import java.nio.ByteBuffer;
13 import java.nio.CharBuffer;
14 import java.nio.IntBuffer;
15 import java.nio.charset.CoderResult;
16 
17 /**
18  * <h2> Callback API for CharsetICU API </h2>
19  *
20  *  CharsetCallback class defines some error behaviour functions called
21  *  by CharsetDecoderICU and CharsetEncoderICU. The class also provides
22  *  the facility by which clients can write their own callbacks.
23  *
 *  These functions, although public, should NEVER be called directly.
 *  They should be used as parameters to the onUnmappableCharacter() and
 *  onMalformedInput() methods, to set the behaviour of a converter
 *  when it encounters UNMAPPED/INVALID sequences.
28  *  Currently the only way to set callbacks is by using CodingErrorAction.
29  *  In the future we will provide set methods on CharsetEncoder and CharsetDecoder
30  *  that will accept CharsetCallback fields.
31  *
32  * @stable ICU 3.6
33  */
34 
35 public class CharsetCallback {
    /*
     * FROM_U, TO_U context option for the skip and substitute callbacks.
     * When this option is passed as the context, those callbacks act only on
     * unmappable results and return every other result unchanged.
     */
    private static final String SUB_STOP_ON_ILLEGAL = "i";

//    /*
//     * FROM_U, TO_U context options for skip callback
//     */
//    private static final String SKIP_STOP_ON_ILLEGAL = "i";

//    /*
//     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
//     */
//    private static final String ESCAPE_ICU  = null;

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape each code unit according to JAVA (\\uXXXX)
     */
    private static final String ESCAPE_JAVA     =  "J";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
     * TO_U_CALLBACK_ESCAPE context option to escape each byte value according to C (\\xXX)
     */
    private static final String ESCAPE_C        = "C";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape (&#DDDD;)
     * TO_U_CALLBACK_ESCAPE context option to escape each byte value according to XML Decimal escape (&#DDDD;)
     */
    private static final String ESCAPE_XML_DEC  = "D";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape (&#xXXXX;)
     * TO_U_CALLBACK_ESCAPE context option to escape each byte value according to XML Hex escape (&#xXXXX;)
     */
    private static final String ESCAPE_XML_HEX  = "X";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode ({U+XXXX})
     */
    private static final String ESCAPE_UNICODE  = "U";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code point according to CSS2
     * (a backslash, the hexadecimal code point value, then a terminating space)
     */
    private static final String ESCAPE_CSS2  = "S";
83 
84     /*
85      * IS_DEFAULT_IGNORABLE_CODE_POINT
86      * This is to check if a code point has the default ignorable unicode property.
87      * As such, this list needs to be updated if the ignorable code point list ever
88      * changes.
89      * To avoid dependency on other code, this list is hard coded here.
90      * When an ignorable code point is found and is unmappable, the default callbacks
91      * will ignore them.
92      * For a list of the default ignorable code points, use this link:
93      * https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
94      *
95      * This list should be sync with the one in ucnv_err.cpp.
96      */
IS_DEFAULT_IGNORABLE_CODE_POINT(int c)97     private static boolean IS_DEFAULT_IGNORABLE_CODE_POINT(int c) {
98         return
99             (c == 0x00AD) ||
100             (c == 0x034F) ||
101             (c == 0x061C) ||
102             (c == 0x115F) ||
103             (c == 0x1160) ||
104             (0x17B4 <= c && c <= 0x17B5) ||
105             (0x180B <= c && c <= 0x180E) ||
106             (0x200B <= c && c <= 0x200F) ||
107             (0x202A <= c && c <= 0x202E) ||
108             (0x2060 <= c && c <= 0x206F) ||
109             (c == 0x3164) ||
110             (0xFE00 <= c && c <= 0xFE0F) ||
111             (c == 0xFEFF) ||
112             (c == 0xFFA0) ||
113             (0xFFF0 <= c && c <= 0xFFF8) ||
114             (0x1BCA0 <= c && c <= 0x1BCA3) ||
115             (0x1D173 <= c && c <= 0x1D17A) ||
116             (0xE0000 <= c && c <= 0xE0FFF);
117     }
118     /**
119      * Decoder Callback interface
120      * @stable ICU 3.6
121      */
122     public interface Decoder {
123         /**
124          * This function is called when the bytes in the source cannot be handled,
125          * and this function is meant to handle or fix the error if possible.
126          *
127          * @return Result of decoding action. This returned object is set to an error
128          *  if this function could not handle the conversion.
129          * @stable ICU 3.6
130          */
call(CharsetDecoderICU decoder, Object context, ByteBuffer source, CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr)131         public CoderResult call(CharsetDecoderICU decoder, Object context,
132                                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
133                                 char[] buffer, int length, CoderResult cr);
134     }
135     /**
136      * Encoder Callback interface
137      * @stable ICU 3.6
138      */
139     public interface Encoder {
140         /**
141          * This function is called when the Unicode characters in the source cannot be handled,
142          * and this function is meant to handle or fix the error if possible.
143          * @return Result of decoding action. This returned object is set to an error
144          *  if this function could not handle the conversion.
145          * @stable ICU 3.6
146          */
call(CharsetEncoderICU encoder, Object context, CharBuffer source, ByteBuffer target, IntBuffer offsets, char[] buffer, int length, int cp, CoderResult cr)147         public CoderResult call(CharsetEncoderICU encoder, Object context,
148                                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
149                                 char[] buffer, int length, int cp, CoderResult cr);
150     }
151     /**
152      * Skip callback
153      * @stable ICU 3.6
154      */
155     public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
156         @Override
157         public CoderResult call(CharsetEncoderICU encoder, Object context,
158                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
159                 char[] buffer, int length, int cp, CoderResult cr){
160             if(context==null){
161                 return CoderResult.UNDERFLOW;
162             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
163                 if(!cr.isUnmappable()){
164                     return cr;
165                 }else{
166                     return CoderResult.UNDERFLOW;
167                 }
168             }
169             return cr;
170         }
171     };
172     /**
173      * Skip callback
174      * @stable ICU 3.6
175      */
176     public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
177         @Override
178         public CoderResult call(CharsetDecoderICU decoder, Object context,
179                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
180                 char[] buffer, int length, CoderResult cr){
181             if(context==null){
182                 return CoderResult.UNDERFLOW;
183             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
184                 if(!cr.isUnmappable()){
185                     return cr;
186                 }else{
187                     return CoderResult.UNDERFLOW;
188                 }
189             }
190             return cr;
191         }
192     };
193     /**
194      * Write substitute callback
195      * @stable ICU 3.6
196      */
197     public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){
198         @Override
199         public CoderResult call(CharsetEncoderICU encoder, Object context,
200                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
201                 char[] buffer, int length, int cp, CoderResult cr){
202             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
203                 return CoderResult.UNDERFLOW;
204             }else if(context==null){
205                 return encoder.cbFromUWriteSub(encoder, source, target, offsets);
206             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
207                 if(!cr.isUnmappable()){
208                     return cr;
209                 }else{
210                    return encoder.cbFromUWriteSub(encoder, source, target, offsets);
211                 }
212             }
213             return cr;
214         }
215     };
216     private static final char[] kSubstituteChar1 = new char[]{0x1A};
217     private static final char[] kSubstituteChar = new char[] {0xFFFD};
218     /**
219      * Write substitute callback
220      * @stable ICU 3.6
221      */
222     public static final Decoder TO_U_CALLBACK_SUBSTITUTE  = new Decoder() {
223         @Override
224         public CoderResult call(CharsetDecoderICU decoder, Object context,
225                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
226                 char[] buffer, int length, CoderResult cr){
227 
228             CharsetICU cs = (CharsetICU) decoder.charset();
229             /* Use the specified replacement character if it is different than the default one. */
230             boolean useReplacement = true;
231             char [] replacementChar = decoder.replacement().toCharArray();
232             if (replacementChar.length == 1 && (replacementChar[0] == kSubstituteChar1[0] || replacementChar[0] == kSubstituteChar[0])) {
233                 useReplacement = false;
234             }
235 
236             /* could optimize this case, just one uchar */
237             if(decoder.invalidCharLength == 1 && cs.subChar1 != 0) {
238                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar1, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
239             } else {
240                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
241             }
242         }
243     };
244     /**
245      * Stop callback
246      * @stable ICU 3.6
247      */
248     public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
249         @Override
250         public CoderResult call(CharsetEncoderICU encoder, Object context,
251                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
252                 char[] buffer, int length, int cp, CoderResult cr){
253             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
254                 return CoderResult.UNDERFLOW;
255             }
256             return cr;
257         }
258     };
259     /**
260      * Stop callback
261      * @stable ICU 3.6
262      */
263     public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
264         @Override
265         public CoderResult call(CharsetDecoderICU decoder, Object context,
266                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
267                 char[] buffer, int length, CoderResult cr){
268             return cr;
269         }
270     };
271     private static final int VALUE_STRING_LENGTH = 32;
272     private static final char UNICODE_PERCENT_SIGN_CODEPOINT    = 0x0025;
273     private static final char UNICODE_U_CODEPOINT               = 0x0055;
274     private static final char UNICODE_X_CODEPOINT               = 0x0058;
275     private static final char UNICODE_RS_CODEPOINT              = 0x005C;
276     private static final char UNICODE_U_LOW_CODEPOINT           = 0x0075;
277     private static final char UNICODE_X_LOW_CODEPOINT           = 0x0078;
278     private static final char UNICODE_AMP_CODEPOINT             = 0x0026;
279     private static final char UNICODE_HASH_CODEPOINT            = 0x0023;
280     private static final char UNICODE_SEMICOLON_CODEPOINT       = 0x003B;
281     private static final char UNICODE_PLUS_CODEPOINT            = 0x002B;
282     private static final char UNICODE_LEFT_CURLY_CODEPOINT      = 0x007B;
283     private static final char UNICODE_RIGHT_CURLY_CODEPOINT     = 0x007D;
284     private static final char UNICODE_SPACE_CODEPOINT           = 0x0020;
285     /**
286      * Write escape callback
287      * @stable ICU 4.0
288      */
289     public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() {
290         @Override
291         public CoderResult call(CharsetEncoderICU encoder, Object context,
292                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
293                 char[] buffer, int length, int cp, CoderResult cr){
294             char[] valueString = new char[VALUE_STRING_LENGTH];
295             int valueStringLength = 0;
296             int i = 0;
297 
298             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
299                 return CoderResult.UNDERFLOW;
300             }
301 
302             if (context == null || !(context instanceof String)) {
303                 while (i < length) {
304                     valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
305                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
306                     valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
307                 }
308             } else {
309                 if (((String)context).equals(ESCAPE_JAVA)) {
310                     while (i < length) {
311                         valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
312                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
313                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
314                     }
315                 } else if (((String)context).equals(ESCAPE_C)) {
316                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
317 
318                     if (length == 2) {
319                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
320                         valueStringLength = itou(valueString, valueStringLength, cp, 16, 8);
321                     } else {
322                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
323                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
324                     }
325                 } else if (((String)context).equals(ESCAPE_XML_DEC)) {
326                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
327                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
328                     if (length == 2) {
329                         valueStringLength += itou(valueString, valueStringLength, cp, 10, 0);
330                     } else {
331                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 10, 0);
332                     }
333                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
334                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
335                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
336                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
337                     valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
338                     if (length == 2) {
339                         valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
340                     } else {
341                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 0);
342                     }
343                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
344                 } else if (((String)context).equals(ESCAPE_UNICODE)) {
345                     valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
346                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
347                     valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT;          /* adding + */
348                     if (length == 2) {
349                         valueStringLength += itou(valueString, valueStringLength,cp, 16, 4);
350                     } else {
351                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
352                     }
353                     valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT;   /* adding } */
354                 } else if (((String)context).equals(ESCAPE_CSS2)) {
355                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
356                     valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
357                     /* Always add space character, because the next character might be whitespace,
358                        which would erroneously be considered the termination of the escape sequence. */
359                     valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT;
360                 } else {
361                     while (i < length) {
362                         valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
363                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
364                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
365                     }
366                 }
367             }
368             return encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets);
369         }
370     };
371     /**
372      * Write escape callback
373      * @stable ICU 4.0
374      */
375     public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() {
376         @Override
377         public CoderResult call(CharsetDecoderICU decoder, Object context,
378                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
379                 char[] buffer, int length, CoderResult cr){
380             char[] uniValueString = new char[VALUE_STRING_LENGTH];
381             int valueStringLength = 0;
382             int i = 0;
383 
384             if (context == null || !(context instanceof String)) {
385                 while (i < length) {
386                     uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
387                     uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding U */
388                     valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
389                 }
390             } else {
391                 if (((String)context).equals(ESCAPE_XML_DEC)) {
392                     while (i < length) {
393                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
394                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
395                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0);
396                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
397                     }
398                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
399                     while (i < length) {
400                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
401                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
402                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;  /* adding x */
403                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0);
404                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
405                     }
406                 } else if (((String)context).equals(ESCAPE_C)) {
407                     while (i < length) {
408                         uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT;         /* adding \ */
409                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;      /* adding x */
410                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
411                     }
412                 } else {
413                     while (i < length) {
414                         uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
415                         uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding X */
416                         itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
417                         valueStringLength += 2;
418                     }
419                 }
420             }
421 
422             cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);
423 
424             return cr;
425         }
426     };
427     /***
428      * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE.
429      * Fills in a char string with the radix-based representation of a number padded with zeroes
430      * to minwidth.
431      */
itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth)432     private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) {
433         int length = 0;
434         int digit;
435         int j;
436         char temp;
437 
438         do {
439             digit = i % radix;
440             buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7));
441             i = i/radix;
442         } while (i != 0 && (sourceIndex + length) < buffer.length);
443 
444         while (length < minwidth) {
445             buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */
446         }
447         /* reverses the string */
448         for (j = 0; j < (length / 2); j++) {
449             temp = buffer[(sourceIndex + length - 1) - j];
450             buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j];
451             buffer[sourceIndex + j] = temp;
452         }
453 
454         return length;
455     }
456 
    /*
     * Private constructor: the class only exposes static members, so there is
     * no need to create an instance.
     */
    private CharsetCallback() {
    }
462 }
463