1 /**
2 *******************************************************************************
3 * Copyright (C) 2006-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7 
8 package com.ibm.icu.charset;
9 
10 import java.nio.ByteBuffer;
11 import java.nio.CharBuffer;
12 import java.nio.IntBuffer;
13 import java.nio.charset.CoderResult;
14 
15 /**
16  * <h2> Callback API for CharsetICU API </h2>
17  *
18  *  CharsetCallback class defines some error behaviour functions called
19  *  by CharsetDecoderICU and CharsetEncoderICU. The class also provides
20  *  the facility by which clients can write their own callbacks.
21  *
22  *  These functions, although public, should NEVER be called directly.
 *  They should be used as parameters to the onUnmappableCharacter() and
24  *  onMalformedInput() methods, to set the behaviour of a converter
25  *  when it encounters UNMAPPED/INVALID sequences.
26  *  Currently the only way to set callbacks is by using CodingErrorAction.
27  *  In the future we will provide set methods on CharsetEncoder and CharsetDecoder
28  *  that will accept CharsetCallback fields.
29  *
30  * @stable ICU 3.6
31  */
32 
33 public class CharsetCallback {
34     /*
35      * FROM_U, TO_U context options for sub callback
36      */
37     private static final String SUB_STOP_ON_ILLEGAL = "i";
38 
39 //    /*
40 //     * FROM_U, TO_U context options for skip callback
41 //     */
42 //    private static final String SKIP_STOP_ON_ILLEGAL = "i";
43 
44 //    /*
45 //     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
46 //     */
47 //    private static final String ESCAPE_ICU  = null;
48 
49     /*
50      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
51      */
52     private static final String ESCAPE_JAVA     =  "J";
53 
54     /*
55      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
56      * TO_U_CALLBACK_ESCAPE option to escape the character value accoding to C (\\xXXXX)
57      */
58     private static final String ESCAPE_C        = "C";
59 
60     /*
61      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
62      * TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
63      */
64     private static final String ESCAPE_XML_DEC  = "D";
65 
66     /*
67      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
68      * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
69      */
70     private static final String ESCAPE_XML_HEX  = "X";
71 
72     /*
73      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
74      */
75     private static final String ESCAPE_UNICODE  = "U";
76 
77     /*
78      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
79      */
80     private static final String ESCAPE_CSS2  = "S";
81 
82     /*
83      * IS_DEFAULT_IGNORABLE_CODE_POINT
84      * This is to check if a code point has the default ignorable unicode property.
85      * As such, this list needs to be updated if the ignorable code point list ever
86      * changes.
87      * To avoid dependency on other code, this list is hard coded here.
88      * When an ignorable code point is found and is unmappable, the default callbacks
89      * will ignore them.
90      * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
91      *
92      * This list should be sync with the one in ucnv_err.c
93      *
94      */
IS_DEFAULT_IGNORABLE_CODE_POINT(int c)95     private static boolean IS_DEFAULT_IGNORABLE_CODE_POINT(int c) {
96         return ((c == 0x00AD) ||
97                 (c == 0x034F) ||
98                 (c == 0x061C) ||
99                 (c == 0x115F) ||
100                 (c == 0x1160) ||
101                 (0x17B4 <= c && c <= 0x17B5) ||
102                 (0x180B <= c && c <= 0x180E) ||
103                 (0x200B <= c && c <= 0x200F) ||
104                 (0x202A <= c && c <= 0x202E) ||
105                 (c == 0x2060) ||
106                 (0x2066 <= c && c <= 0x2069) ||
107                 (0x2061 <= c && c <= 0x2064) ||
108                 (0x206A <= c && c <= 0x206F) ||
109                 (c == 0x3164) ||
110                 (0x0FE00 <= c && c <= 0x0FE0F) ||
111                 (c == 0x0FEFF) ||
112                 (c == 0x0FFA0) ||
113                 (0x01BCA0  <= c && c <= 0x01BCA3) ||
114                 (0x01D173 <= c && c <= 0x01D17A) ||
115                 (c == 0x0E0001) ||
116                 (0x0E0020 <= c && c <= 0x0E007F) ||
117                 (0x0E0100 <= c && c <= 0x0E01EF) ||
118                 (c == 0x2065) ||
119                 (0x0FFF0 <= c && c <= 0x0FFF8) ||
120                 (c == 0x0E0000) ||
121                 (0x0E0002 <= c && c <= 0x0E001F) ||
122                 (0x0E0080 <= c && c <= 0x0E00FF) ||
123                 (0x0E01F0 <= c && c <= 0x0E0FFF)
124                 );
125     }
126     /**
127      * Decoder Callback interface
128      * @stable ICU 3.6
129      */
130     public interface Decoder {
131         /**
132          * This function is called when the bytes in the source cannot be handled,
133          * and this function is meant to handle or fix the error if possible.
134          *
135          * @return Result of decoding action. This returned object is set to an error
136          *  if this function could not handle the conversion.
137          * @stable ICU 3.6
138          */
call(CharsetDecoderICU decoder, Object context, ByteBuffer source, CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr)139         public CoderResult call(CharsetDecoderICU decoder, Object context,
140                                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
141                                 char[] buffer, int length, CoderResult cr);
142     }
143     /**
144      * Encoder Callback interface
145      * @stable ICU 3.6
146      */
147     public interface Encoder {
148         /**
149          * This function is called when the Unicode characters in the source cannot be handled,
150          * and this function is meant to handle or fix the error if possible.
151          * @return Result of decoding action. This returned object is set to an error
152          *  if this function could not handle the conversion.
153          * @stable ICU 3.6
154          */
call(CharsetEncoderICU encoder, Object context, CharBuffer source, ByteBuffer target, IntBuffer offsets, char[] buffer, int length, int cp, CoderResult cr)155         public CoderResult call(CharsetEncoderICU encoder, Object context,
156                                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
157                                 char[] buffer, int length, int cp, CoderResult cr);
158     }
159     /**
160      * Skip callback
161      * @stable ICU 3.6
162      */
163     public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
164         public CoderResult call(CharsetEncoderICU encoder, Object context,
165                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
166                 char[] buffer, int length, int cp, CoderResult cr){
167             if(context==null){
168                 return CoderResult.UNDERFLOW;
169             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
170                 if(!cr.isUnmappable()){
171                     return cr;
172                 }else{
173                     return CoderResult.UNDERFLOW;
174                 }
175             }
176             return cr;
177         }
178     };
179     /**
180      * Skip callback
181      * @stable ICU 3.6
182      */
183     public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
184         public CoderResult call(CharsetDecoderICU decoder, Object context,
185                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
186                 char[] buffer, int length, CoderResult cr){
187             if(context==null){
188                 return CoderResult.UNDERFLOW;
189             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
190                 if(!cr.isUnmappable()){
191                     return cr;
192                 }else{
193                     return CoderResult.UNDERFLOW;
194                 }
195             }
196             return cr;
197         }
198     };
199     /**
200      * Write substitute callback
201      * @stable ICU 3.6
202      */
203     public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){
204         public CoderResult call(CharsetEncoderICU encoder, Object context,
205                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
206                 char[] buffer, int length, int cp, CoderResult cr){
207             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
208                 return CoderResult.UNDERFLOW;
209             }else if(context==null){
210                 return encoder.cbFromUWriteSub(encoder, source, target, offsets);
211             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
212                 if(!cr.isUnmappable()){
213                     return cr;
214                 }else{
215                    return encoder.cbFromUWriteSub(encoder, source, target, offsets);
216                 }
217             }
218             return cr;
219         }
220     };
221     private static final char[] kSubstituteChar1 = new char[]{0x1A};
222     private static final char[] kSubstituteChar = new char[] {0xFFFD};
223     /**
224      * Write substitute callback
225      * @stable ICU 3.6
226      */
227     public static final Decoder TO_U_CALLBACK_SUBSTITUTE  = new Decoder() {
228         public CoderResult call(CharsetDecoderICU decoder, Object context,
229                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
230                 char[] buffer, int length, CoderResult cr){
231 
232             CharsetICU cs = (CharsetICU) decoder.charset();
233             /* Use the specified replacement character if it is different than the default one. */
234             boolean useReplacement = true;
235             char [] replacementChar = decoder.replacement().toCharArray();
236             if (replacementChar.length == 1 && (replacementChar[0] == kSubstituteChar1[0] || replacementChar[0] == kSubstituteChar[0])) {
237                 useReplacement = false;
238             }
239 
240             /* could optimize this case, just one uchar */
241             if(decoder.invalidCharLength == 1 && cs.subChar1 != 0) {
242                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar1, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
243             } else {
244                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
245             }
246         }
247     };
248     /**
249      * Stop callback
250      * @stable ICU 3.6
251      */
252     public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
253         public CoderResult call(CharsetEncoderICU encoder, Object context,
254                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
255                 char[] buffer, int length, int cp, CoderResult cr){
256             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
257                 return CoderResult.UNDERFLOW;
258             }
259             return cr;
260         }
261     };
262     /**
263      * Stop callback
264      * @stable ICU 3.6
265      */
266     public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
267         public CoderResult call(CharsetDecoderICU decoder, Object context,
268                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
269                 char[] buffer, int length, CoderResult cr){
270             return cr;
271         }
272     };
273     private static final int VALUE_STRING_LENGTH = 32;
274     private static final char UNICODE_PERCENT_SIGN_CODEPOINT    = 0x0025;
275     private static final char UNICODE_U_CODEPOINT               = 0x0055;
276     private static final char UNICODE_X_CODEPOINT               = 0x0058;
277     private static final char UNICODE_RS_CODEPOINT              = 0x005C;
278     private static final char UNICODE_U_LOW_CODEPOINT           = 0x0075;
279     private static final char UNICODE_X_LOW_CODEPOINT           = 0x0078;
280     private static final char UNICODE_AMP_CODEPOINT             = 0x0026;
281     private static final char UNICODE_HASH_CODEPOINT            = 0x0023;
282     private static final char UNICODE_SEMICOLON_CODEPOINT       = 0x003B;
283     private static final char UNICODE_PLUS_CODEPOINT            = 0x002B;
284     private static final char UNICODE_LEFT_CURLY_CODEPOINT      = 0x007B;
285     private static final char UNICODE_RIGHT_CURLY_CODEPOINT     = 0x007D;
286     private static final char UNICODE_SPACE_CODEPOINT           = 0x0020;
287     /**
288      * Write escape callback
289      * @stable ICU 4.0
290      */
291     public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() {
292         public CoderResult call(CharsetEncoderICU encoder, Object context,
293                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
294                 char[] buffer, int length, int cp, CoderResult cr){
295             char[] valueString = new char[VALUE_STRING_LENGTH];
296             int valueStringLength = 0;
297             int i = 0;
298 
299             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
300                 return CoderResult.UNDERFLOW;
301             }
302 
303             if (context == null || !(context instanceof String)) {
304                 while (i < length) {
305                     valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
306                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
307                     valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
308                 }
309             } else {
310                 if (((String)context).equals(ESCAPE_JAVA)) {
311                     while (i < length) {
312                         valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
313                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
314                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
315                     }
316                 } else if (((String)context).equals(ESCAPE_C)) {
317                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
318 
319                     if (length == 2) {
320                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
321                         valueStringLength = itou(valueString, valueStringLength, cp, 16, 8);
322                     } else {
323                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
324                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
325                     }
326                 } else if (((String)context).equals(ESCAPE_XML_DEC)) {
327                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
328                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
329                     if (length == 2) {
330                         valueStringLength += itou(valueString, valueStringLength, cp, 10, 0);
331                     } else {
332                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 10, 0);
333                     }
334                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
335                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
336                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
337                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
338                     valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
339                     if (length == 2) {
340                         valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
341                     } else {
342                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 0);
343                     }
344                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
345                 } else if (((String)context).equals(ESCAPE_UNICODE)) {
346                     valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
347                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
348                     valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT;          /* adding + */
349                     if (length == 2) {
350                         valueStringLength += itou(valueString, valueStringLength,cp, 16, 4);
351                     } else {
352                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
353                     }
354                     valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT;   /* adding } */
355                 } else if (((String)context).equals(ESCAPE_CSS2)) {
356                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
357                     valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
358                     /* Always add space character, because the next character might be whitespace,
359                        which would erroneously be considered the termination of the escape sequence. */
360                     valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT;
361                 } else {
362                     while (i < length) {
363                         valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
364                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
365                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
366                     }
367                 }
368             }
369             return encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets);
370         }
371     };
372     /**
373      * Write escape callback
374      * @stable ICU 4.0
375      */
376     public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() {
377         public CoderResult call(CharsetDecoderICU decoder, Object context,
378                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
379                 char[] buffer, int length, CoderResult cr){
380             char[] uniValueString = new char[VALUE_STRING_LENGTH];
381             int valueStringLength = 0;
382             int i = 0;
383 
384             if (context == null || !(context instanceof String)) {
385                 while (i < length) {
386                     uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
387                     uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding U */
388                     valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
389                 }
390             } else {
391                 if (((String)context).equals(ESCAPE_XML_DEC)) {
392                     while (i < length) {
393                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
394                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
395                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0);
396                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
397                     }
398                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
399                     while (i < length) {
400                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
401                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
402                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;  /* adding x */
403                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0);
404                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
405                     }
406                 } else if (((String)context).equals(ESCAPE_C)) {
407                     while (i < length) {
408                         uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT;         /* adding \ */
409                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;      /* adding x */
410                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
411                     }
412                 } else {
413                     while (i < length) {
414                         uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
415                         uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding X */
416                         itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
417                         valueStringLength += 2;
418                     }
419                 }
420             }
421 
422             cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);
423 
424             return cr;
425         }
426     };
427     /***
428      * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE.
429      * Fills in a char string with the radix-based representation of a number padded with zeroes
430      * to minwidth.
431      */
itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth)432     private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) {
433         int length = 0;
434         int digit;
435         int j;
436         char temp;
437 
438         do {
439             digit = i % radix;
440             buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7));
441             i = i/radix;
442         } while (i != 0 && (sourceIndex + length) < buffer.length);
443 
444         while (length < minwidth) {
445             buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */
446         }
447         /* reverses the string */
448         for (j = 0; j < (length / 2); j++) {
449             temp = buffer[(sourceIndex + length - 1) - j];
450             buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j];
451             buffer[sourceIndex + j] = temp;
452         }
453 
454         return length;
455     }
456 
457     /*
458      * No need to create an instance
459      */
CharsetCallback()460     private CharsetCallback() {
461     }
462 }
463