1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 2002-2014, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.dev.test.charset;
11 
12 import java.nio.ByteBuffer;
13 import java.nio.CharBuffer;
14 import java.nio.charset.Charset;
15 import java.nio.charset.CharsetDecoder;
16 import java.nio.charset.CharsetEncoder;
17 import java.nio.charset.CoderResult;
18 import java.nio.charset.CodingErrorAction;
19 import java.util.Iterator;
20 import java.util.List;
21 
22 import org.junit.Test;
23 import org.junit.runner.RunWith;
24 
25 import com.ibm.icu.charset.CharsetCallback;
26 import com.ibm.icu.charset.CharsetDecoderICU;
27 import com.ibm.icu.charset.CharsetEncoderICU;
28 import com.ibm.icu.charset.CharsetICU;
29 import com.ibm.icu.charset.CharsetProviderICU;
30 import com.ibm.icu.dev.test.ModuleTest;
31 import com.ibm.icu.dev.test.ModuleTest.TestDataPair;
32 import com.ibm.icu.dev.test.TestDataModule.DataMap;
33 import com.ibm.icu.dev.test.TestDataModule.TestData;
34 import com.ibm.icu.dev.test.TestFmwk;
35 import com.ibm.icu.impl.ICUResourceBundle;
36 import com.ibm.icu.text.UnicodeSet;
37 
38 import junitparams.JUnitParamsRunner;
39 import junitparams.Parameters;
40 
41 /**
42  * This maps to convtest.c which tests the test file for data-driven conversion tests.
43  *
44  */
45 @RunWith(JUnitParamsRunner.class)
46 public class TestConversion extends TestFmwk {
47     /**
48      * This maps to the C struct of conversion case in convtest.h that stores the
49      * data for a conversion test
50      *
51      */
52     private class ConversionCase {
53         int caseNr;                                             // testcase index
54         String option = null;                                   // callback options
55         CodingErrorAction cbErrorAction = null;                 // callback action type
56         CharBuffer toUnicodeResult = null;
57         ByteBuffer fromUnicodeResult = null;
58 
59         // data retrieved from a test case conversion.txt
60         String charset;                                         // charset
61         String unicode;                                         // unicode string
62         ByteBuffer bytes;                                       // byte
63         int[] offsets;                                          // offsets
64         boolean finalFlush;                                     // flush
65         boolean fallbacks;                                      // fallback
66         String outErrorCode;                                    // errorCode
67         String cbopt;                                           // callback
68 
69         // TestGetUnicodeSet variables
70         String map;
71         String mapnot;
72         int which;
73 
74         // CharsetCallback encoder and decoder
75         CharsetCallback.Decoder cbDecoder = null;
76         CharsetCallback.Encoder cbEncoder = null;
77 
caseNrAsString()78         String caseNrAsString() {
79             return "[" + caseNr + "]";
80         }
81     }
82 
    /*
     * In the data-driven conversion test, converters that are not available in
     * ICU4J are marked with the following leading symbol.
     *
     * When no encoder/decoder can be created for a charset whose name starts
     * with this character, the case is logged as skipped instead of being
     * reported as an error.
     */
    private static final char UNSUPPORTED_CHARSET_SYMBOL = '+';
87 
88     // public methods --------------------------------------------------------
89 
TestConversion()90     public TestConversion() {
91     }
92 
93     @SuppressWarnings("unused")
getTestData()94     private List<TestDataPair> getTestData() throws Exception {
95         return ModuleTest.getTestData("com/ibm/icu/dev/data/testdata/", "conversion");
96     }
97 
98     /*
99      * This method maps to the convtest.cpp runIndexedTest() method to run each
100      * type of conversion.
101      */
102     @Test
103     @Parameters(method="getTestData")
conversionTest(TestDataPair pair)104     public void conversionTest(TestDataPair pair) {
105         TestData td = pair.td;
106         //DataMap settings = pair.dm;
107 
108         int testFromUnicode = 0;
109         int testToUnicode = 0;
110         String testName = td.getName().toString();
111 
112         // Iterate through and get each of the test case to process
113         for (Iterator iter = td.getDataIterator(); iter.hasNext();) {
114             DataMap testcase = (DataMap) iter.next();
115 
116             if (testName.equalsIgnoreCase("toUnicode")) {
117                 TestToUnicode(testcase, testToUnicode);
118                 testToUnicode++;
119 
120             } else if (testName.equalsIgnoreCase("fromUnicode")) {
121                 TestFromUnicode(testcase, testFromUnicode);
122                 testFromUnicode++;
123             } else if (testName.equalsIgnoreCase("getUnicodeSet")) {
124                 TestGetUnicodeSet(testcase);
125             } else {
126                 warnln("Could not load the test cases for conversion");
127                 //                    continue;
128             }
129         }
130     }
131 
132     // private methods -------------------------------------------------------
133 
134 
135     // fromUnicode test worker functions ---------------------------------------
TestFromUnicode(DataMap testcase, int caseNr)136     private void TestFromUnicode(DataMap testcase, int caseNr) {
137 
138         ConversionCase cc = new ConversionCase();
139 
140         try {
141             // retrieve test case data
142             cc.caseNr = caseNr;
143             cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString();
144             cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString();
145             cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary();
146             cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector();
147             cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0;
148             cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0;
149             cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString();
150             cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString();
151 
152         } catch (Exception e) {
153             errln("Skipping test:");
154             errln("error parsing conversion/toUnicode test case " + cc.caseNr);
155             return;
156         }
157 
158         /*
159          * Skip the following data driven converter tests.
160          * These tests were added to the data driven conversion test in ICU
161          * to test direct-from-UTF-8 m:n Unicode:charset conversion.
162          * This feature is not in ICU4J.
163          * See #9601
164          */
165         // Android patch: Skip tests that fail with customized data.
166         String [] testsToSkip = {
167                 "*test2",
168                 "EUC-TW",
169                 "gb18030",
170                 "ibm-1386",
171                 "ibm-1390",
172                 "ibm-1390,swaplfnl",
173                 "ibm-1399",
174                 "ibm-16684",
175                 "ibm-25546",
176                 "ibm-930",
177                 "ibm-943",
178                 "ibm-970",
179                 "ibm-971",
180                 "IBM-eucJP",
181                 "iso-2022-cn",
182                 "iso-2022-jp",
183                 "ISO-2022-JP-2",
184                 "iso-2022-kr",
185                 "ISO-2022-KR",
186                 "JIS",
187                 "JIS7",
188                 "JIS8",
189                 "lmbcs",
190                 "windows-936",
191                 "x11-compound-text",
192         };
193         // Android patch end.
194         for (int i = 0; i < testsToSkip.length; i++) {
195             if (cc.charset.equals(testsToSkip[i])) {
196                 logln("");
197                 logln("Skipping: " + cc.charset);
198                 logln("...............................................");
199                 return;
200             }
201         }
202 
203         // ----for debugging only
204         logln("");
205         logln("TestFromUnicode[" + caseNr + "] " + cc.charset + " ");
206         logln("Unicode:   " + cc.unicode);
207         logln("Bytes:    " + printbytes(cc.bytes, cc.bytes.limit()));
208         ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes());
209         logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")");
210         logln("...............................................");
211 
212         // process the retrieved test data case
213         if (cc.offsets.length == 0) {
214             cc.offsets = null;
215         } else if (cc.offsets.length != cc.bytes.limit()) {
216             errln("fromUnicode[" + cc.caseNr + "] bytes[" + cc.bytes
217                     + "] and offsets[" + cc.offsets.length
218                     + "] must have the same length");
219             return;
220         }
221 
222         // check the callback replacement value
223         if (cc.cbopt.length() > 0) {
224 
225             switch ((cc.cbopt).charAt(0)) {
226             case '?':
227                 cc.cbErrorAction = CodingErrorAction.REPLACE;
228                 break;
229             case '0':
230                 cc.cbErrorAction = CodingErrorAction.IGNORE;
231                 break;
232             case '.':
233                 cc.cbErrorAction = CodingErrorAction.REPORT;
234                 break;
235             case '&':
236                 cc.cbErrorAction = CodingErrorAction.REPLACE;
237                 cc.cbEncoder = CharsetCallback.FROM_U_CALLBACK_ESCAPE;
238                 break;
239             default:
240                 cc.cbErrorAction = null;
241                 break;
242             }
243 
244             // check for any options for the callback value --
245             cc.option = cc.cbErrorAction == null ? cc.cbopt : cc.cbopt
246                     .substring(1);
247             if (cc.option == null) {
248                 cc.option = null;
249             }
250         }
251         FromUnicodeCase(cc);
252     }
253 
254 
FromUnicodeCase(ConversionCase cc)255     private void FromUnicodeCase(ConversionCase cc) {
256         // create charset encoder for conversion test
257         CharsetProviderICU provider = new CharsetProviderICU();
258         CharsetEncoder encoder = null;
259         Charset charset = null;
260         try {
261             // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
262             charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
263                     ? (Charset) provider.charsetForName(cc.charset.substring(1),
264                         "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
265                     : (Charset) provider.charsetForName(cc.charset);
266             if (charset != null) {
267                 encoder = charset.newEncoder();
268                 encoder.onMalformedInput(CodingErrorAction.REPLACE);
269                 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
270                 if (encoder instanceof CharsetEncoderICU) {
271                     ((CharsetEncoderICU)encoder).setFallbackUsed(cc.fallbacks);
272                     if (((CharsetEncoderICU)encoder).isFallbackUsed() != cc.fallbacks) {
273                         errln("Fallback could not be set for " + cc.charset);
274                     }
275                 }
276             }
277         } catch (Exception e) {
278             encoder = null;
279         }
280         if (encoder == null) {
281             if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) {
282                 logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time");
283             } else {
284                 errln(cc.charset + " was not found");
285             }
286             return;
287         }
288 
289         // set the callback for the encoder
290         if (cc.cbErrorAction != null) {
291             if (cc.cbEncoder != null) {
292                 ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.malformedForLength(1), cc.cbEncoder, cc.option);
293                 ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.unmappableForLength(1), cc.cbEncoder, cc.option);
294             } else {
295                 encoder.onUnmappableCharacter(cc.cbErrorAction);
296                 encoder.onMalformedInput(cc.cbErrorAction);
297             }
298 
299             // if action has an option, put in the option for the case
300             if (cc.option.equals("i")) {
301                 encoder.onMalformedInput(CodingErrorAction.REPORT);
302             }
303 
304             // if callback action is replace,
305             //   and there is a subchar
306             // replace the decoder's default replacement value
307             // if substring, skip test due to current api not supporting
308             // substring
309             if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) {
310                 if (cc.cbopt.length() > 1) {
311                     if (cc.cbopt.length() > 1 && cc.cbopt.charAt(1) == '=') {
312                         logln("Skipping test due to limitation in Java API - substitution string not supported");
313                         return;
314                     } else {
315                         // // read NUL-separated subchar first, if any
316                         // copy the subchar from Latin-1 characters
317                         // start after the NUL
318                         if (cc.cbopt.charAt(1) == 0x00) {
319                             cc.cbopt = cc.cbopt.substring(2);
320 
321                             try {
322                                 encoder.replaceWith(toByteArray(cc.cbopt));
323                             } catch (Exception e) {
324                                 logln("Skipping test due to limitation in Java API - substitution character sequence size error");
325                                 return;
326                             }
327                         }
328                     }
329                 }
330             }
331         }
332 
333         // do charset encoding from unicode
334 
335         // testing by steps using charset.encoder(in,out,flush)
336         int resultLength;
337         boolean ok;
338         String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked
339                 { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } };
340         int i, step;
341 
342         ok = true;
343 
344         for (i = 0; i < steps.length && ok; ++i) {
345             step = Integer.parseInt(steps[i][0]);
346 
347             logln("Testing step:[" + step + "]");
348             try {
349                 resultLength = stepFromUnicode(cc, encoder, step);
350                 ok = checkFromUnicode(cc, resultLength);
351             } catch (Exception ex) {
352                 errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]");
353                 ex.printStackTrace(System.out);
354                 return;
355             }
356 
357         }
358         // testing by whole buffer using out = charset.encoder(in)
359         while (ok && cc.finalFlush) {
360             logln("Testing java API charset.encoder(in):");
361             cc.fromUnicodeResult = null;
362             ByteBuffer out = null;
363 
364             try {
365                 out = encoder.encode(CharBuffer.wrap(cc.unicode.toCharArray()));
366                 out.position(out.limit());
367                 if (out.limit() != out.capacity() || cc.finalFlush) {
368                     int pos = out.position();
369                     byte[] temp = out.array();
370                     out = ByteBuffer.allocate(temp.length * 4);
371                     out.put(temp);
372                     out.position(pos);
373                     CoderResult cr = encoder.flush(out);
374                     if (cr.isOverflow()) {
375                         logln("Overflow error with flushing encoder");
376                     }
377                 }
378                 cc.fromUnicodeResult = out;
379 
380                 ok = checkFromUnicode(cc, out.limit());
381                 if (!ok) {
382                     break;
383                 }
384             } catch (Exception e) {
385                 //check the error code to see if it matches cc.errorCode
386                 logln("Encoder returned an error code");
387                 logln("ErrorCode expected is: " + cc.outErrorCode);
388                 logln("Error Result is: " + e.toString());
389             }
390             break;
391         }
392     }
393 
stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step)394     private int stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step) {
395         if (step < 0) {
396             errln("Negative step size, test internal error.");
397             return 0;
398         }
399 
400         int sourceLen = cc.unicode.length();
401         int targetLen = cc.bytes.capacity() + 20;  // for BOM, and to let failures produce excess output
402         CharBuffer source = CharBuffer.wrap(cc.unicode.toCharArray());
403         ByteBuffer target = ByteBuffer.allocate(targetLen);
404         cc.fromUnicodeResult = null;
405         encoder.reset();
406 
407         int currentSourceLimit;
408         int currentTargetLimit;
409         if (step > 0) {
410             currentSourceLimit = Math.min(step, sourceLen);
411             currentTargetLimit = Math.min(step, targetLen);
412         } else {
413             currentSourceLimit = sourceLen;
414             currentTargetLimit = targetLen;
415         }
416 
417         CoderResult cr = null;
418 
419         for (;;) {
420             source.limit(currentSourceLimit);
421             target.limit(currentTargetLimit);
422 
423             cr = encoder.encode(source, target, currentSourceLimit == sourceLen);
424 
425             if (cr.isUnderflow()) {
426                 if (currentSourceLimit == sourceLen) {
427                     if (target.position() == cc.bytes.limit()) {
428                         // target contains the correct number of bytes
429                         break;
430                     }
431                     // Do a final flush for cleanup, then break out
432                     // Encode loop, exits with cr==underflow in normal operation.
433                     //target.limit(targetLen);
434                     target.limit(targetLen);
435                     cr = encoder.flush(target);
436                     if (cr.isUnderflow()) {
437                         // good
438                     } else if (cr.isOverflow()) {
439                         errln(cc.caseNrAsString() + " Flush is producing excessive output");
440                     } else {
441                         errln(cc.caseNrAsString() + " Flush operation failed.  CoderResult = \""
442                                 + cr.toString() + "\"");
443                     }
444                     break;
445                 }
446                 currentSourceLimit = Math.min(currentSourceLimit + step, sourceLen);
447             } else if (cr.isOverflow()) {
448                 if (currentTargetLimit == targetLen) {
449                     errln(cc.caseNrAsString() + " encode() is producing excessive output");
450                     break;
451                 }
452                 currentTargetLimit = Math.min(currentTargetLimit + step, targetLen);
453             } else {
454                 // check the error code to see if it matches cc.errorCode
455                 logln("Encoder returned an error code");
456                 logln("ErrorCode expected is: " + cc.outErrorCode);
457                 logln("Error Result is: " + cr.toString());
458                 break;
459             }
460 
461         }
462 
463         cc.fromUnicodeResult = target;
464         return target.position();
465     }
466 
checkFromUnicode(ConversionCase cc, int resultLength)467     private boolean checkFromUnicode(ConversionCase cc, int resultLength) {
468         return checkResultsFromUnicode(cc, cc.bytes, cc.fromUnicodeResult);
469     }
470 
471     // toUnicode test worker functions ----------------------------------------- ***
472 
TestToUnicode(DataMap testcase, int caseNr)473     private void TestToUnicode(DataMap testcase, int caseNr) {
474         // create Conversion case to store the test case data
475         ConversionCase cc = new ConversionCase();
476 
477         try {
478             // retrieve test case data
479             cc.caseNr = caseNr;
480             cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString();
481             cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary();
482             cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString();
483             cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector();
484             cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0;
485             cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0;
486             cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString();
487             cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString();
488 
489         } catch (Exception e) {
490             errln("Skipping test: error parsing conversion/toUnicode test case " + cc.caseNr);
491             return;
492         }
493 
494         // Android patch: Skip tests that fail with customized data.
495         String [] testsToSkip = {
496                 "ibm-1390,swaplfnl",
497         };
498         for (int i = 0; i < testsToSkip.length; i++) {
499             if (cc.charset.equals(testsToSkip[i])) {
500                 logln("");
501                 logln("Skipping: " + cc.charset);
502                 logln("...............................................");
503                 return;
504             }
505         }
506         // Android patch end.
507 
508         // ----for debugging only
509         logln("");
510         logln("TestToUnicode[" + caseNr + "] " + cc.charset + " ");
511         logln("Unicode:   " + hex(cc.unicode));
512         logln("Bytes:    " + printbytes(cc.bytes, cc.bytes.limit()));
513         ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes());
514         logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")");
515         logln("...............................................");
516 
517         // process the retrieved test data case
518         if (cc.offsets.length == 0) {
519             cc.offsets = null;
520         } else if (cc.offsets.length != cc.unicode.length()) {
521             errln("Skipping test: toUnicode[" + cc.caseNr + "] unicode["
522                     + cc.unicode.length() + "] and offsets["
523                     + cc.offsets.length + "] must have the same length");
524             return;
525         }
526         // check for the callback replacement value for unmappable
527         // characters or malformed errors
528         if (cc.cbopt.length() > 0) {
529             switch ((cc.cbopt).charAt(0)) {
530             case '?': // CALLBACK_SUBSTITUTE
531                 cc.cbErrorAction = CodingErrorAction.REPLACE;
532                 break;
533             case '0': // CALLBACK_SKIP
534                 cc.cbErrorAction = CodingErrorAction.IGNORE;
535                 break;
536             case '.': // CALLBACK_STOP
537                 cc.cbErrorAction = CodingErrorAction.REPORT;
538                 break;
539             case '&': // CALLBACK_ESCAPE
540                 cc.cbErrorAction = CodingErrorAction.REPORT;
541                 cc.cbDecoder = CharsetCallback.TO_U_CALLBACK_ESCAPE;
542                 break;
543             default:
544                 cc.cbErrorAction = null;
545                 break;
546             }
547         }
548         // check for any options for the callback value
549         cc.option = cc.cbErrorAction == null ? null : cc.cbopt.substring(1);
550         if (cc.option == null) {
551             cc.option = null;
552         }
553 
554         ToUnicodeCase(cc);
555 
556     }
557 
ToUnicodeCase(ConversionCase cc)558     private void ToUnicodeCase(ConversionCase cc) {
559 
560         // create converter for charset and decoder for each test case
561         CharsetProviderICU provider = new CharsetProviderICU();
562         CharsetDecoder decoder = null;
563         Charset charset = null;
564 
565         try {
566             // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
567             charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
568                     ? (Charset) provider.charsetForName(cc.charset.substring(1),
569                         "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
570                     : (Charset) provider.charsetForName(cc.charset);
571             if (charset != null) {
572                 decoder = charset.newDecoder();
573                 decoder.onMalformedInput(CodingErrorAction.REPLACE);
574                 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
575             }
576         } catch (Exception e) {
577             // TODO implement loading of test data.
578             decoder = null;
579         }
580         if (decoder == null) {
581             if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) {
582                 logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time");
583             } else {
584                 errln(cc.charset + " was not found");
585             }
586             return;
587         }
588 
589         // set the callback for the decoder
590         if (cc.cbErrorAction != null) {
591             if (cc.cbDecoder != null) {
592                 ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.malformedForLength(1), cc.cbDecoder, cc.option);
593                 ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.unmappableForLength(1), cc.cbDecoder, cc.option);
594             } else {
595                 decoder.onMalformedInput(cc.cbErrorAction);
596                 decoder.onUnmappableCharacter(cc.cbErrorAction);
597             }
598 
599             // set the options (if any: SKIP_STOP_ON_ILLEGAL) for callback
600             if (cc.option.equals("i")) {
601                 decoder.onMalformedInput(CodingErrorAction.REPORT);
602             }
603 
604             // if callback action is replace, and there is a subchar
605             // replace the decoder's default replacement value
606             // if substring, skip test due to current api not supporting
607             // substring replacement
608             if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) {
609                 if (cc.cbopt.length() > 1) {
610                     if (cc.cbopt.charAt(1) == '=') {
611                         logln("Skipping test due to limitation in Java API - substitution string not supported");
612 
613                     } else {
614                         // // read NUL-separated subchar first, if any
615                         // copy the subchar from Latin-1 characters
616                         // start after the NUL
617                         if (cc.cbopt.charAt(1) == 0x00) {
618                             cc.cbopt = cc.cbopt.substring(2);
619 
620                             try {
621                                 decoder.replaceWith(cc.cbopt);
622                             } catch (Exception e) {
623                                 logln("Skipping test due to limitation in Java API - substitution character sequence size error");
624                             }
625                         }
626                     }
627                 }
628             }
629         }
630 
631         //      Check the step to unicode
632         boolean ok;
633         int resultLength;
634 
635         String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked
636                 { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } };
637         /* TODO: currently not supported test steps, getNext API is not supported for now
638          { "-1", "getNext" },
639          { "-2", "toU(bulk)+getNext" },
640          { "-3", "getNext+toU(bulk)" },
641          { "-4", "toU(1)+getNext" },
642          { "-5", "getNext+toU(1)" },
643          { "-12", "toU(5)+getNext" },
644          { "-13", "getNext+toU(5)" }};*/
645 
646         ok = true;
647         int step;
648         // testing by steps using the CoderResult cr = charset.decoder(in,out,flush) api
649         for (int i = 0; i < steps.length && ok; ++i) {
650             step = Integer.parseInt(steps[i][0]);
651 
652             if (step < 0 && !cc.finalFlush) {
653                 continue;
654             }
655             logln("Testing step:[" + step + "]");
656 
657             try {
658                 resultLength = stepToUnicode(cc, decoder, step);
659                 ok = checkToUnicode(cc, resultLength);
660             } catch (Exception ex) {
661                 errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]");
662                 ex.printStackTrace(System.out);
663                 return;
664             }
665         }
666 
667         //testing the java's out = charset.decoder(in) api
668         while (ok && cc.finalFlush) {
669             logln("Testing java charset.decoder(in):");
670             cc.toUnicodeResult = null;
671             CharBuffer out = null;
672 
673             try {
674                 cc.bytes.rewind();
675                 out = decoder.decode(cc.bytes);
676                 out.position(out.limit());
677                 if (out.limit() < cc.unicode.length()) {
678                     int pos = out.position();
679                     char[] temp = out.array();
680                     out = CharBuffer.allocate(cc.bytes.limit());
681                     out.put(temp);
682                     out.position(pos);
683                     CoderResult cr = decoder.flush(out);
684                     if (cr.isOverflow()) {
685                         logln("Overflow error with flushing decodering");
686                     }
687                 }
688 
689                 cc.toUnicodeResult = out;
690 
691                 ok = checkToUnicode(cc, out.limit());
692                 if (!ok) {
693                     break;
694                 }
695             } catch (Exception e) {
696                 //check the error code to see if it matches cc.errorCode
697                 logln("Decoder returned an error code");
698                 logln("ErrorCode expected is: " + cc.outErrorCode);
699                 logln("Error Result is: " + e.toString());
700             }
701             break;
702         }
703 
704         return;
705     }
706 
707 
708 
709 
    /**
     * Decodes {@code cc.bytes} with {@code decoder} into a freshly allocated
     * {@link CharBuffer} while restricting how much input and output space the
     * decoder sees on each call, then stores that buffer in
     * {@code cc.toUnicodeResult}.
     *
     * <p>The decoder is reset first, the source position is set to 0 and the
     * target is allocated with {@code cc.unicode.length() + 4} chars.
     *
     * <ul>
     * <li>{@code step == 0}: bulk mode, the full source and the full target are
     *     offered on every call.</li>
     * <li>{@code step > 0}: the source limit grows by {@code step} per loop
     *     iteration; the target limit is moved to {@code position + step} after
     *     each overflow.</li>
     * <li>{@code step < 0}: the target is limited to one more char per round.
     *     Odd values offer the whole input (the existing comments compare this
     *     to ucnv_getNextUChar()); even values offer the whole input for
     *     {@code -2} and otherwise at most {@code (-step - 2) / 2} more bytes.
     *     Values below -1 alternate between the odd and the even branch.</li>
     * </ul>
     *
     * @param cc      test case supplying {@code bytes}, {@code unicode},
     *                {@code finalFlush} and {@code outErrorCode}; its
     *                {@code toUnicodeResult} is overwritten
     * @param decoder decoder under test
     * @param step    stepping mode as described above
     * @return the target buffer's position when conversion stopped
     */
    private int stepToUnicode(ConversionCase cc, CharsetDecoder decoder,
            int step)

    {
        ByteBuffer source;
        CharBuffer target;
        boolean flush = false;
        int sourceLen;
        source = cc.bytes;
        sourceLen = cc.bytes.limit();
        source.position(0);
        // a little head room beyond the expected output length
        target = CharBuffer.allocate(cc.unicode.length() + 4);
        target.position(0);
        cc.toUnicodeResult = null;
        decoder.reset();

        if (step >= 0) {

            // iStep/oStep are the candidate limits for the source and the target
            int iStep = step;
            int oStep = step;

            for (;;) {

                if (step != 0) {
                    // clamp both windows to the real buffer sizes
                    source.limit((iStep <= sourceLen) ? iStep : sourceLen);
                    target.limit((oStep <= target.capacity()) ? oStep : target
                            .capacity());
                    // only signal end of input once the whole source is visible
                    flush = (cc.finalFlush && source.limit() == sourceLen);

                } else {
                    //bulk mode
                    source.limit(sourceLen);
                    target.limit(target.capacity());
                    flush = cc.finalFlush;
                }
                // convert
                CoderResult cr = null;
                if (source.hasRemaining()) {

                    cr = decoder.decode(source, target, flush);
                    // check pointers and errors
                    if (cr.isOverflow()) {
                        // the partial target is filled, set a new limit,
                        oStep = (target.position() + step);
                        target.limit((oStep < target.capacity()) ? oStep
                                : target.capacity());
                        // NOTE(review): the limit was just clamped to at most
                        // target.capacity(), so this comparison looks like it can
                        // never be true; an overflow at full capacity would then
                        // keep looping. Confirm the intended exit condition.
                        if (target.limit() > target.capacity()) {
                            //target has reached its limit, an error occurred or test case has an error code
                            //check error code
                            logln("UnExpected error: Target Buffer is larger than capacity");
                            break;
                        }

                    } else if (cr.isError()) {
                        //check the error code to see if it matches cc.errorCode
                        logln("Decoder returned an error code");
                        logln("ErrorCode expected is: " + cc.outErrorCode);
                        logln("Error Result is: " + cr.toString());
                        break;
                    }

                } else {
                    // nothing left inside the current source window
                    if (source.limit() == sourceLen) {

                        // whole input consumed: final decode call with endOfInput == true
                        cr = decoder.decode(source, target, true);

                        //due to limitation of the API we need to check for target limit for expected
                        if (target.position() != cc.unicode.length()) {
                            // NOTE(review): Buffer.limit(n) also lowers the position
                            // when it exceeds n, so this may truncate an over-long
                            // result - confirm this is intended.
                            if (target.limit() != cc.unicode.length()) {
                                target.limit(cc.unicode.length());
                            }
                            cr = decoder.flush(target);
                            if (cr.isError()) {
                                errln("Flush operation failed");
                            }
                        }
                        break;
                    }
                }
                // widen the source window for the next round (no-op in bulk mode)
                iStep += step;

            }

        }// if (step >= 0)

        //--------------------------------------------------------------------------
        else /* step<0 */{
            /*
             * step==-1: call only ucnv_getNextUChar()
             * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
             *   if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
             *   else give it at most (-step-2)/2 bytes
             */

            for (;;) {
                // convert
                if ((step & 1) != 0 /* odd: -1, -3, -5, ... */) {

                    // open room for exactly one more char unless already at capacity
                    target.limit(target.position() < target.capacity() ? target
                            .position() + 1 : target.capacity());

                    // decode behavior is return to output target 1 character
                    CoderResult cr = null;

                    //similar to getNextUChar() , input is the whole string, while outputs only 1 character
                    source.limit(sourceLen);
                    while (target.position() != target.limit()
                            && source.hasRemaining()) {
                        cr = decoder.decode(source, target,
                                source.limit() == sourceLen);

                        if (cr.isOverflow()) {

                            if (target.limit() >= target.capacity()) {
                                // target has reached its limit, an error occurred
                                logln("UnExpected error: Target Buffer is larger than capacity");
                                break;
                            } else {
                                //1 character has been consumed
                                target.limit(target.position() + 1);
                                break;
                            }
                        } else if (cr.isError()) {
                            logln("Decoder returned an error code");
                            logln("ErrorCode expected is: " + cc.outErrorCode);
                            logln("Error Result is: " + cr.toString());

                            // early exit: publish what has been decoded so far
                            cc.toUnicodeResult = target;
                            return target.position();
                        }

                        else {
                            // one character has been consumed
                            if (target.limit() == target.position()) {
                                target.limit(target.position() + 1);
                                break;
                            }
                        }

                    }
                    if (source.position() == sourceLen) {

                        // due to limitation of the API we need to check
                        // for target limit for expected
                        cr = decoder.decode(source, target, true);
                        if (target.position() != cc.unicode.length()) {

                            // NOTE(review): as in the step >= 0 branch, this can pull
                            // the position back to the expected length - confirm.
                            target.limit(cc.unicode.length());
                            cr = decoder.flush(target);
                            if (cr.isError()) {
                                errln("Flush operation failed");
                            }
                        }
                        break;
                    }
                    // alternate between -n-1 and -n but leave -1 alone
                    if (step < -1) {
                        ++step;
                    }
                } else {/* step is even */
                    // allow only one UChar output

                    target.limit(target.position() < target.capacity() ? target
                            .position() + 1 : target.capacity());
                    if (step == -2) {
                        source.limit(sourceLen);
                    } else {
                        // offer at most (-step - 2) / 2 further bytes, clamped to the input length
                        source.limit(source.position() + (-step - 2) / 2);
                        if (source.limit() > sourceLen) {
                            source.limit(sourceLen);
                        }
                    }
                    CoderResult cr = decoder.decode(source, target, source
                            .limit() == sourceLen);
                    // check pointers and errors
                    if (cr.isOverflow()) {
                        // one character has been consumed
                        if (target.limit() >= target.capacity()) {
                            // target has reached its limit, an error occurred
                            logln("Unexpected error: Target Buffer is larger than capacity");
                            break;
                        }
                    } else if (cr.isError()) {
                        logln("Decoder returned an error code");
                        logln("ErrorCode expected is: " + cc.outErrorCode);
                        logln("Error Result is: " + cr.toString());
                        break;
                    }

                    // switch to the odd branch for the next round
                    --step;
                }
            }
        }

        //--------------------------------------------------------------------------

        cc.toUnicodeResult = target;
        return target.position();
    }
909 
910 
911 
checkToUnicode(ConversionCase cc, int resultLength)912     private boolean checkToUnicode(ConversionCase cc, int resultLength) {
913         return checkResultsToUnicode(cc, cc.unicode, cc.toUnicodeResult);
914     }
915 
916 
TestGetUnicodeSet(DataMap testcase)917     private void TestGetUnicodeSet(DataMap testcase) {
918         /*
919          * charset - will be opened, and ucnv_getUnicodeSet() called on it //
920          * map - set of code points and strings that must be in the returned set //
921          * mapnot - set of code points and strings that must *not* be in the //
922          * returned set // which - numeric UConverterUnicodeSet value Headers {
923          * "charset", "map", "mapnot", "which" }
924          */
925 
926 
927         // retrieve test case data
928         ConversionCase cc = new ConversionCase();
929         CharsetProviderICU provider = new CharsetProviderICU();
930         CharsetICU charset  ;
931 
932 
933         UnicodeSet mapset = new UnicodeSet();
934         UnicodeSet mapnotset = new UnicodeSet();
935         UnicodeSet unicodeset = new UnicodeSet();
936         String ellipsis = "0x2e";
937         cc.charset = ((ICUResourceBundle) testcase.getObject("charset"))
938                 .getString();
939         cc.map = ((ICUResourceBundle) testcase.getObject("map")).getString();
940         cc.mapnot = ((ICUResourceBundle) testcase.getObject("mapnot"))
941                 .getString();
942 
943 
944         cc.which = ((ICUResourceBundle) testcase.getObject("which")).getInt(); // only checking for ROUNDTRIP_SET
945 
946         // Android patch: Skip tests that fail with customized data.
947         String [] testsToSkip = {
948                 "HZ",
949                 "ibm-1390",
950                 "ibm-16684",
951                 "ibm-25546",
952                 "ibm-971",
953                 "ISO-2022-CN",
954                 "ISO-2022-JP",
955                 "ISO-2022-JP-2",
956                 "ISO-2022-KR",
957                 "JIS7",
958         };
959         for (int i = 0; i < testsToSkip.length; i++) {
960             if (cc.charset.equals(testsToSkip[i])) {
961                 logln("");
962                 logln("Skipping: " + cc.charset);
963                 logln("...............................................");
964                 return;
965             }
966         }
967         // Android patch end.
968 
969         // ----for debugging only
970         logln("");
971         logln("TestGetUnicodeSet[" + cc.charset + "] ");
972         logln("...............................................");
973 
974         try{
975             // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
976             charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
977                     ? (CharsetICU) provider.charsetForName(cc.charset.substring(1),
978                             "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
979                             : (CharsetICU) provider.charsetForName(cc.charset);
980 
981                     //checking for converter that are not supported at this point
982                     try{
983                         if(charset==null ||
984                                 charset.name()=="BOCU-1" ||charset.name()== "SCSU"|| charset.name()=="lmbcs1" || charset.name()== "lmbcs2" ||
985                                 charset.name()== "lmbcs3" || charset.name()== "lmbcs4" || charset.name()=="lmbcs5" || charset.name()=="lmbcs6" ||
986                                 charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" ||
987                                 charset.name()=="lmbcs18"|| charset.name()=="lmbcs19"){
988                             logln("Converter not supported at this point :" + cc.charset);
989                             return;
990                         }
991 
992                         if(cc.which==1){
993                             logln("Fallback set not supported at this point for converter : "+charset.displayName());
994                             return;
995                         }
996 
997                     }catch(Exception e){
998                         return;
999                     }
1000 
1001                     mapset.clear();
1002                     mapnotset.clear();
1003 
1004                     mapset.applyPattern(cc.map,false);
1005                     mapnotset.applyPattern(cc.mapnot,false);
1006 
1007                     charset.getUnicodeSet(unicodeset, cc.which);
1008                     UnicodeSet diffset = new UnicodeSet();
1009 
1010                     //are there items that must be in unicodeset but are not?
1011                     (diffset = mapset).removeAll(unicodeset);
1012                     if(!diffset.isEmpty()){
1013                         StringBuffer s = new StringBuffer(diffset.toPattern(true));
1014                         if(s.length()>100){
1015                             s.replace(0, 0x7fffffff, ellipsis);
1016                         }
1017                         errln("error in missing items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString());
1018                     }
1019 
1020                     //are the items that must not be in unicodeset but are?
1021                     (diffset=mapnotset).retainAll(unicodeset);
1022                     if(!diffset.isEmpty()){
1023                         StringBuffer s = new StringBuffer(diffset.toPattern(true));
1024                         if(s.length()>100){
1025                             s.replace(0, 0x7fffffff, ellipsis);
1026                         }
1027                         errln("contains unexpected items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString());
1028                     }
1029         } catch (Exception e) {
1030             errln("getUnicodeSet returned an error code");
1031             errln("ErrorCode expected is: " + cc.outErrorCode);
1032             errln("Error Result is: " + e.toString());
1033             return;
1034         }
1035     }
1036 
1037     /**
1038      * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
1039      * start of the stream for example U+FEFF (the Unicode BOM/signature
1040      * character) that can be ignored.
1041      *
1042      * Detects Unicode signature byte sequences at the start of the byte stream
1043      * and returns number of bytes of the BOM of the indicated Unicode charset.
1044      * 0 is returned when no Unicode signature is recognized.
1045      *
1046      */
1047 
detectUnicodeSignature(ByteBuffer source)1048     private String detectUnicodeSignature(ByteBuffer source) {
1049         int signatureLength = 0; // number of bytes of the signature
1050         final int SIG_MAX_LEN = 5;
1051         String sigUniCharset = null; // states what unicode charset is the BOM
1052         int i = 0;
1053 
1054         /*
1055          * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
1056          * don't misdetect something
1057          */
1058         byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
1059                 (byte) 0xa5 };
1060 
1061         while (i < source.limit() && i < SIG_MAX_LEN) {
1062             start[i] = source.get(i);
1063             i++;
1064         }
1065 
1066         if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
1067             signatureLength = 2;
1068             sigUniCharset = "UTF-16BE";
1069             source.position(signatureLength);
1070             return sigUniCharset;
1071         } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
1072             if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
1073                 signatureLength = 4;
1074                 sigUniCharset = "UTF-32LE";
1075                 source.position(signatureLength);
1076                 return sigUniCharset;
1077             } else {
1078                 signatureLength = 2;
1079                 sigUniCharset = "UTF-16LE";
1080                 source.position(signatureLength);
1081                 return sigUniCharset;
1082             }
1083         } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
1084                 && start[2] == (byte) 0xBF) {
1085             signatureLength = 3;
1086             sigUniCharset = "UTF-8";
1087             source.position(signatureLength);
1088             return sigUniCharset;
1089         } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
1090                 && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
1091             signatureLength = 4;
1092             sigUniCharset = "UTF-32BE";
1093             source.position(signatureLength);
1094             return sigUniCharset;
1095         } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
1096                 && start[2] == (byte) 0xFF) {
1097             signatureLength = 3;
1098             sigUniCharset = "SCSU";
1099             source.position(signatureLength);
1100             return sigUniCharset;
1101         } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
1102                 && start[2] == (byte) 0x28) {
1103             signatureLength = 3;
1104             sigUniCharset = "BOCU-1";
1105             source.position(signatureLength);
1106             return sigUniCharset;
1107         } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
1108                 && start[2] == (byte) 0x76) {
1109 
1110             if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
1111                 signatureLength = 5;
1112                 sigUniCharset = "UTF-7";
1113                 source.position(signatureLength);
1114                 return sigUniCharset;
1115             } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
1116                     || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
1117                 signatureLength = 4;
1118                 sigUniCharset = "UTF-7";
1119                 source.position(signatureLength);
1120                 return sigUniCharset;
1121             }
1122         } else if (start[0] == (byte) 0xDD && start[2] == (byte) 0x73
1123                 && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
1124             signatureLength = 4;
1125             sigUniCharset = "UTF-EBCDIC";
1126             source.position(signatureLength);
1127             return sigUniCharset;
1128         }
1129 
1130         /* no known Unicode signature byte sequence recognized */
1131         return null;
1132     }
1133 
printbytes(ByteBuffer buf, int pos)1134     String printbytes(ByteBuffer buf, int pos) {
1135         int cur = buf.position();
1136         String res = " (" + pos + ")==[";
1137         for (int i = 0; i < pos; i++) {
1138             res += "(" + i + ")" + hex(buf.get(i) & 0xff).substring(2) + " ";
1139         }
1140         buf.position(cur);
1141         return res + "]";
1142     }
1143 
printchars(CharBuffer buf, int pos)1144     String printchars(CharBuffer buf, int pos) {
1145         int cur = buf.position();
1146         String res = " (" + pos + ")==[";
1147         for (int i = 0; i < pos; i++) {
1148             res += "(" + i + ")" + hex(buf.get(i)) + " ";
1149         }
1150         buf.position(cur);
1151         return res + "]";
1152     }
1153 
checkResultsFromUnicode(ConversionCase cc, ByteBuffer expected, ByteBuffer output)1154     private boolean checkResultsFromUnicode(ConversionCase cc, ByteBuffer expected,
1155             ByteBuffer output) {
1156 
1157         boolean res = true;
1158         expected.rewind();
1159         output.limit(output.position());
1160         output.rewind();
1161 
1162         // remove any BOM signature before checking
1163         if (!cc.charset.contains("UnicodeLittle") && !cc.charset.contains("UnicodeBig")) {
1164             detectUnicodeSignature(output); // sets the position to after the BOM
1165             output = output.slice(); // removes anything before the current position
1166         }
1167 
1168         if (output.limit() != expected.limit()) {
1169             errln("Test failed: output length does not match expected for charset: " + cc.charset
1170                     + " [" + cc.caseNr + "]");
1171             res = false;
1172         } else {
1173             while (output.hasRemaining()) {
1174                 if (output.get() != expected.get()) {
1175                     errln("Test failed: output does not match expected for charset: " + cc.charset
1176                             + " [" + cc.caseNr + "]");
1177                     res = false;
1178                     break;
1179                 }
1180             }
1181         }
1182 
1183         if (res) {
1184             logln("[" + cc.caseNr + "]:" + cc.charset);
1185             logln("Input:       " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length()));
1186             logln("Output:      " + printbytes(output, output.limit()));
1187             logln("Expected:    " + printbytes(expected, expected.limit()));
1188             logln("Passed");
1189         }
1190         else {
1191             errln("[" + cc.caseNr + "]:" + cc.charset);
1192             errln("Input:       " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length()));
1193             errln("Output:      " + printbytes(output, output.limit()));
1194             errln("Expected:    " + printbytes(expected, expected.limit()));
1195             errln("Failed");
1196         }
1197         return res;
1198     }
1199 
checkResultsToUnicode(ConversionCase cc, String expected, CharBuffer output)1200     private boolean checkResultsToUnicode(ConversionCase cc, String expected, CharBuffer output) {
1201 
1202         boolean res = true;
1203         output.limit(output.position());
1204         output.rewind();
1205 
1206         // test to see if the conversion matches actual results
1207         if (output.limit() != expected.length()) {
1208             errln("Test failed: output length does not match expected for charset: "+cc.charset+ " [" + cc.caseNr + "]");
1209             res = false;
1210         } else {
1211             for (int i = 0; i < expected.length(); i++) {
1212                 if (output.get(i) != expected.charAt(i)) {
1213                     errln("Test failed: output does not match expected for charset: " + cc.charset
1214                             + " [" + cc.caseNr + "]");
1215                     res = false;
1216                     break;
1217                 }
1218             }
1219         }
1220 
1221         if (res) {
1222             logln("[" + cc.caseNr + "]:" + cc.charset);
1223             logln("Input:       " + printbytes(cc.bytes, cc.bytes.limit()));
1224             logln("Output:      " + printchars(output, output.limit()));
1225             logln("Expected:    " + printchars(CharBuffer.wrap(expected), expected.length()));
1226             logln("Passed");
1227         } else {
1228             errln("[" + cc.caseNr + "]:" + cc.charset);
1229             errln("Input:       " + printbytes(cc.bytes, cc.bytes.limit()));
1230             errln("Output:      " + printchars(output, output.limit()));
1231             errln("Expected:    " + printchars(CharBuffer.wrap(expected), expected.length()));
1232             errln("Failed");
1233         }
1234         return res;
1235     }
1236 
toByteArray(String str)1237     private byte[] toByteArray(String str) {
1238         byte[] ret = new byte[str.length()];
1239         for (int i = 0; i < ret.length; i++) {
1240             char ch = str.charAt(i);
1241             if (ch <= 0xFF) {
1242                 ret[i] = (byte) ch;
1243             } else {
1244                 throw new IllegalArgumentException(" byte value out of range: " + ch);
1245             }
1246         }
1247         return ret;
1248     }
1249 }
1250