1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2002-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.dev.test.charset; 11 12 import java.nio.ByteBuffer; 13 import java.nio.CharBuffer; 14 import java.nio.charset.Charset; 15 import java.nio.charset.CharsetDecoder; 16 import java.nio.charset.CharsetEncoder; 17 import java.nio.charset.CoderResult; 18 import java.nio.charset.CodingErrorAction; 19 import java.util.Iterator; 20 import java.util.List; 21 22 import org.junit.Test; 23 import org.junit.runner.RunWith; 24 25 import com.ibm.icu.charset.CharsetCallback; 26 import com.ibm.icu.charset.CharsetDecoderICU; 27 import com.ibm.icu.charset.CharsetEncoderICU; 28 import com.ibm.icu.charset.CharsetICU; 29 import com.ibm.icu.charset.CharsetProviderICU; 30 import com.ibm.icu.dev.test.ModuleTest; 31 import com.ibm.icu.dev.test.ModuleTest.TestDataPair; 32 import com.ibm.icu.dev.test.TestDataModule.DataMap; 33 import com.ibm.icu.dev.test.TestDataModule.TestData; 34 import com.ibm.icu.dev.test.TestFmwk; 35 import com.ibm.icu.impl.ICUResourceBundle; 36 import com.ibm.icu.text.UnicodeSet; 37 38 import junitparams.JUnitParamsRunner; 39 import junitparams.Parameters; 40 41 /** 42 * This maps to convtest.c which tests the test file for data-driven conversion tests. 43 * 44 */ 45 @RunWith(JUnitParamsRunner.class) 46 public class TestConversion extends TestFmwk { 47 /** 48 * This maps to the C struct of conversion case in convtest.h that stores the 49 * data for a conversion test 50 * 51 */ 52 private class ConversionCase { 53 int caseNr; // testcase index 54 String option = null; // callback options 55 CodingErrorAction cbErrorAction = null; // callback action type 56 CharBuffer toUnicodeResult = null; 57 ByteBuffer fromUnicodeResult = null; 58 59 // data retrieved from a test case conversion.txt 60 String charset; // charset 61 String unicode; // unicode string 62 ByteBuffer bytes; // byte 63 int[] offsets; // offsets 64 boolean finalFlush; // flush 65 boolean fallbacks; // fallback 66 String outErrorCode; // errorCode 67 String cbopt; // callback 68 69 // TestGetUnicodeSet variables 70 String map; 71 String mapnot; 72 int which; 73 74 // CharsetCallback encoder and decoder 75 CharsetCallback.Decoder cbDecoder = null; 76 CharsetCallback.Encoder cbEncoder = null; 77 caseNrAsString()78 String caseNrAsString() { 79 return "[" + caseNr + "]"; 80 } 81 } 82 83 /* In the data-driven conversion test, converters that are not available in 84 * ICU4J are marked with the following leading symbol. 85 */ 86 private static final char UNSUPPORTED_CHARSET_SYMBOL = '+'; 87 88 // public methods -------------------------------------------------------- 89 TestConversion()90 public TestConversion() { 91 } 92 93 @SuppressWarnings("unused") getTestData()94 private List<TestDataPair> getTestData() throws Exception { 95 return ModuleTest.getTestData("com/ibm/icu/dev/data/testdata/", "conversion"); 96 } 97 98 /* 99 * This method maps to the convtest.cpp runIndexedTest() method to run each 100 * type of conversion. 101 */ 102 @Test 103 @Parameters(method="getTestData") conversionTest(TestDataPair pair)104 public void conversionTest(TestDataPair pair) { 105 TestData td = pair.td; 106 //DataMap settings = pair.dm; 107 108 int testFromUnicode = 0; 109 int testToUnicode = 0; 110 String testName = td.getName().toString(); 111 112 // Iterate through and get each of the test case to process 113 for (Iterator iter = td.getDataIterator(); iter.hasNext();) { 114 DataMap testcase = (DataMap) iter.next(); 115 116 if (testName.equalsIgnoreCase("toUnicode")) { 117 TestToUnicode(testcase, testToUnicode); 118 testToUnicode++; 119 120 } else if (testName.equalsIgnoreCase("fromUnicode")) { 121 TestFromUnicode(testcase, testFromUnicode); 122 testFromUnicode++; 123 } else if (testName.equalsIgnoreCase("getUnicodeSet")) { 124 TestGetUnicodeSet(testcase); 125 } else { 126 warnln("Could not load the test cases for conversion"); 127 // continue; 128 } 129 } 130 } 131 132 // private methods ------------------------------------------------------- 133 134 135 // fromUnicode test worker functions --------------------------------------- TestFromUnicode(DataMap testcase, int caseNr)136 private void TestFromUnicode(DataMap testcase, int caseNr) { 137 138 ConversionCase cc = new ConversionCase(); 139 140 try { 141 // retrieve test case data 142 cc.caseNr = caseNr; 143 cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString(); 144 cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString(); 145 cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary(); 146 cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector(); 147 cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0; 148 cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0; 149 cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString(); 150 cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString(); 151 152 } catch (Exception e) { 153 errln("Skipping test:"); 154 errln("error parsing conversion/toUnicode test case " + cc.caseNr); 155 return; 156 } 157 158 /* 159 * Skip the following data driven converter tests. 160 * These tests were added to the data driven conversion test in ICU 161 * to test direct-from-UTF-8 m:n Unicode:charset conversion. 162 * This feature is not in ICU4J. 163 * See #9601 164 */ 165 // Android patch: Skip tests that fail with customized data. 166 String [] testsToSkip = { 167 "*test2", 168 "EUC-TW", 169 "gb18030", 170 "ibm-1386", 171 "ibm-1390", 172 "ibm-1390,swaplfnl", 173 "ibm-1399", 174 "ibm-16684", 175 "ibm-25546", 176 "ibm-930", 177 "ibm-943", 178 "ibm-970", 179 "ibm-971", 180 "IBM-eucJP", 181 "iso-2022-cn", 182 "iso-2022-jp", 183 "ISO-2022-JP-2", 184 "iso-2022-kr", 185 "ISO-2022-KR", 186 "JIS", 187 "JIS7", 188 "JIS8", 189 "lmbcs", 190 "windows-936", 191 "x11-compound-text", 192 }; 193 // Android patch end. 194 for (int i = 0; i < testsToSkip.length; i++) { 195 if (cc.charset.equals(testsToSkip[i])) { 196 logln(""); 197 logln("Skipping: " + cc.charset); 198 logln("..............................................."); 199 return; 200 } 201 } 202 203 // ----for debugging only 204 logln(""); 205 logln("TestFromUnicode[" + caseNr + "] " + cc.charset + " "); 206 logln("Unicode: " + cc.unicode); 207 logln("Bytes: " + printbytes(cc.bytes, cc.bytes.limit())); 208 ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes()); 209 logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")"); 210 logln("..............................................."); 211 212 // process the retrieved test data case 213 if (cc.offsets.length == 0) { 214 cc.offsets = null; 215 } else if (cc.offsets.length != cc.bytes.limit()) { 216 errln("fromUnicode[" + cc.caseNr + "] bytes[" + cc.bytes 217 + "] and offsets[" + cc.offsets.length 218 + "] must have the same length"); 219 return; 220 } 221 222 // check the callback replacement value 223 if (cc.cbopt.length() > 0) { 224 225 switch ((cc.cbopt).charAt(0)) { 226 case '?': 227 cc.cbErrorAction = CodingErrorAction.REPLACE; 228 break; 229 case '0': 230 cc.cbErrorAction = CodingErrorAction.IGNORE; 231 break; 232 case '.': 233 cc.cbErrorAction = CodingErrorAction.REPORT; 234 break; 235 case '&': 236 cc.cbErrorAction = CodingErrorAction.REPLACE; 237 cc.cbEncoder = CharsetCallback.FROM_U_CALLBACK_ESCAPE; 238 break; 239 default: 240 cc.cbErrorAction = null; 241 break; 242 } 243 244 // check for any options for the callback value -- 245 cc.option = cc.cbErrorAction == null ? cc.cbopt : cc.cbopt 246 .substring(1); 247 if (cc.option == null) { 248 cc.option = null; 249 } 250 } 251 FromUnicodeCase(cc); 252 } 253 254 FromUnicodeCase(ConversionCase cc)255 private void FromUnicodeCase(ConversionCase cc) { 256 // create charset encoder for conversion test 257 CharsetProviderICU provider = new CharsetProviderICU(); 258 CharsetEncoder encoder = null; 259 Charset charset = null; 260 try { 261 // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata 262 charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*') 263 ? (Charset) provider.charsetForName(cc.charset.substring(1), 264 "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader()) 265 : (Charset) provider.charsetForName(cc.charset); 266 if (charset != null) { 267 encoder = charset.newEncoder(); 268 encoder.onMalformedInput(CodingErrorAction.REPLACE); 269 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 270 if (encoder instanceof CharsetEncoderICU) { 271 ((CharsetEncoderICU)encoder).setFallbackUsed(cc.fallbacks); 272 if (((CharsetEncoderICU)encoder).isFallbackUsed() != cc.fallbacks) { 273 errln("Fallback could not be set for " + cc.charset); 274 } 275 } 276 } 277 } catch (Exception e) { 278 encoder = null; 279 } 280 if (encoder == null) { 281 if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) { 282 logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time"); 283 } else { 284 errln(cc.charset + " was not found"); 285 } 286 return; 287 } 288 289 // set the callback for the encoder 290 if (cc.cbErrorAction != null) { 291 if (cc.cbEncoder != null) { 292 ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.malformedForLength(1), cc.cbEncoder, cc.option); 293 ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.unmappableForLength(1), cc.cbEncoder, cc.option); 294 } else { 295 encoder.onUnmappableCharacter(cc.cbErrorAction); 296 encoder.onMalformedInput(cc.cbErrorAction); 297 } 298 299 // if action has an option, put in the option for the case 300 if (cc.option.equals("i")) { 301 encoder.onMalformedInput(CodingErrorAction.REPORT); 302 } 303 304 // if callback action is replace, 305 // and there is a subchar 306 // replace the decoder's default replacement value 307 // if substring, skip test due to current api not supporting 308 // substring 309 if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) { 310 if (cc.cbopt.length() > 1) { 311 if (cc.cbopt.length() > 1 && cc.cbopt.charAt(1) == '=') { 312 logln("Skipping test due to limitation in Java API - substitution string not supported"); 313 return; 314 } else { 315 // // read NUL-separated subchar first, if any 316 // copy the subchar from Latin-1 characters 317 // start after the NUL 318 if (cc.cbopt.charAt(1) == 0x00) { 319 cc.cbopt = cc.cbopt.substring(2); 320 321 try { 322 encoder.replaceWith(toByteArray(cc.cbopt)); 323 } catch (Exception e) { 324 logln("Skipping test due to limitation in Java API - substitution character sequence size error"); 325 return; 326 } 327 } 328 } 329 } 330 } 331 } 332 333 // do charset encoding from unicode 334 335 // testing by steps using charset.encoder(in,out,flush) 336 int resultLength; 337 boolean ok; 338 String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked 339 { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } }; 340 int i, step; 341 342 ok = true; 343 344 for (i = 0; i < steps.length && ok; ++i) { 345 step = Integer.parseInt(steps[i][0]); 346 347 logln("Testing step:[" + step + "]"); 348 try { 349 resultLength = stepFromUnicode(cc, encoder, step); 350 ok = checkFromUnicode(cc, resultLength); 351 } catch (Exception ex) { 352 errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]"); 353 ex.printStackTrace(System.out); 354 return; 355 } 356 357 } 358 // testing by whole buffer using out = charset.encoder(in) 359 while (ok && cc.finalFlush) { 360 logln("Testing java API charset.encoder(in):"); 361 cc.fromUnicodeResult = null; 362 ByteBuffer out = null; 363 364 try { 365 out = encoder.encode(CharBuffer.wrap(cc.unicode.toCharArray())); 366 out.position(out.limit()); 367 if (out.limit() != out.capacity() || cc.finalFlush) { 368 int pos = out.position(); 369 byte[] temp = out.array(); 370 out = ByteBuffer.allocate(temp.length * 4); 371 out.put(temp); 372 out.position(pos); 373 CoderResult cr = encoder.flush(out); 374 if (cr.isOverflow()) { 375 logln("Overflow error with flushing encoder"); 376 } 377 } 378 cc.fromUnicodeResult = out; 379 380 ok = checkFromUnicode(cc, out.limit()); 381 if (!ok) { 382 break; 383 } 384 } catch (Exception e) { 385 //check the error code to see if it matches cc.errorCode 386 logln("Encoder returned an error code"); 387 logln("ErrorCode expected is: " + cc.outErrorCode); 388 logln("Error Result is: " + e.toString()); 389 } 390 break; 391 } 392 } 393 stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step)394 private int stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step) { 395 if (step < 0) { 396 errln("Negative step size, test internal error."); 397 return 0; 398 } 399 400 int sourceLen = cc.unicode.length(); 401 int targetLen = cc.bytes.capacity() + 20; // for BOM, and to let failures produce excess output 402 CharBuffer source = CharBuffer.wrap(cc.unicode.toCharArray()); 403 ByteBuffer target = ByteBuffer.allocate(targetLen); 404 cc.fromUnicodeResult = null; 405 encoder.reset(); 406 407 int currentSourceLimit; 408 int currentTargetLimit; 409 if (step > 0) { 410 currentSourceLimit = Math.min(step, sourceLen); 411 currentTargetLimit = Math.min(step, targetLen); 412 } else { 413 currentSourceLimit = sourceLen; 414 currentTargetLimit = targetLen; 415 } 416 417 CoderResult cr = null; 418 419 for (;;) { 420 source.limit(currentSourceLimit); 421 target.limit(currentTargetLimit); 422 423 cr = encoder.encode(source, target, currentSourceLimit == sourceLen); 424 425 if (cr.isUnderflow()) { 426 if (currentSourceLimit == sourceLen) { 427 if (target.position() == cc.bytes.limit()) { 428 // target contains the correct number of bytes 429 break; 430 } 431 // Do a final flush for cleanup, then break out 432 // Encode loop, exits with cr==underflow in normal operation. 433 //target.limit(targetLen); 434 target.limit(targetLen); 435 cr = encoder.flush(target); 436 if (cr.isUnderflow()) { 437 // good 438 } else if (cr.isOverflow()) { 439 errln(cc.caseNrAsString() + " Flush is producing excessive output"); 440 } else { 441 errln(cc.caseNrAsString() + " Flush operation failed. CoderResult = \"" 442 + cr.toString() + "\""); 443 } 444 break; 445 } 446 currentSourceLimit = Math.min(currentSourceLimit + step, sourceLen); 447 } else if (cr.isOverflow()) { 448 if (currentTargetLimit == targetLen) { 449 errln(cc.caseNrAsString() + " encode() is producing excessive output"); 450 break; 451 } 452 currentTargetLimit = Math.min(currentTargetLimit + step, targetLen); 453 } else { 454 // check the error code to see if it matches cc.errorCode 455 logln("Encoder returned an error code"); 456 logln("ErrorCode expected is: " + cc.outErrorCode); 457 logln("Error Result is: " + cr.toString()); 458 break; 459 } 460 461 } 462 463 cc.fromUnicodeResult = target; 464 return target.position(); 465 } 466 checkFromUnicode(ConversionCase cc, int resultLength)467 private boolean checkFromUnicode(ConversionCase cc, int resultLength) { 468 return checkResultsFromUnicode(cc, cc.bytes, cc.fromUnicodeResult); 469 } 470 471 // toUnicode test worker functions ----------------------------------------- *** 472 TestToUnicode(DataMap testcase, int caseNr)473 private void TestToUnicode(DataMap testcase, int caseNr) { 474 // create Conversion case to store the test case data 475 ConversionCase cc = new ConversionCase(); 476 477 try { 478 // retrieve test case data 479 cc.caseNr = caseNr; 480 cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString(); 481 cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary(); 482 cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString(); 483 cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector(); 484 cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0; 485 cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0; 486 cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString(); 487 cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString(); 488 489 } catch (Exception e) { 490 errln("Skipping test: error parsing conversion/toUnicode test case " + cc.caseNr); 491 return; 492 } 493 494 // Android patch: Skip tests that fail with customized data. 495 String [] testsToSkip = { 496 "ibm-1390,swaplfnl", 497 }; 498 for (int i = 0; i < testsToSkip.length; i++) { 499 if (cc.charset.equals(testsToSkip[i])) { 500 logln(""); 501 logln("Skipping: " + cc.charset); 502 logln("..............................................."); 503 return; 504 } 505 } 506 // Android patch end. 507 508 // ----for debugging only 509 logln(""); 510 logln("TestToUnicode[" + caseNr + "] " + cc.charset + " "); 511 logln("Unicode: " + hex(cc.unicode)); 512 logln("Bytes: " + printbytes(cc.bytes, cc.bytes.limit())); 513 ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes()); 514 logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")"); 515 logln("..............................................."); 516 517 // process the retrieved test data case 518 if (cc.offsets.length == 0) { 519 cc.offsets = null; 520 } else if (cc.offsets.length != cc.unicode.length()) { 521 errln("Skipping test: toUnicode[" + cc.caseNr + "] unicode[" 522 + cc.unicode.length() + "] and offsets[" 523 + cc.offsets.length + "] must have the same length"); 524 return; 525 } 526 // check for the callback replacement value for unmappable 527 // characters or malformed errors 528 if (cc.cbopt.length() > 0) { 529 switch ((cc.cbopt).charAt(0)) { 530 case '?': // CALLBACK_SUBSTITUTE 531 cc.cbErrorAction = CodingErrorAction.REPLACE; 532 break; 533 case '0': // CALLBACK_SKIP 534 cc.cbErrorAction = CodingErrorAction.IGNORE; 535 break; 536 case '.': // CALLBACK_STOP 537 cc.cbErrorAction = CodingErrorAction.REPORT; 538 break; 539 case '&': // CALLBACK_ESCAPE 540 cc.cbErrorAction = CodingErrorAction.REPORT; 541 cc.cbDecoder = CharsetCallback.TO_U_CALLBACK_ESCAPE; 542 break; 543 default: 544 cc.cbErrorAction = null; 545 break; 546 } 547 } 548 // check for any options for the callback value 549 cc.option = cc.cbErrorAction == null ? null : cc.cbopt.substring(1); 550 if (cc.option == null) { 551 cc.option = null; 552 } 553 554 ToUnicodeCase(cc); 555 556 } 557 ToUnicodeCase(ConversionCase cc)558 private void ToUnicodeCase(ConversionCase cc) { 559 560 // create converter for charset and decoder for each test case 561 CharsetProviderICU provider = new CharsetProviderICU(); 562 CharsetDecoder decoder = null; 563 Charset charset = null; 564 565 try { 566 // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata 567 charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*') 568 ? (Charset) provider.charsetForName(cc.charset.substring(1), 569 "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader()) 570 : (Charset) provider.charsetForName(cc.charset); 571 if (charset != null) { 572 decoder = charset.newDecoder(); 573 decoder.onMalformedInput(CodingErrorAction.REPLACE); 574 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 575 } 576 } catch (Exception e) { 577 // TODO implement loading of test data. 578 decoder = null; 579 } 580 if (decoder == null) { 581 if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) { 582 logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time"); 583 } else { 584 errln(cc.charset + " was not found"); 585 } 586 return; 587 } 588 589 // set the callback for the decoder 590 if (cc.cbErrorAction != null) { 591 if (cc.cbDecoder != null) { 592 ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.malformedForLength(1), cc.cbDecoder, cc.option); 593 ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.unmappableForLength(1), cc.cbDecoder, cc.option); 594 } else { 595 decoder.onMalformedInput(cc.cbErrorAction); 596 decoder.onUnmappableCharacter(cc.cbErrorAction); 597 } 598 599 // set the options (if any: SKIP_STOP_ON_ILLEGAL) for callback 600 if (cc.option.equals("i")) { 601 decoder.onMalformedInput(CodingErrorAction.REPORT); 602 } 603 604 // if callback action is replace, and there is a subchar 605 // replace the decoder's default replacement value 606 // if substring, skip test due to current api not supporting 607 // substring replacement 608 if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) { 609 if (cc.cbopt.length() > 1) { 610 if (cc.cbopt.charAt(1) == '=') { 611 logln("Skipping test due to limitation in Java API - substitution string not supported"); 612 613 } else { 614 // // read NUL-separated subchar first, if any 615 // copy the subchar from Latin-1 characters 616 // start after the NUL 617 if (cc.cbopt.charAt(1) == 0x00) { 618 cc.cbopt = cc.cbopt.substring(2); 619 620 try { 621 decoder.replaceWith(cc.cbopt); 622 } catch (Exception e) { 623 logln("Skipping test due to limitation in Java API - substitution character sequence size error"); 624 } 625 } 626 } 627 } 628 } 629 } 630 631 // Check the step to unicode 632 boolean ok; 633 int resultLength; 634 635 String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked 636 { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } }; 637 /* TODO: currently not supported test steps, getNext API is not supported for now 638 { "-1", "getNext" }, 639 { "-2", "toU(bulk)+getNext" }, 640 { "-3", "getNext+toU(bulk)" }, 641 { "-4", "toU(1)+getNext" }, 642 { "-5", "getNext+toU(1)" }, 643 { "-12", "toU(5)+getNext" }, 644 { "-13", "getNext+toU(5)" }};*/ 645 646 ok = true; 647 int step; 648 // testing by steps using the CoderResult cr = charset.decoder(in,out,flush) api 649 for (int i = 0; i < steps.length && ok; ++i) { 650 step = Integer.parseInt(steps[i][0]); 651 652 if (step < 0 && !cc.finalFlush) { 653 continue; 654 } 655 logln("Testing step:[" + step + "]"); 656 657 try { 658 resultLength = stepToUnicode(cc, decoder, step); 659 ok = checkToUnicode(cc, resultLength); 660 } catch (Exception ex) { 661 errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]"); 662 ex.printStackTrace(System.out); 663 return; 664 } 665 } 666 667 //testing the java's out = charset.decoder(in) api 668 while (ok && cc.finalFlush) { 669 logln("Testing java charset.decoder(in):"); 670 cc.toUnicodeResult = null; 671 CharBuffer out = null; 672 673 try { 674 cc.bytes.rewind(); 675 out = decoder.decode(cc.bytes); 676 out.position(out.limit()); 677 if (out.limit() < cc.unicode.length()) { 678 int pos = out.position(); 679 char[] temp = out.array(); 680 out = CharBuffer.allocate(cc.bytes.limit()); 681 out.put(temp); 682 out.position(pos); 683 CoderResult cr = decoder.flush(out); 684 if (cr.isOverflow()) { 685 logln("Overflow error with flushing decodering"); 686 } 687 } 688 689 cc.toUnicodeResult = out; 690 691 ok = checkToUnicode(cc, out.limit()); 692 if (!ok) { 693 break; 694 } 695 } catch (Exception e) { 696 //check the error code to see if it matches cc.errorCode 697 logln("Decoder returned an error code"); 698 logln("ErrorCode expected is: " + cc.outErrorCode); 699 logln("Error Result is: " + e.toString()); 700 } 701 break; 702 } 703 704 return; 705 } 706 707 708 709 stepToUnicode(ConversionCase cc, CharsetDecoder decoder, int step)710 private int stepToUnicode(ConversionCase cc, CharsetDecoder decoder, 711 int step) 712 713 { 714 ByteBuffer source; 715 CharBuffer target; 716 boolean flush = false; 717 int sourceLen; 718 source = cc.bytes; 719 sourceLen = cc.bytes.limit(); 720 source.position(0); 721 target = CharBuffer.allocate(cc.unicode.length() + 4); 722 target.position(0); 723 cc.toUnicodeResult = null; 724 decoder.reset(); 725 726 if (step >= 0) { 727 728 int iStep = step; 729 int oStep = step; 730 731 for (;;) { 732 733 if (step != 0) { 734 source.limit((iStep <= sourceLen) ? iStep : sourceLen); 735 target.limit((oStep <= target.capacity()) ? oStep : target 736 .capacity()); 737 flush = (cc.finalFlush && source.limit() == sourceLen); 738 739 } else { 740 //bulk mode 741 source.limit(sourceLen); 742 target.limit(target.capacity()); 743 flush = cc.finalFlush; 744 } 745 // convert 746 CoderResult cr = null; 747 if (source.hasRemaining()) { 748 749 cr = decoder.decode(source, target, flush); 750 // check pointers and errors 751 if (cr.isOverflow()) { 752 // the partial target is filled, set a new limit, 753 oStep = (target.position() + step); 754 target.limit((oStep < target.capacity()) ? oStep 755 : target.capacity()); 756 if (target.limit() > target.capacity()) { 757 //target has reached its limit, an error occurred or test case has an error code 758 //check error code 759 logln("UnExpected error: Target Buffer is larger than capacity"); 760 break; 761 } 762 763 } else if (cr.isError()) { 764 //check the error code to see if it matches cc.errorCode 765 logln("Decoder returned an error code"); 766 logln("ErrorCode expected is: " + cc.outErrorCode); 767 logln("Error Result is: " + cr.toString()); 768 break; 769 } 770 771 } else { 772 if (source.limit() == sourceLen) { 773 774 cr = decoder.decode(source, target, true); 775 776 //due to limitation of the API we need to check for target limit for expected 777 if (target.position() != cc.unicode.length()) { 778 if (target.limit() != cc.unicode.length()) { 779 target.limit(cc.unicode.length()); 780 } 781 cr = decoder.flush(target); 782 if (cr.isError()) { 783 errln("Flush operation failed"); 784 } 785 } 786 break; 787 } 788 } 789 iStep += step; 790 791 } 792 793 }// if(step ==0) 794 795 //-------------------------------------------------------------------------- 796 else /* step<0 */{ 797 /* 798 * step==-1: call only ucnv_getNextUChar() 799 * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar() 800 * if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input, 801 * else give it at most (-step-2)/2 bytes 802 */ 803 804 for (;;) { 805 // convert 806 if ((step & 1) != 0 /* odd: -1, -3, -5, ... */) { 807 808 target.limit(target.position() < target.capacity() ? target 809 .position() + 1 : target.capacity()); 810 811 // decode behavior is return to output target 1 character 812 CoderResult cr = null; 813 814 //similar to getNextUChar() , input is the whole string, while outputs only 1 character 815 source.limit(sourceLen); 816 while (target.position() != target.limit() 817 && source.hasRemaining()) { 818 cr = decoder.decode(source, target, 819 source.limit() == sourceLen); 820 821 if (cr.isOverflow()) { 822 823 if (target.limit() >= target.capacity()) { 824 // target has reached its limit, an error occurred 825 logln("UnExpected error: Target Buffer is larger than capacity"); 826 break; 827 } else { 828 //1 character has been consumed 829 target.limit(target.position() + 1); 830 break; 831 } 832 } else if (cr.isError()) { 833 logln("Decoder returned an error code"); 834 logln("ErrorCode expected is: " + cc.outErrorCode); 835 logln("Error Result is: " + cr.toString()); 836 837 cc.toUnicodeResult = target; 838 return target.position(); 839 } 840 841 else { 842 // one character has been consumed 843 if (target.limit() == target.position()) { 844 target.limit(target.position() + 1); 845 break; 846 } 847 } 848 849 } 850 if (source.position() == sourceLen) { 851 852 // due to limitation of the API we need to check 853 // for target limit for expected 854 cr = decoder.decode(source, target, true); 855 if (target.position() != cc.unicode.length()) { 856 857 target.limit(cc.unicode.length()); 858 cr = decoder.flush(target); 859 if (cr.isError()) { 860 errln("Flush operation failed"); 861 } 862 } 863 break; 864 } 865 // alternate between -n-1 and -n but leave -1 alone 866 if (step < -1) { 867 ++step; 868 } 869 } else {/* step is even */ 870 // allow only one UChar output 871 872 target.limit(target.position() < target.capacity() ? target 873 .position() + 1 : target.capacity()); 874 if (step == -2) { 875 source.limit(sourceLen); 876 } else { 877 source.limit(source.position() + (-step - 2) / 2); 878 if (source.limit() > sourceLen) { 879 source.limit(sourceLen); 880 } 881 } 882 CoderResult cr = decoder.decode(source, target, source 883 .limit() == sourceLen); 884 // check pointers and errors 885 if (cr.isOverflow()) { 886 // one character has been consumed 887 if (target.limit() >= target.capacity()) { 888 // target has reached its limit, an error occurred 889 logln("Unexpected error: Target Buffer is larger than capacity"); 890 break; 891 } 892 } else if (cr.isError()) { 893 logln("Decoder returned an error code"); 894 logln("ErrorCode expected is: " + cc.outErrorCode); 895 logln("Error Result is: " + cr.toString()); 896 break; 897 } 898 899 --step; 900 } 901 } 902 } 903 904 //-------------------------------------------------------------------------- 905 906 cc.toUnicodeResult = target; 907 return target.position(); 908 } 909 910 911 checkToUnicode(ConversionCase cc, int resultLength)912 private boolean checkToUnicode(ConversionCase cc, int resultLength) { 913 return checkResultsToUnicode(cc, cc.unicode, cc.toUnicodeResult); 914 } 915 916 TestGetUnicodeSet(DataMap testcase)917 private void TestGetUnicodeSet(DataMap testcase) { 918 /* 919 * charset - will be opened, and ucnv_getUnicodeSet() called on it // 920 * map - set of code points and strings that must be in the returned set // 921 * mapnot - set of code points and strings that must *not* be in the // 922 * returned set // which - numeric UConverterUnicodeSet value Headers { 923 * "charset", "map", "mapnot", "which" } 924 */ 925 926 927 // retrieve test case data 928 ConversionCase cc = new ConversionCase(); 929 CharsetProviderICU provider = new CharsetProviderICU(); 930 CharsetICU charset ; 931 932 933 UnicodeSet mapset = new UnicodeSet(); 934 UnicodeSet mapnotset = new UnicodeSet(); 935 UnicodeSet unicodeset = new UnicodeSet(); 936 String ellipsis = "0x2e"; 937 cc.charset = ((ICUResourceBundle) testcase.getObject("charset")) 938 .getString(); 939 cc.map = ((ICUResourceBundle) testcase.getObject("map")).getString(); 940 cc.mapnot = ((ICUResourceBundle) testcase.getObject("mapnot")) 941 .getString(); 942 943 944 cc.which = ((ICUResourceBundle) testcase.getObject("which")).getInt(); // only checking for ROUNDTRIP_SET 945 946 // Android patch: Skip tests that fail with customized data. 947 String [] testsToSkip = { 948 "HZ", 949 "ibm-1390", 950 "ibm-16684", 951 "ibm-25546", 952 "ibm-971", 953 "ISO-2022-CN", 954 "ISO-2022-JP", 955 "ISO-2022-JP-2", 956 "ISO-2022-KR", 957 "JIS7", 958 }; 959 for (int i = 0; i < testsToSkip.length; i++) { 960 if (cc.charset.equals(testsToSkip[i])) { 961 logln(""); 962 logln("Skipping: " + cc.charset); 963 logln("..............................................."); 964 return; 965 } 966 } 967 // Android patch end. 968 969 // ----for debugging only 970 logln(""); 971 logln("TestGetUnicodeSet[" + cc.charset + "] "); 972 logln("..............................................."); 973 974 try{ 975 // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata 976 charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*') 977 ? (CharsetICU) provider.charsetForName(cc.charset.substring(1), 978 "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader()) 979 : (CharsetICU) provider.charsetForName(cc.charset); 980 981 //checking for converter that are not supported at this point 982 try{ 983 if(charset==null || 984 charset.name()=="BOCU-1" ||charset.name()== "SCSU"|| charset.name()=="lmbcs1" || charset.name()== "lmbcs2" || 985 charset.name()== "lmbcs3" || charset.name()== "lmbcs4" || charset.name()=="lmbcs5" || charset.name()=="lmbcs6" || 986 charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" || 987 charset.name()=="lmbcs18"|| charset.name()=="lmbcs19"){ 988 logln("Converter not supported at this point :" + cc.charset); 989 return; 990 } 991 992 if(cc.which==1){ 993 logln("Fallback set not supported at this point for converter : "+charset.displayName()); 994 return; 995 } 996 997 }catch(Exception e){ 998 return; 999 } 1000 1001 mapset.clear(); 1002 mapnotset.clear(); 1003 1004 mapset.applyPattern(cc.map,false); 1005 mapnotset.applyPattern(cc.mapnot,false); 1006 1007 charset.getUnicodeSet(unicodeset, cc.which); 1008 UnicodeSet diffset = new UnicodeSet(); 1009 1010 //are there items that must be in unicodeset but are not? 1011 (diffset = mapset).removeAll(unicodeset); 1012 if(!diffset.isEmpty()){ 1013 StringBuffer s = new StringBuffer(diffset.toPattern(true)); 1014 if(s.length()>100){ 1015 s.replace(0, 0x7fffffff, ellipsis); 1016 } 1017 errln("error in missing items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString()); 1018 } 1019 1020 //are the items that must not be in unicodeset but are? 1021 (diffset=mapnotset).retainAll(unicodeset); 1022 if(!diffset.isEmpty()){ 1023 StringBuffer s = new StringBuffer(diffset.toPattern(true)); 1024 if(s.length()>100){ 1025 s.replace(0, 0x7fffffff, ellipsis); 1026 } 1027 errln("contains unexpected items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString()); 1028 } 1029 } catch (Exception e) { 1030 errln("getUnicodeSet returned an error code"); 1031 errln("ErrorCode expected is: " + cc.outErrorCode); 1032 errln("Error Result is: " + e.toString()); 1033 return; 1034 } 1035 } 1036 1037 /** 1038 * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the 1039 * start of the stream for example U+FEFF (the Unicode BOM/signature 1040 * character) that can be ignored. 1041 * 1042 * Detects Unicode signature byte sequences at the start of the byte stream 1043 * and returns number of bytes of the BOM of the indicated Unicode charset. 1044 * 0 is returned when no Unicode signature is recognized. 1045 * 1046 */ 1047 detectUnicodeSignature(ByteBuffer source)1048 private String detectUnicodeSignature(ByteBuffer source) { 1049 int signatureLength = 0; // number of bytes of the signature 1050 final int SIG_MAX_LEN = 5; 1051 String sigUniCharset = null; // states what unicode charset is the BOM 1052 int i = 0; 1053 1054 /* 1055 * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we 1056 * don't misdetect something 1057 */ 1058 byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, 1059 (byte) 0xa5 }; 1060 1061 while (i < source.limit() && i < SIG_MAX_LEN) { 1062 start[i] = source.get(i); 1063 i++; 1064 } 1065 1066 if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) { 1067 signatureLength = 2; 1068 sigUniCharset = "UTF-16BE"; 1069 source.position(signatureLength); 1070 return sigUniCharset; 1071 } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) { 1072 if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) { 1073 signatureLength = 4; 1074 sigUniCharset = "UTF-32LE"; 1075 source.position(signatureLength); 1076 return sigUniCharset; 1077 } else { 1078 signatureLength = 2; 1079 sigUniCharset = "UTF-16LE"; 1080 source.position(signatureLength); 1081 return sigUniCharset; 1082 } 1083 } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB 1084 && start[2] == (byte) 0xBF) { 1085 signatureLength = 3; 1086 sigUniCharset = "UTF-8"; 1087 source.position(signatureLength); 1088 return sigUniCharset; 1089 } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00 1090 && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) { 1091 signatureLength = 4; 1092 sigUniCharset = "UTF-32BE"; 1093 source.position(signatureLength); 1094 return sigUniCharset; 1095 } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE 1096 && start[2] == (byte) 0xFF) { 1097 signatureLength = 3; 1098 sigUniCharset = "SCSU"; 1099 source.position(signatureLength); 1100 return sigUniCharset; 1101 } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE 1102 && start[2] == (byte) 0x28) { 1103 signatureLength = 3; 1104 sigUniCharset = "BOCU-1"; 1105 source.position(signatureLength); 1106 return sigUniCharset; 1107 } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F 1108 && start[2] == (byte) 0x76) { 1109 1110 if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) { 1111 signatureLength = 5; 1112 sigUniCharset = "UTF-7"; 1113 source.position(signatureLength); 1114 return sigUniCharset; 1115 } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39 1116 || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) { 1117 signatureLength = 4; 1118 sigUniCharset = "UTF-7"; 1119 source.position(signatureLength); 1120 return sigUniCharset; 1121 } 1122 } else if (start[0] == (byte) 0xDD && start[2] == (byte) 0x73 1123 && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) { 1124 signatureLength = 4; 1125 sigUniCharset = "UTF-EBCDIC"; 1126 source.position(signatureLength); 1127 return sigUniCharset; 1128 } 1129 1130 /* no known Unicode signature byte sequence recognized */ 1131 return null; 1132 } 1133 printbytes(ByteBuffer buf, int pos)1134 String printbytes(ByteBuffer buf, int pos) { 1135 int cur = buf.position(); 1136 String res = " (" + pos + ")==["; 1137 for (int i = 0; i < pos; i++) { 1138 res += "(" + i + ")" + hex(buf.get(i) & 0xff).substring(2) + " "; 1139 } 1140 buf.position(cur); 1141 return res + "]"; 1142 } 1143 printchars(CharBuffer buf, int pos)1144 String printchars(CharBuffer buf, int pos) { 1145 int cur = buf.position(); 1146 String res = " (" + pos + ")==["; 1147 for (int i = 0; i < pos; i++) { 1148 res += "(" + i + ")" + hex(buf.get(i)) + " "; 1149 } 1150 buf.position(cur); 1151 return res + "]"; 1152 } 1153 checkResultsFromUnicode(ConversionCase cc, ByteBuffer expected, ByteBuffer output)1154 private boolean checkResultsFromUnicode(ConversionCase cc, ByteBuffer expected, 1155 ByteBuffer output) { 1156 1157 boolean res = true; 1158 expected.rewind(); 1159 output.limit(output.position()); 1160 output.rewind(); 1161 1162 // remove any BOM signature before checking 1163 if (!cc.charset.contains("UnicodeLittle") && !cc.charset.contains("UnicodeBig")) { 1164 detectUnicodeSignature(output); // sets the position to after the BOM 1165 output = output.slice(); // removes anything before the current position 1166 } 1167 1168 if (output.limit() != expected.limit()) { 1169 errln("Test failed: output length does not match expected for charset: " + cc.charset 1170 + " [" + cc.caseNr + "]"); 1171 res = false; 1172 } else { 1173 while (output.hasRemaining()) { 1174 if (output.get() != expected.get()) { 1175 errln("Test failed: output does not match expected for charset: " + cc.charset 1176 + " [" + cc.caseNr + "]"); 1177 res = false; 1178 break; 1179 } 1180 } 1181 } 1182 1183 if (res) { 1184 logln("[" + cc.caseNr + "]:" + cc.charset); 1185 logln("Input: " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length())); 1186 logln("Output: " + printbytes(output, output.limit())); 1187 logln("Expected: " + printbytes(expected, expected.limit())); 1188 logln("Passed"); 1189 } 1190 else { 1191 errln("[" + cc.caseNr + "]:" + cc.charset); 1192 errln("Input: " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length())); 1193 errln("Output: " + printbytes(output, output.limit())); 1194 errln("Expected: " + printbytes(expected, expected.limit())); 1195 errln("Failed"); 1196 } 1197 return res; 1198 } 1199 checkResultsToUnicode(ConversionCase cc, String expected, CharBuffer output)1200 private boolean checkResultsToUnicode(ConversionCase cc, String expected, CharBuffer output) { 1201 1202 boolean res = true; 1203 output.limit(output.position()); 1204 output.rewind(); 1205 1206 // test to see if the conversion matches actual results 1207 if (output.limit() != expected.length()) { 1208 errln("Test failed: output length does not match expected for charset: "+cc.charset+ " [" + cc.caseNr + "]"); 1209 res = false; 1210 } else { 1211 for (int i = 0; i < expected.length(); i++) { 1212 if (output.get(i) != expected.charAt(i)) { 1213 errln("Test failed: output does not match expected for charset: " + cc.charset 1214 + " [" + cc.caseNr + "]"); 1215 res = false; 1216 break; 1217 } 1218 } 1219 } 1220 1221 if (res) { 1222 logln("[" + cc.caseNr + "]:" + cc.charset); 1223 logln("Input: " + printbytes(cc.bytes, cc.bytes.limit())); 1224 logln("Output: " + printchars(output, output.limit())); 1225 logln("Expected: " + printchars(CharBuffer.wrap(expected), expected.length())); 1226 logln("Passed"); 1227 } else { 1228 errln("[" + cc.caseNr + "]:" + cc.charset); 1229 errln("Input: " + printbytes(cc.bytes, cc.bytes.limit())); 1230 errln("Output: " + printchars(output, output.limit())); 1231 errln("Expected: " + printchars(CharBuffer.wrap(expected), expected.length())); 1232 errln("Failed"); 1233 } 1234 return res; 1235 } 1236 toByteArray(String str)1237 private byte[] toByteArray(String str) { 1238 byte[] ret = new byte[str.length()]; 1239 for (int i = 0; i < ret.length; i++) { 1240 char ch = str.charAt(i); 1241 if (ch <= 0xFF) { 1242 ret[i] = (byte) ch; 1243 } else { 1244 throw new IllegalArgumentException(" byte value out of range: " + ch); 1245 } 1246 } 1247 return ret; 1248 } 1249 } 1250