1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2008-2011, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.charset; 10 11 import java.nio.ByteBuffer; 12 import java.nio.CharBuffer; 13 import java.nio.IntBuffer; 14 import java.nio.charset.CharsetDecoder; 15 import java.nio.charset.CharsetEncoder; 16 import java.nio.charset.CoderResult; 17 18 import com.ibm.icu.lang.UCharacter; 19 import com.ibm.icu.text.UTF16; 20 import com.ibm.icu.text.UnicodeSet; 21 22 /** 23 * @author krajwade 24 * 25 */ 26 class CharsetSCSU extends CharsetICU{ 27 /* SCSU definitions --------------------------------------------------------- */ 28 29 /* SCSU command byte values */ 30 //enum { 31 private static final short SQ0=0x01; /* Quote from window pair 0 */ 32 private static final short SQ7=0x08; /* Quote from window pair 7 */ 33 private static final short SDX=0x0B; /* Define a window as extended */ 34 //private static final short Srs=0x0C; /* reserved */ 35 private static final short SQU=0x0E; /* Quote a single Unicode character */ 36 private static final short SCU=0x0F; /* Change to Unicode mode */ 37 private static final short SC0=0x10; /* Select window 0 */ 38 private static final short SC7=0x17; /* Select window 7 */ 39 private static final short SD0=0x18; /* Define and select window 0 */ 40 //private static final short SD7=0x1F; /* Define and select window 7 */ 41 42 private static final short UC0=0xE0; /* Select window 0 */ 43 private static final short UC7=0xE7; /* Select window 7 */ 44 private static final short UD0=0xE8; /* Define and select window 0 */ 45 private static final short UD7=0xEF; /* Define and select window 7 */ 46 private static final short 
UQU=0xF0; /* Quote a single Unicode character */ 47 private static final short UDX=0xF1; /* Define a Window as extended */ 48 private static final short Urs=0xF2; /* reserved */ 49 // }; 50 51 // enum { 52 /* 53 * Unicode code points from 3400 to E000 are not adressible by 54 * dynamic window, since in these areas no short run alphabets are 55 * found. Therefore add gapOffset to all values from gapThreshold. 56 */ 57 private static final int gapThreshold=0x68; 58 private static final int gapOffset = 0xAC00 ; 59 /* values between reservedStart and fixedThreshold are reserved */ 60 private static final int reservedStart=0xA8; 61 /* use table of predefined fixed offsets for values from fixedThreshold */ 62 private static final int fixedThreshold=0xF9; 63 //}; 64 65 protected byte[] fromUSubstitution = new byte[]{(byte)0x0E,(byte)0xFF, (byte)0xFD}; 66 67 /* constant offsets for the 8 static windows */ 68 private static final int staticOffsets[]={ 69 0x0000, /* ASCII for quoted tags */ 70 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ 71 0x0100, /* Latin Extended-A */ 72 0x0300, /* Combining Diacritical Marks */ 73 0x2000, /* General Punctuation */ 74 0x2080, /* Currency Symbols */ 75 0x2100, /* Letterlike Symbols and Number Forms */ 76 0x3000 /* CJK Symbols and punctuation */ 77 }; 78 79 /* initial offsets for the 8 dynamic (sliding) windows */ 80 private static final int initialDynamicOffsets[]={ 81 0x0080, /* Latin-1 */ 82 0x00C0, /* Latin Extended A */ 83 0x0400, /* Cyrillic */ 84 0x0600, /* Arabic */ 85 0x0900, /* Devanagari */ 86 0x3040, /* Hiragana */ 87 0x30A0, /* Katakana */ 88 0xFF00 /* Fullwidth ASCII */ 89 }; 90 91 /* Table of fixed predefined Offsets */ 92 private static final int fixedOffsets[]={ 93 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ 94 /* 0xFA */ 0x0250, /* IPA extensions */ 95 /* 0xFB */ 0x0370, /* Greek */ 96 /* 0xFC */ 0x0530, /* Armenian */ 97 /* 0xFD */ 0x3040, /* Hiragana */ 98 /* 0xFE */ 0x30A0, /* 
Katakana */ 99 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ 100 }; 101 102 /* state values */ 103 //enum { 104 private static final int readCommand=0; 105 private static final int quotePairOne=1; 106 private static final int quotePairTwo=2; 107 private static final int quoteOne=3; 108 private static final int definePairOne=4; 109 private static final int definePairTwo=5; 110 private static final int defineOne=6; 111 // }; 112 113 private final static class SCSUData { 114 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ 115 int toUDynamicOffsets[] = new int[8] ; 116 int fromUDynamicOffsets[] = new int[8] ; 117 118 /* state machine state - toUnicode */ 119 boolean toUIsSingleByteMode; 120 short toUState; 121 byte toUQuoteWindow, toUDynamicWindow; 122 short toUByteOne; 123 124 /* state machine state - fromUnicode */ 125 boolean fromUIsSingleByteMode; 126 byte fromUDynamicWindow; 127 128 /* 129 * windowUse[] keeps track of the use of the dynamic windows: 130 * At nextWindowUseIndex there is the least recently used window, 131 * and the following windows (in a wrapping manner) are more and more 132 * recently used. 133 * At nextWindowUseIndex-1 there is the most recently used window. 134 */ 135 byte locale; 136 byte nextWindowUseIndex; 137 byte windowUse[] = new byte[8]; 138 SCSUData()139 SCSUData(){ 140 initialize(); 141 } 142 initialize()143 void initialize(){ 144 for(int i=0;i<8;i++){ 145 this.toUDynamicOffsets[i] = initialDynamicOffsets[i]; 146 } 147 this.toUIsSingleByteMode = true; 148 this.toUState = readCommand; 149 this.toUQuoteWindow = 0; 150 this.toUDynamicWindow = 0; 151 this.toUByteOne = 0; 152 this.fromUIsSingleByteMode = true; 153 this.fromUDynamicWindow = 0; 154 for(int i=0;i<8;i++){ 155 this.fromUDynamicOffsets[i] = initialDynamicOffsets[i]; 156 } 157 this.nextWindowUseIndex = 0; 158 switch(this.locale){ 159 /* Note being used right now because "SCSU,locale=ja" does not work in ICU4J. 
*/ 160 /* case l_ja: 161 for(int i=0;i<8;i++){ 162 this.windowUse[i] = initialWindowUse_ja[i]; 163 } 164 break; */ 165 default: 166 for(int i=0;i<8;i++){ 167 this.windowUse[i] = initialWindowUse[i]; 168 } 169 170 } 171 } 172 } 173 174 static final byte initialWindowUse[]={ 7, 0, 3, 2, 4, 5, 6, 1 }; 175 /* Note being used right now because "SCSU,locale=ja" does not work in ICU4J. */ 176 // static final byte initialWindowUse_ja[]={ 3, 2, 4, 1, 0, 7, 5, 6 }; 177 178 //enum { 179 //private static final int lGeneric = 0; 180 /* Note being used right now because "SCSU,locale=ja" does not work in ICU4J. */ 181 // private static final int l_ja = 1; 182 //}; 183 184 private SCSUData extraInfo = null; 185 CharsetSCSU(String icuCanonicalName, String javaCanonicalName, String[] aliases)186 public CharsetSCSU(String icuCanonicalName, String javaCanonicalName, String[] aliases){ 187 super(icuCanonicalName, javaCanonicalName, aliases); 188 maxBytesPerChar = 3; 189 minBytesPerChar = 1; 190 maxCharsPerByte = 1; 191 extraInfo = new SCSUData(); 192 } 193 194 class CharsetDecoderSCSU extends CharsetDecoderICU { 195 /* label values for supporting behavior similar to goto in C */ 196 private static final int FastSingle=0; 197 private static final int SingleByteMode=1; 198 private static final int EndLoop=2; 199 200 /* Mode Type */ 201 private static final int ByteMode = 0; 202 private static final int UnicodeMode =1; 203 CharsetDecoderSCSU(CharsetICU cs)204 public CharsetDecoderSCSU(CharsetICU cs) { 205 super(cs); 206 implReset(); 207 } 208 209 //private SCSUData data ; 210 @Override implReset()211 protected void implReset(){ 212 super.implReset(); 213 toULength = 0; 214 extraInfo.initialize(); 215 } 216 217 short b; 218 219 //Get the state machine state 220 private boolean isSingleByteMode ; 221 private short state ; 222 private byte quoteWindow ; 223 private byte dynamicWindow ; 224 private short byteOne; 225 226 227 //sourceIndex=-1 if the current character began in the previous 
buffer 228 private int sourceIndex ; 229 private int nextSourceIndex ; 230 231 CoderResult cr; 232 SCSUData data ; 233 private boolean LabelLoop;// used to break the while loop 234 235 @Override decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush)236 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, 237 boolean flush){ 238 data = extraInfo; 239 240 //Get the state machine state 241 isSingleByteMode = data.toUIsSingleByteMode; 242 state = data.toUState; 243 quoteWindow = data.toUQuoteWindow; 244 dynamicWindow = data.toUDynamicWindow; 245 byteOne = data.toUByteOne; 246 247 LabelLoop = true; 248 249 //sourceIndex=-1 if the current character began in the previous buffer 250 sourceIndex = data.toUState == readCommand ? 0: -1 ; 251 nextSourceIndex = 0; 252 253 cr = CoderResult.UNDERFLOW; 254 int labelType = 0; 255 while(LabelLoop){ 256 if(isSingleByteMode){ 257 switch(labelType){ 258 case FastSingle: 259 /*fast path for single-byte mode*/ 260 labelType = fastSingle(source, target, offsets, ByteMode); 261 break; 262 case SingleByteMode: 263 /* normal state machine for single-byte mode, minus handling for what fastSingleCovers */ 264 labelType = singleByteMode(source, target, offsets, ByteMode); 265 break; 266 case EndLoop: 267 endLoop(source, target, offsets); 268 break; 269 } 270 }else{ 271 switch(labelType){ 272 case FastSingle: 273 /*fast path for single-byte mode*/ 274 labelType = fastSingle(source, target, offsets, UnicodeMode); 275 break; 276 case SingleByteMode: 277 /* normal state machine for single-byte mode, minus handling for what fastSingleCovers */ 278 labelType = singleByteMode(source, target, offsets, UnicodeMode); 279 break; 280 case EndLoop: 281 endLoop(source, target, offsets); 282 break; 283 } 284 //LabelLoop = false; 285 } 286 } 287 return cr; 288 } 289 fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType)290 private int fastSingle(ByteBuffer source, 
CharBuffer target, IntBuffer offsets, int modeType){ 291 int label = 0; 292 if(modeType==ByteMode){ 293 294 if(state==readCommand){ 295 while(source.hasRemaining() && target.hasRemaining() && (b=(short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK)) >= 0x20){ 296 source.position(source.position()+1); 297 ++nextSourceIndex; 298 if(b <= 0x7f){ 299 /*Write US graphic character or DEL*/ 300 target.put((char)b); 301 if(offsets != null){ 302 offsets.put(sourceIndex); 303 } 304 }else{ 305 /*Write from dynamic window*/ 306 int c = data.toUDynamicOffsets[dynamicWindow] + (b&0x7f); 307 if(c <= 0xffff){ 308 target.put((char)c); 309 if(offsets != null){ 310 offsets.put(sourceIndex); 311 } 312 }else{ 313 /*Output surrogate pair */ 314 target.put((char)(0xd7c0 + (c>>10))); 315 if(target.hasRemaining()){ 316 target.put((char)(0xdc00 | (c&0x3ff))); 317 if(offsets != null){ 318 offsets.put(sourceIndex); 319 offsets.put(sourceIndex); 320 } 321 }else{ 322 /* target overflow */ 323 if(offsets != null){ 324 offsets.put(sourceIndex); 325 } 326 charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff)); 327 charErrorBufferLength = 1; 328 label = EndLoop; 329 cr = CoderResult.OVERFLOW; 330 return label; 331 } 332 } 333 } 334 sourceIndex = nextSourceIndex; 335 } 336 // label = SingleByteMode; 337 } 338 }else if(modeType==UnicodeMode){ 339 /* fast path for unicode mode */ 340 if(state == readCommand){ 341 while((source.position()+1)<source.limit() && target.hasRemaining() && (((b=source.get(source.position()))-UC0)&UConverterConstants.UNSIGNED_BYTE_MASK)>(Urs-UC0)){ 342 target.put((char)((b<<8)|(source.get(source.position()+1)&UConverterConstants.UNSIGNED_BYTE_MASK))); 343 if(offsets != null){ 344 offsets.put(sourceIndex); 345 } 346 sourceIndex = nextSourceIndex; 347 nextSourceIndex+=2; 348 source.position(source.position()+2); 349 } 350 } 351 } 352 label = SingleByteMode; 353 return label; 354 } 355 singleByteMode(ByteBuffer source, CharBuffer target, IntBuffer offsets, 
int modeType)356 private int singleByteMode(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType){ 357 int label = SingleByteMode; 358 if(modeType == ByteMode){ 359 while(source.hasRemaining()){ 360 if(!target.hasRemaining()){ 361 cr = CoderResult.OVERFLOW; 362 label = EndLoop; 363 return label; 364 } 365 b = (short)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK); 366 ++nextSourceIndex; 367 switch(state){ 368 case readCommand: 369 /*redundant conditions are commented out */ 370 if(((1L<<b)&0x2601)!=0){ 371 target.put((char)b); 372 if(offsets != null){ 373 offsets.put(sourceIndex); 374 } 375 sourceIndex = nextSourceIndex; 376 label = FastSingle; 377 return label; 378 }else if(SC0 <= b){ 379 if(b<=SC7){ 380 dynamicWindow = (byte)(b-SC0); 381 sourceIndex = nextSourceIndex; 382 label = FastSingle; 383 return label; 384 }else /* if(SD0<=b && b<=SQ7)*/{ 385 dynamicWindow = (byte)(b - SD0); 386 state = defineOne; 387 } 388 }else if(/* SQ0<=b &&*/b <= SQ7){ 389 quoteWindow = (byte)(b - SQ0); 390 state = quoteOne; 391 }else if(b==SDX){ 392 state = definePairOne; 393 }else if(b==SQU){ 394 state = quotePairOne; 395 }else if(b==SCU){ 396 sourceIndex = nextSourceIndex; 397 isSingleByteMode = false; 398 label = FastSingle; 399 return label; 400 }else{ 401 /*callback (illegal)*/ 402 cr = CoderResult.malformedForLength(1); 403 toUBytesArray[0] = (byte)b; 404 toULength =1; 405 label = EndLoop; 406 return label; 407 } 408 409 /* Store the first byte of a multibyte sequence in toUByte[] */ 410 toUBytesArray[0] = (byte)b; 411 toULength = 1; 412 break; 413 case quotePairOne: 414 byteOne = b; 415 toUBytesArray[1] = (byte)b; 416 toULength = 2; 417 state = quotePairTwo; 418 break; 419 case quotePairTwo: 420 target.put((char)((byteOne<< 8) | b)); 421 if(offsets != null){ 422 offsets.put(sourceIndex); 423 } 424 sourceIndex = nextSourceIndex; 425 state = readCommand; 426 label = FastSingle; 427 return label; 428 case quoteOne: 429 if(b<0x80){ 430 /* all static 
offsets are in the BMP */ 431 target.put((char)(staticOffsets[quoteWindow] + b)); 432 if(offsets != null){ 433 offsets.put(sourceIndex); 434 } 435 }else { 436 /*write from dynamic window */ 437 int c = data.toUDynamicOffsets[quoteWindow] + (b&0x7f); 438 if(c<=0xffff){ 439 target.put((char)c); 440 if(offsets != null){ 441 offsets.put(sourceIndex); 442 } 443 }else { 444 /* output surrogate pair */ 445 target.put((char)(0xd7c0+(c>>10))); 446 if(target.hasRemaining()){ 447 target.put((char)(0xdc00 | (c&0x3ff))); 448 if(offsets != null){ 449 offsets.put(sourceIndex); 450 offsets.put(sourceIndex); 451 } 452 }else { 453 /* target overflow */ 454 if(offsets != null){ 455 offsets.put(sourceIndex); 456 } 457 charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff)); 458 charErrorBufferLength = 1; 459 label = EndLoop; 460 cr = CoderResult.OVERFLOW; 461 LabelLoop = false; 462 return label; 463 } 464 } 465 } 466 sourceIndex = nextSourceIndex; 467 state = readCommand; 468 label = FastSingle; 469 return label; 470 case definePairOne: 471 dynamicWindow = (byte)((b>>5)&7); 472 byteOne = (byte)(b&0x1f); 473 toUBytesArray[1] = (byte)b; 474 toULength = 2; 475 state = definePairTwo; 476 break; 477 case definePairTwo: 478 data.toUDynamicOffsets[dynamicWindow] = 0x10000 + (byteOne<<15L | b<<7L); 479 sourceIndex = nextSourceIndex; 480 state = readCommand; 481 label = FastSingle; 482 return label; 483 case defineOne: 484 if(b==0){ 485 /*callback (illegal)*/ 486 toUBytesArray[1] = (byte)b; 487 toULength =2; 488 label = EndLoop; 489 return label; 490 }else if(b<gapThreshold){ 491 data.toUDynamicOffsets[dynamicWindow] = b<<7L; 492 }else if(((b - gapThreshold)&UConverterConstants.UNSIGNED_BYTE_MASK)<(reservedStart - gapThreshold)){ 493 data.toUDynamicOffsets[dynamicWindow] = (b<<7L) + gapOffset; 494 }else if(b>=fixedThreshold){ 495 data.toUDynamicOffsets[dynamicWindow] = fixedOffsets[b-fixedThreshold]; 496 }else{ 497 /*callback (illegal)*/ 498 toUBytesArray[1] = (byte)b; 499 toULength =2; 500 label 
= EndLoop; 501 return label; 502 } 503 sourceIndex = nextSourceIndex; 504 state = readCommand; 505 label = FastSingle; 506 return label; 507 } 508 } 509 510 }else if(modeType==UnicodeMode){ 511 while(source.hasRemaining()){ 512 if(!target.hasRemaining()){ 513 cr = CoderResult.OVERFLOW; 514 label = EndLoop; 515 return label; 516 } 517 b = (short)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK); 518 ++nextSourceIndex; 519 switch(state){ 520 case readCommand: 521 if((short)((b -UC0)&UConverterConstants.UNSIGNED_BYTE_MASK)>(Urs - UC0)){ 522 byteOne = b; 523 toUBytesArray[0] = (byte)b; 524 toULength = 1; 525 state = quotePairTwo; 526 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UC7){ 527 dynamicWindow = (byte)(b - UC0); 528 sourceIndex = nextSourceIndex; 529 isSingleByteMode = true; 530 label = FastSingle; 531 return label; 532 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UD7){ 533 dynamicWindow = (byte)(b - UD0); 534 isSingleByteMode = true; 535 toUBytesArray[0] = (byte)b; 536 toULength = 1; 537 state = defineOne; 538 label = SingleByteMode; 539 return label; 540 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UDX){ 541 isSingleByteMode = true; 542 toUBytesArray[0] = (byte)b; 543 toULength = 1; 544 state = definePairOne; 545 label = SingleByteMode; 546 return label; 547 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UQU){ 548 toUBytesArray[0] = (byte)b; 549 toULength = 1; 550 state = quotePairOne; 551 }else { 552 /* callback (illegal)*/ 553 cr = CoderResult.malformedForLength(1); 554 toUBytesArray[0] = (byte)b; 555 toULength = 1; 556 label = EndLoop; 557 return label; 558 } 559 break; 560 case quotePairOne: 561 byteOne = b; 562 toUBytesArray[1] = (byte)b; 563 toULength = 2; 564 state = quotePairTwo; 565 break; 566 case quotePairTwo: 567 target.put((char)((byteOne<<8) | b)); 568 if(offsets != null){ 569 offsets.put(sourceIndex); 570 } 571 sourceIndex = nextSourceIndex; 572 state = readCommand; 573 label = FastSingle; 574 return 
label; 575 } 576 } 577 } 578 label = EndLoop; 579 return label; 580 } 581 endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets)582 private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){ 583 if(cr==CoderResult.OVERFLOW){ 584 state = readCommand; 585 }else if(state == readCommand){ 586 toULength = 0; 587 } 588 data.toUIsSingleByteMode = isSingleByteMode; 589 data.toUState = state; 590 data.toUQuoteWindow = quoteWindow; 591 data.toUDynamicWindow = dynamicWindow; 592 data.toUByteOne = byteOne; 593 LabelLoop = false; 594 } 595 } 596 597 class CharsetEncoderSCSU extends CharsetEncoderICU{ CharsetEncoderSCSU(CharsetICU cs)598 public CharsetEncoderSCSU(CharsetICU cs) { 599 super(cs, fromUSubstitution); 600 implReset(); 601 } 602 603 //private SCSUData data; 604 @Override implReset()605 protected void implReset() { 606 super.implReset(); 607 extraInfo.initialize(); 608 } 609 610 /* label values for supporting behavior similar to goto in C */ 611 private static final int Loop=0; 612 private static final int GetTrailUnicode=1; 613 private static final int OutputBytes=2; 614 private static final int EndLoop =3; 615 616 private int delta; 617 private int length; 618 619 ///variables of compression heuristics 620 private int offset; 621 private char lead, trail; 622 private int code; 623 private byte window; 624 625 //Get the state machine state 626 private boolean isSingleByteMode; 627 private byte dynamicWindow ; 628 private int currentOffset; 629 int c; 630 631 SCSUData data ; 632 633 //sourceIndex=-1 if the current character began in the previous buffer 634 private int sourceIndex ; 635 private int nextSourceIndex; 636 private int targetCapacity; 637 638 private boolean LabelLoop;//used to break the while loop 639 private boolean AfterGetTrail;// its value is set to true in order to ignore the code before getTrailSingle: 640 private boolean AfterGetTrailUnicode;// is value is set to true in order to ignore the code before getTrailUnicode: 
641 642 CoderResult cr; 643 644 @Override encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush)645 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 646 data = extraInfo; 647 cr = CoderResult.UNDERFLOW; 648 649 //Get the state machine state 650 isSingleByteMode = data.fromUIsSingleByteMode; 651 dynamicWindow = data.fromUDynamicWindow; 652 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 653 c = fromUChar32; 654 655 sourceIndex = c== 0 ? 0: -1 ; 656 nextSourceIndex = 0; 657 658 659 targetCapacity = target.limit()-target.position(); 660 661 //sourceIndex=-1 if the current character began in the previous buffer 662 sourceIndex = c== 0 ? 0: -1 ; 663 nextSourceIndex = 0; 664 665 int labelType = Loop; // set to Loop so that the code starts from loop: 666 LabelLoop = true; 667 AfterGetTrail = false; 668 AfterGetTrailUnicode = false; 669 670 while(LabelLoop){ 671 switch(labelType){ 672 case Loop: 673 labelType = loop(source, target, offsets); 674 break; 675 case GetTrailUnicode: 676 labelType = getTrailUnicode(source, target, offsets); 677 break; 678 case OutputBytes: 679 labelType = outputBytes(source, target, offsets); 680 break; 681 case EndLoop: 682 endLoop(source, target, offsets); 683 break; 684 } 685 } 686 return cr; 687 } 688 getWindow(int[] offsets)689 private byte getWindow(int[] offsets){ 690 int i; 691 for (i=0;i<8;i++){ 692 if(((c-offsets[i]) & UConverterConstants.UNSIGNED_INT_MASK) <= 0x7f){ 693 return (byte)i; 694 } 695 } 696 return -1; 697 } 698 isInOffsetWindowOrDirect(int offsetValue, int a)699 private boolean isInOffsetWindowOrDirect(int offsetValue, int a){ 700 return (a & UConverterConstants.UNSIGNED_INT_MASK)<=(offsetValue & UConverterConstants.UNSIGNED_INT_MASK)+0x7f & 701 ((a & UConverterConstants.UNSIGNED_INT_MASK)>=(offsetValue & UConverterConstants.UNSIGNED_INT_MASK) || 702 ((a & UConverterConstants.UNSIGNED_INT_MASK)<=0x7f && ((a & 
UConverterConstants.UNSIGNED_INT_MASK)>=0x20 703 || ((1L<<(a & UConverterConstants.UNSIGNED_INT_MASK))&0x2601)!=0))); 704 } 705 getNextDynamicWindow()706 private byte getNextDynamicWindow(){ 707 byte windowValue = data.windowUse[data.nextWindowUseIndex]; 708 if(++data.nextWindowUseIndex==8){ 709 data.nextWindowUseIndex=0; 710 } 711 return windowValue; 712 } 713 useDynamicWindow(byte windowValue)714 private void useDynamicWindow(byte windowValue){ 715 /*first find the index of the window*/ 716 int i,j; 717 i = data.nextWindowUseIndex; 718 do{ 719 if(--i<0){ 720 i=7; 721 } 722 }while(data.windowUse[i]!=windowValue); 723 724 /*now copy each window[i+1] to [i]*/ 725 j= i+1; 726 if(j==8){ 727 j=0; 728 } 729 while(j!=data.nextWindowUseIndex){ 730 data.windowUse[i] = data.windowUse[j]; 731 i=j; 732 if(++j==8){ 733 j=0; 734 } 735 } 736 737 /*finally, set the window into the most recently used index*/ 738 data.windowUse[i]= windowValue; 739 } 740 741 getDynamicOffset()742 private int getDynamicOffset(){ 743 int i; 744 for(i=0;i<7;++i){ 745 if(((c-fixedOffsets[i])&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){ 746 offset = fixedOffsets[i]; 747 return 0xf9+i; 748 } 749 } 750 if((c&UConverterConstants.UNSIGNED_INT_MASK)<0x80){ 751 /*No dynamic window for US-ASCII*/ 752 return -1; 753 }else if((c&UConverterConstants.UNSIGNED_INT_MASK)<0x3400 || ((c-0x10000)&UConverterConstants.UNSIGNED_INT_MASK)<(0x14000-0x10000) || 754 ((c-0x1d000)&UConverterConstants.UNSIGNED_INT_MASK)<=(0x1ffff-0x1d000)){ 755 /*This character is in the code range for a "small", i.e, reasonably windowable, script*/ 756 offset = c&0x7fffff80; 757 return (c>>7); 758 }else if(0xe000<=(c&UConverterConstants.UNSIGNED_INT_MASK) && (c&UConverterConstants.UNSIGNED_INT_MASK)!=0xfeff && (c&UConverterConstants.UNSIGNED_INT_MASK) < 0xfff0){ 759 /*for these characters we need to take the gapOffset into account*/ 760 offset=(c)&0x7fffff80; 761 return ((c-gapOffset)>>7); 762 }else{ 763 return -1; 764 } 765 } 766 
loop(CharBuffer source, ByteBuffer target, IntBuffer offsets)767 private int loop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 768 int label = 0; 769 if(isSingleByteMode){ 770 if(c!=0 && targetCapacity>0 && !AfterGetTrail){ 771 label = getTrail(source, target, offsets); 772 return label; 773 } 774 /*state machine for single byte mode*/ 775 while(AfterGetTrail || source.hasRemaining()){ 776 if(targetCapacity<=0 && !AfterGetTrail){ 777 /*target is full*/ 778 cr = CoderResult.OVERFLOW; 779 label = EndLoop; 780 return label; 781 } 782 if(!AfterGetTrail){ 783 c = source.get(); 784 ++nextSourceIndex; 785 786 } 787 if(((c -0x20)&UConverterConstants.UNSIGNED_INT_MASK)<=0x5f && !AfterGetTrail){ 788 /*pass US-ASCII graphic character through*/ 789 target.put((byte)c); 790 if(offsets!=null){ 791 offsets.put(sourceIndex); 792 } 793 --targetCapacity; 794 }else if((c & UConverterConstants.UNSIGNED_INT_MASK)<0x20 && !AfterGetTrail){ 795 if(((1L<<(c & UConverterConstants.UNSIGNED_INT_MASK))&0x2601)!=0){ 796 /*CR/LF/TAB/NUL*/ 797 target.put((byte)c); 798 if(offsets!=null){ 799 offsets.put(sourceIndex); 800 } 801 --targetCapacity; 802 } else { 803 /*quote c0 control character*/ 804 c|=SQ0<<8; 805 length = 2; 806 label = OutputBytes; 807 return label; 808 } 809 } else if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f && !AfterGetTrail){ 810 /*use the current dynamic window*/ 811 target.put((byte)(delta|0x80)); 812 if(offsets!=null){ 813 offsets.put(sourceIndex); 814 } 815 --targetCapacity; 816 } else if(AfterGetTrail || UTF16.isSurrogate((char)c)){ 817 if(!AfterGetTrail){ 818 if(UTF16.isLeadSurrogate((char)c)){ 819 label = getTrail(source, target, offsets); 820 if(label==EndLoop){ 821 return label; 822 } 823 } else { 824 /*this is unmatched lead code unit (2nd Surrogate)*/ 825 /*callback(illegal)*/ 826 cr = CoderResult.malformedForLength(1); 827 label = EndLoop; 828 return label; 829 } 830 } 831 832 833 if(AfterGetTrail){ 834 AfterGetTrail = 
false; 835 } 836 837 /*Compress supplementary character U+10000...U+10ffff */ 838 if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){ 839 /*use the current dynamic window*/ 840 target.put((byte)(delta|0x80)); 841 if(offsets!=null){ 842 offsets.put(sourceIndex); 843 } 844 --targetCapacity; 845 } else if((window=getWindow(data.fromUDynamicOffsets))>=0){ 846 /*there is a dynamic window that contains this character, change to it*/ 847 dynamicWindow = window; 848 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 849 useDynamicWindow(dynamicWindow); 850 c = ((SC0+dynamicWindow)<<8 | (c-currentOffset)|0x80); 851 length = 2; 852 label = OutputBytes; 853 return label; 854 } else if((code=getDynamicOffset())>=0){ 855 /*might check if there are come character in this window to come */ 856 /*define an extended window with this character*/ 857 code-=0x200; 858 dynamicWindow=getNextDynamicWindow(); 859 currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset; 860 useDynamicWindow(dynamicWindow); 861 c = ((SDX<<24) | (dynamicWindow<<21)| 862 (code<<8)| (c- currentOffset) |0x80); 863 // c = (((SDX)<<25) | (dynamicWindow<<21)| 864 // (code<<8)| (c- currentOffset) |0x80 ); 865 length = 4; 866 label = OutputBytes; 867 return label; 868 } else { 869 /*change to unicode mode and output this (lead, trail) pair*/ 870 isSingleByteMode = false; 871 target.put((byte)SCU); 872 if(offsets!=null){ 873 offsets.put(sourceIndex); 874 } 875 --targetCapacity; 876 c = (lead<<16)|trail; 877 length = 4; 878 label = OutputBytes; 879 return label; 880 } 881 } else if((c&UConverterConstants.UNSIGNED_INT_MASK)<0xa0){ 882 /*quote C1 control character*/ 883 c = (c&0x7f) | (SQ0+1)<<8; /*SQ0+1 == SQ1*/ 884 length = 2; 885 label = OutputBytes; 886 return label; 887 } else if((c&UConverterConstants.UNSIGNED_INT_MASK)==0xfeff || (c&UConverterConstants.UNSIGNED_INT_MASK)>= 0xfff0){ 888 /*quote signature character = byte order mark and specials*/ 889 c |= SQU<<16; 890 length = 
3; 891 label = OutputBytes; 892 return label; 893 } else { 894 /*compress all other BMP characters*/ 895 if((window=getWindow(data.fromUDynamicOffsets))>=0){ 896 /*there is a window defined that contains this character - switch to it or quote from it*/ 897 if(source.position()>=source.limit() || isInOffsetWindowOrDirect(data.fromUDynamicOffsets[window], source.get(source.position()))){ 898 /*change to dynamic window*/ 899 dynamicWindow = window; 900 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 901 useDynamicWindow(dynamicWindow); 902 c = ((SC0+window)<<8) | (c- currentOffset) | 0x80; 903 length = 2; 904 label = OutputBytes; 905 return label; 906 } else { 907 /*quote from dynamic window*/ 908 c = ((SQ0+window)<<8) | (c - data.fromUDynamicOffsets[window]) | 909 0x80; 910 length = 2; 911 label = OutputBytes; 912 return label; 913 } 914 } else if((window = getWindow(staticOffsets))>=0){ 915 /*quote from static window*/ 916 c = ((SQ0+window)<<8) | (c - staticOffsets[window]); 917 length = 2; 918 label = OutputBytes; 919 return label; 920 }else if((code=getDynamicOffset())>=0){ 921 /*define a dynamic window with this character*/ 922 dynamicWindow = getNextDynamicWindow(); 923 currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset; 924 useDynamicWindow(dynamicWindow); 925 c = ((SD0+dynamicWindow)<<16) | (code<<8)| 926 (c - currentOffset) | 0x80; 927 length = 3; 928 label = OutputBytes; 929 return label; 930 } else if(((int)((c-0x3400)&UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400) && (source.position()>=source.limit() || 931 ((int)((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_INT_MASK))< (0xd800 - 0x3400))){ 932 933 /* 934 * this character is not compressible (a BMP ideograph of similar) 935 * switch to Unicode mode if this is the last character in the block 936 * or there is at least one more ideograph following immediately 937 */ 938 isSingleByteMode = false; 939 c|=SCU<<16; 940 length =3; 941 label = OutputBytes; 942 
return label; 943 } else { 944 /*quote Unicode*/ 945 c|=SQU<<16; 946 length = 3; 947 label = OutputBytes; 948 return label; 949 } 950 } 951 /*normal end of conversion : prepare for new character */ 952 c = 0; 953 sourceIndex = nextSourceIndex; 954 } 955 } else { 956 if(c!=0 && targetCapacity>0 && !AfterGetTrailUnicode){ 957 label = GetTrailUnicode; 958 return label; 959 } 960 961 /*state machine for Unicode*/ 962 /*unicodeByteMode*/ 963 while(AfterGetTrailUnicode || source.hasRemaining()){ 964 if(targetCapacity<=0 && !AfterGetTrailUnicode){ 965 /*target is full*/ 966 cr = CoderResult.OVERFLOW; 967 LabelLoop = false; 968 break; 969 } 970 if(!AfterGetTrailUnicode){ 971 c = source.get(); 972 ++nextSourceIndex; 973 } 974 975 if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400) && !AfterGetTrailUnicode){ 976 /*not compressible, write character directly */ 977 if(targetCapacity>=2){ 978 target.put((byte)(c>>8)); 979 target.put((byte)c); 980 if(offsets!=null){ 981 offsets.put(sourceIndex); 982 offsets.put(sourceIndex); 983 } 984 targetCapacity-=2; 985 } else { 986 length =2; 987 label = OutputBytes; 988 return label; 989 } 990 } else if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300*/&& !AfterGetTrailUnicode){ 991 /*compress BMP character if the following one is not an uncompressible ideograph*/ 992 if(!(source.hasRemaining() && (((source.get(source.position())-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400))){ 993 if(((((c-0x30)&UConverterConstants.UNSIGNED_INT_MASK))<10 || (((c-0x61)&UConverterConstants.UNSIGNED_INT_MASK))<26 994 || (((c-0x41)&UConverterConstants.UNSIGNED_INT_MASK))<26)){ 995 /*ASCII digit or letter*/ 996 isSingleByteMode = true; 997 c |=((UC0+dynamicWindow)<<8)|c; 998 length = 2; 999 label = OutputBytes; 1000 return label; 1001 } else if((window=getWindow(data.fromUDynamicOffsets))>=0){ 1002 /*there is a dynamic window that contains this character, change to it*/ 1003 
isSingleByteMode = true; 1004 dynamicWindow = window; 1005 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 1006 useDynamicWindow(dynamicWindow); 1007 c = ((UC0+dynamicWindow)<<8) | (c- currentOffset) | 0x80; 1008 length = 2; 1009 label = OutputBytes; 1010 return label; 1011 } else if((code=getDynamicOffset())>=0){ 1012 /*define a dynamic window with this character*/ 1013 isSingleByteMode = true; 1014 dynamicWindow = getNextDynamicWindow(); 1015 currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset; 1016 useDynamicWindow(dynamicWindow); 1017 c = ((UD0+dynamicWindow)<<16) | (code<<8) 1018 |(c - currentOffset) | 0x80; 1019 length = 3; 1020 label = OutputBytes; 1021 return label; 1022 } 1023 } 1024 1025 /*don't know how to compress these character, just write it directly*/ 1026 length = 2; 1027 label = OutputBytes; 1028 return label; 1029 } else if(c<0xe000 && !AfterGetTrailUnicode){ 1030 label = GetTrailUnicode; 1031 return label; 1032 } else if (!AfterGetTrailUnicode){ 1033 /*quote to avoid SCSU tags*/ 1034 c|=UQU<<16; 1035 length = 3; 1036 label = OutputBytes; 1037 return label; 1038 } 1039 1040 if(AfterGetTrailUnicode){ 1041 AfterGetTrailUnicode = false; 1042 } 1043 /*normal end of conversion, prepare for a new character*/ 1044 c = 0; 1045 sourceIndex = nextSourceIndex; 1046 } 1047 } 1048 label = EndLoop; 1049 return label; 1050 } 1051 getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets)1052 /** Tries to pair the code unit held in c with the next code unit of source. Saves c (cast to char) in lead. If source has input left, the next unit is peeked into trail without consuming it; when it is a trail surrogate the source position is advanced by one, nextSourceIndex is incremented, c becomes the combined code point and Loop is returned. When it is not a trail surrogate, cr is set to malformedForLength(1) and EndLoop is returned. With no input left EndLoop is returned and c is left untouched. AfterGetTrail is set to true on every path. The target and offsets parameters are not used here. @return the next state label, Loop or EndLoop */ private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 1053 lead = (char)c; 1054 int label = Loop; 1055 if(source.hasRemaining()){ 1056 /*test the following code unit*/ 1057 trail = source.get(source.position()); 1058 if(UTF16.isTrailSurrogate(trail)){ 1059 source.position(source.position()+1); 1060 ++nextSourceIndex; 1061 c = UCharacter.getCodePoint((char)c, trail); 1062 label = Loop; 1063 } else { 1064 /*this is unmatched lead code unit (1st Surrogate)*/ 1065 /*callback(illegal)*/ 1066 cr =
CoderResult.malformedForLength(1); 1067 label = EndLoop; 1068 } 1069 }else { 1070 /*no more input*/ 1071 label = EndLoop; 1072 } 1073 AfterGetTrail = true; 1074 return label; 1075 } 1076 getTrailUnicode(CharBuffer source, ByteBuffer target, IntBuffer offsets)1077 /** Surrogate handling for the Unicode-mode branch. Sets AfterGetTrailUnicode to true first. If c is a lead surrogate it is saved in lead and the next source unit is peeked into trail; a trail surrogate is consumed with source.get(), nextSourceIndex is incremented and c becomes the combined code point. An unpaired lead or a c that is not a lead surrogate sets cr to malformedForLength(1) and returns EndLoop; running out of input returns EndLoop without setting cr. For a completed supplementary code point one of three encodings is prepared in c and length, and OutputBytes is returned: (1) getWindow finds a dynamic window and the next source unit is not in the 0x3400 to 0xD7FF range: switch to single-byte mode with a 2-byte sequence starting at UC0+window; (2) the next source unit equals lead and getDynamicOffset succeeds: define an extended window with a 4-byte UDX sequence; (3) otherwise the lead and trail units are written directly as 4 bytes. The target and offsets parameters are not used here. @return EndLoop or OutputBytes */ private int getTrailUnicode(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 1078 int label = EndLoop; 1079 AfterGetTrailUnicode = true; 1080 /*c is surrogate*/ 1081 if(UTF16.isLeadSurrogate((char)c)){ 1082 // getTrailUnicode: 1083 lead = (char)c; 1084 if(source.hasRemaining()){ 1085 /*test the following code unit*/ 1086 trail = source.get(source.position()); 1087 if(UTF16.isTrailSurrogate(trail)){ 1088 source.get(); 1089 ++nextSourceIndex; 1090 c = UCharacter.getCodePoint((char)c, trail); 1091 /*convert this surrogate code point*/ 1092 /*exit this condition tree*/ 1093 } else { 1094 /*this is unmatched lead code unit(1st surrogate)*/ 1095 /*callback(illegal)*/ 1096 cr = CoderResult.malformedForLength(1); 1097 label = EndLoop; 1098 return label; 1099 } 1100 } else { 1101 /*no more input*/ 1102 label = EndLoop; 1103 return label; 1104 } 1105 } else { 1106 /*this is an unmatched trail code point (2nd surrogate)*/ 1107 /*callback (illegal)*/ 1108 cr = CoderResult.malformedForLength(1); 1109 label = EndLoop; 1110 return label; 1111 } 1112 1113 /*compress supplementary character*/ 1114 if((window=getWindow(data.fromUDynamicOffsets))>=0 && 1115 !(source.hasRemaining() && ((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_INT_MASK) < 1116 (0xd800 - 0x3400))){ 1117 /* 1118 * this is the dynamic window that contains this character and the following 1119 * character is not uncompressible, 1120 * change to the window 1121 */ 1122 isSingleByteMode = true; 1123 dynamicWindow = window; 1124 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 1125 useDynamicWindow(dynamicWindow); 1126 c = ((UC0+dynamicWindow)<<8 | (c-currentOffset) | 0x80); 1127 length = 2; 1128 label = OutputBytes; 1129
return label; 1130 } else if(source.hasRemaining() && lead == source.get(source.position()) && (code=getDynamicOffset())>=0){ 1131 /*two supplementary characters in (probably) the same window - define an extended one*/ 1132 isSingleByteMode = true; 1133 dynamicWindow = getNextDynamicWindow(); 1134 currentOffset = data.fromUDynamicOffsets[dynamicWindow] = offset; 1135 useDynamicWindow(dynamicWindow); 1136 c = (UDX<<24) | (dynamicWindow<<21) |(code<<8) |(c - currentOffset) | 0x80; 1137 length = 4; 1138 label = OutputBytes; 1139 return label; 1140 } else { 1141 /*don't know how to compress this character, just write it directly*/ 1142 c = (lead<<16)|trail; 1143 length = 4; 1144 label = OutputBytes; 1145 return label; 1146 } 1147 1148 } 1149 endLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets)1150 /** Leaves the conversion loop: copies isSingleByteMode and dynamicWindow into data.fromUIsSingleByteMode and data.fromUDynamicWindow, stores the pending value of c in fromUChar32, and sets LabelLoop to false. None of the three parameters is used. */ private void endLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 1151 /*set the converter state back to UConverter*/ 1152 data.fromUIsSingleByteMode = isSingleByteMode; 1153 data.fromUDynamicWindow = dynamicWindow; 1154 fromUChar32 = c; 1155 LabelLoop = false; 1156 } 1157 1158 /** Emits the low-order length bytes of c, most significant of those bytes first. When length fits in targetCapacity, every byte goes to target (each with one offsets.put(sourceIndex) when offsets is non-null), targetCapacity is reduced by length, c is reset to 0, sourceIndex is set to nextSourceIndex and Loop is returned. Otherwise the trailing (length - targetCapacity) bytes are stored in errorBuffer and counted in errorBufferLength, the leading targetCapacity bytes are written to target, targetCapacity becomes 0, cr is set to OVERFLOW, c is reset to 0 and EndLoop is returned. The switch statements rely on deliberate case fall-through. The source parameter is not used here. @return Loop after a complete write, EndLoop after an overflow */ @SuppressWarnings("fallthrough") outputBytes(CharBuffer source, ByteBuffer target, IntBuffer offsets)1159 private int outputBytes(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 1160 int label; 1161 //int targetCapacity = target.limit()-target.position(); 1162 /*write the output character byte from c and length*/ 1163 /*from the first if in the loop we know that targetCapacity>0*/ 1164 if(length<=targetCapacity){ 1165 switch(length){ 1166 /*each branch falls through the next one*/ 1167 case 4: 1168 target.put((byte)(c>>24)); 1169 if(offsets!=null){ 1170 offsets.put(sourceIndex); 1171 } 1172 case 3: 1173 target.put((byte)(c>>16)); 1174 if(offsets!=null){ 1175 offsets.put(sourceIndex); 1176 } 1177 case 2: 1178 target.put((byte)(c>>8)); 1179 if(offsets!=null){ 1180 offsets.put(sourceIndex); 1181 } 1182 case 1: 1183 target.put((byte)c); 1184 if(offsets!=null){ 1185
offsets.put(sourceIndex); 1186 } 1187 default: 1188 /*will never occur*/ 1189 break; 1190 } 1191 targetCapacity-=length; 1192 1193 /*normal end of conversion: prepare for a new character*/ 1194 c = 0; 1195 sourceIndex = nextSourceIndex; 1196 label = Loop; 1197 return label; 1198 } else { 1199 /* p writes into errorBuffer starting at index 0 */ ByteBuffer p = ByteBuffer.wrap(errorBuffer); 1200 /* 1201 * We actually do this backwards here: 1202 * In order to save an intermediate variable, we output 1203 * first to the overflow buffer what does not fit into the 1204 * regular target 1205 */ 1206 /* we know that 0<=targetCapacity<length<=4 */ 1207 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1208 /* from here on length counts only the bytes that do not fit into target */ length -= targetCapacity; 1209 switch(length){ 1210 /*each branch falls through the next one*/ 1211 case 4: 1212 p.put((byte)(c>>24)); 1213 case 3: 1214 p.put((byte)(c>>16)); 1215 case 2: 1216 p.put((byte)(c>>8)); 1217 case 1: 1218 p.put((byte)c); 1219 default: 1220 /*will never occur*/ 1221 break; 1222 } 1223 errorBufferLength = length; 1224 1225 /*now output what fits into the regular target*/ 1226 c>>=8*length; //length was reduced by targetCapacity 1227 switch(targetCapacity){ 1228 /*each branch falls through the next one*/ 1229 case 3: 1230 target.put((byte)(c>>16)); 1231 if(offsets!=null){ 1232 offsets.put(sourceIndex); 1233 } 1234 case 2: 1235 target.put((byte)(c>>8)); 1236 if(offsets!=null){ 1237 offsets.put(sourceIndex); 1238 } 1239 case 1: 1240 target.put((byte)c); 1241 if(offsets!=null){ 1242 offsets.put(sourceIndex); 1243 } 1244 default: 1245 break; 1246 } 1247 1248 /*target overflow*/ 1249 targetCapacity = 0; 1250 cr = CoderResult.OVERFLOW; 1251 c = 0; 1252 label = EndLoop; 1253 return label; 1254 } 1255 } 1256 1257 } 1258 1259 /** Creates a new decoder for this charset. @return a new CharsetDecoderSCSU constructed with this charset instance */ @Override newDecoder()1260 public CharsetDecoder newDecoder() { 1261 return new CharsetDecoderSCSU(this); 1262 } 1263 1264 /** Creates a new encoder for this charset. @return a new CharsetEncoderSCSU constructed with this charset instance */ @Override newEncoder()1265 public CharsetEncoder newEncoder() { 1266 return new CharsetEncoderSCSU(this); 1267 } 1268 1269
@Override getUnicodeSetImpl( UnicodeSet setFillIn, int which)1270 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ 1271 CharsetICU.getCompleteUnicodeSet(setFillIn); 1272 } 1273 1274 } 1275