1 /* 2 ******************************************************************************* 3 * Copyright (C) 2008-2011, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.charset; 8 9 import java.nio.ByteBuffer; 10 import java.nio.CharBuffer; 11 import java.nio.IntBuffer; 12 import java.nio.charset.CharsetDecoder; 13 import java.nio.charset.CharsetEncoder; 14 import java.nio.charset.CoderResult; 15 16 import com.ibm.icu.lang.UCharacter; 17 import com.ibm.icu.text.UTF16; 18 import com.ibm.icu.text.UnicodeSet; 19 20 /** 21 * @author krajwade 22 * 23 */ 24 class CharsetSCSU extends CharsetICU{ 25 /* SCSU definitions --------------------------------------------------------- */ 26 27 /* SCSU command byte values */ 28 //enum { 29 private static final short SQ0=0x01; /* Quote from window pair 0 */ 30 private static final short SQ7=0x08; /* Quote from window pair 7 */ 31 private static final short SDX=0x0B; /* Define a window as extended */ 32 //private static final short Srs=0x0C; /* reserved */ 33 private static final short SQU=0x0E; /* Quote a single Unicode character */ 34 private static final short SCU=0x0F; /* Change to Unicode mode */ 35 private static final short SC0=0x10; /* Select window 0 */ 36 private static final short SC7=0x17; /* Select window 7 */ 37 private static final short SD0=0x18; /* Define and select window 0 */ 38 //private static final short SD7=0x1F; /* Define and select window 7 */ 39 40 private static final short UC0=0xE0; /* Select window 0 */ 41 private static final short UC7=0xE7; /* Select window 7 */ 42 private static final short UD0=0xE8; /* Define and select window 0 */ 43 private static final short UD7=0xEF; /* Define and select window 7 */ 44 private static final short UQU=0xF0; /* Quote a single Unicode character */ 45 private static final short UDX=0xF1; /* Define a Window as extended */ 46 private static final short Urs=0xF2; /* reserved */ 47 // }; 48 49 // enum { 50 /* 51 * Unicode code points from 3400 to E000 are not adressible by 52 * dynamic window, since in these areas no short run alphabets are 53 * found. Therefore add gapOffset to all values from gapThreshold. 54 */ 55 private static final int gapThreshold=0x68; 56 private static final int gapOffset = 0xAC00 ; 57 /* values between reservedStart and fixedThreshold are reserved */ 58 private static final int reservedStart=0xA8; 59 /* use table of predefined fixed offsets for values from fixedThreshold */ 60 private static final int fixedThreshold=0xF9; 61 //}; 62 63 protected byte[] fromUSubstitution = new byte[]{(byte)0x0E,(byte)0xFF, (byte)0xFD}; 64 65 /* constant offsets for the 8 static windows */ 66 private static final int staticOffsets[]={ 67 0x0000, /* ASCII for quoted tags */ 68 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ 69 0x0100, /* Latin Extended-A */ 70 0x0300, /* Combining Diacritical Marks */ 71 0x2000, /* General Punctuation */ 72 0x2080, /* Currency Symbols */ 73 0x2100, /* Letterlike Symbols and Number Forms */ 74 0x3000 /* CJK Symbols and punctuation */ 75 }; 76 77 /* initial offsets for the 8 dynamic (sliding) windows */ 78 private static final int initialDynamicOffsets[]={ 79 0x0080, /* Latin-1 */ 80 0x00C0, /* Latin Extended A */ 81 0x0400, /* Cyrillic */ 82 0x0600, /* Arabic */ 83 0x0900, /* Devanagari */ 84 0x3040, /* Hiragana */ 85 0x30A0, /* Katakana */ 86 0xFF00 /* Fullwidth ASCII */ 87 }; 88 89 /* Table of fixed predefined Offsets */ 90 private static final int fixedOffsets[]={ 91 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ 92 /* 0xFA */ 0x0250, /* IPA extensions */ 93 /* 0xFB */ 0x0370, /* Greek */ 94 /* 0xFC */ 0x0530, /* Armenian */ 95 /* 0xFD */ 0x3040, /* Hiragana */ 96 /* 0xFE */ 0x30A0, /* Katakana */ 97 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ 98 }; 99 100 /* state values */ 101 //enum { 102 private static final int readCommand=0; 103 private static final int quotePairOne=1; 104 private static final int quotePairTwo=2; 105 private static final int quoteOne=3; 106 private static final int definePairOne=4; 107 private static final int definePairTwo=5; 108 private static final int defineOne=6; 109 // }; 110 111 @SuppressWarnings("unused") 112 private final static class SCSUData{ 113 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ 114 int toUDynamicOffsets[] = new int[8] ; 115 int fromUDynamicOffsets[] = new int[8] ; 116 117 /* state machine state - toUnicode */ 118 boolean toUIsSingleByteMode; 119 short toUState; 120 byte toUQuoteWindow, toUDynamicWindow; 121 short toUByteOne; 122 short toUPadding[]; 123 124 /* state machine state - fromUnicode */ 125 boolean fromUIsSingleByteMode; 126 byte fromUDynamicWindow; 127 128 /* 129 * windowUse[] keeps track of the use of the dynamic windows: 130 * At nextWindowUseIndex there is the least recently used window, 131 * and the following windows (in a wrapping manner) are more and more 132 * recently used. 133 * At nextWindowUseIndex-1 there is the most recently used window. 134 */ 135 byte locale; 136 byte nextWindowUseIndex; 137 byte windowUse[] = new byte[8]; 138 SCSUData()139 SCSUData(){ 140 initialize(); 141 } 142 initialize()143 void initialize(){ 144 for(int i=0;i<8;i++){ 145 this.toUDynamicOffsets[i] = initialDynamicOffsets[i]; 146 } 147 this.toUIsSingleByteMode = true; 148 this.toUState = readCommand; 149 this.toUQuoteWindow = 0; 150 this.toUDynamicWindow = 0; 151 this.toUByteOne = 0; 152 this.fromUIsSingleByteMode = true; 153 this.fromUDynamicWindow = 0; 154 for(int i=0;i<8;i++){ 155 this.fromUDynamicOffsets[i] = initialDynamicOffsets[i]; 156 } 157 this.nextWindowUseIndex = 0; 158 switch(this.locale){ 159 /* Note being used right now because "SCSU,locale=ja" does not work in ICU4J. */ 160 /* case l_ja: 161 for(int i=0;i<8;i++){ 162 this.windowUse[i] = initialWindowUse_ja[i]; 163 } 164 break; */ 165 default: 166 for(int i=0;i<8;i++){ 167 this.windowUse[i] = initialWindowUse[i]; 168 } 169 170 } 171 } 172 } 173 174 static final byte initialWindowUse[]={ 7, 0, 3, 2, 4, 5, 6, 1 }; 175 /* Note being used right now because "SCSU,locale=ja" does not work in ICU4J. */ 176 // static final byte initialWindowUse_ja[]={ 3, 2, 4, 1, 0, 7, 5, 6 }; 177 178 //enum { 179 //private static final int lGeneric = 0; 180 /* Note being used right now because "SCSU,locale=ja" does not work in ICU4J. */ 181 // private static final int l_ja = 1; 182 //}; 183 184 private SCSUData extraInfo = null; 185 CharsetSCSU(String icuCanonicalName, String javaCanonicalName, String[] aliases)186 public CharsetSCSU(String icuCanonicalName, String javaCanonicalName, String[] aliases){ 187 super(icuCanonicalName, javaCanonicalName, aliases); 188 maxBytesPerChar = 3; 189 minBytesPerChar = 1; 190 maxCharsPerByte = 1; 191 extraInfo = new SCSUData(); 192 } 193 194 class CharsetDecoderSCSU extends CharsetDecoderICU { 195 /* label values for supporting behavior similar to goto in C */ 196 private static final int FastSingle=0; 197 private static final int SingleByteMode=1; 198 private static final int EndLoop=2; 199 200 /* Mode Type */ 201 private static final int ByteMode = 0; 202 private static final int UnicodeMode =1; 203 CharsetDecoderSCSU(CharsetICU cs)204 public CharsetDecoderSCSU(CharsetICU cs) { 205 super(cs); 206 implReset(); 207 } 208 209 //private SCSUData data ; implReset()210 protected void implReset(){ 211 super.implReset(); 212 toULength = 0; 213 extraInfo.initialize(); 214 } 215 216 short b; 217 218 //Get the state machine state 219 private boolean isSingleByteMode ; 220 private short state ; 221 private byte quoteWindow ; 222 private byte dynamicWindow ; 223 private short byteOne; 224 225 226 //sourceIndex=-1 if the current character began in the previous buffer 227 private int sourceIndex ; 228 private int nextSourceIndex ; 229 230 CoderResult cr; 231 SCSUData data ; 232 private boolean LabelLoop;// used to break the while loop 233 decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush)234 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, 235 boolean flush){ 236 data = extraInfo; 237 238 //Get the state machine state 239 isSingleByteMode = data.toUIsSingleByteMode; 240 state = data.toUState; 241 quoteWindow = data.toUQuoteWindow; 242 dynamicWindow = data.toUDynamicWindow; 243 byteOne = data.toUByteOne; 244 245 LabelLoop = true; 246 247 //sourceIndex=-1 if the current character began in the previous buffer 248 sourceIndex = data.toUState == readCommand ? 0: -1 ; 249 nextSourceIndex = 0; 250 251 cr = CoderResult.UNDERFLOW; 252 int labelType = 0; 253 while(LabelLoop){ 254 if(isSingleByteMode){ 255 switch(labelType){ 256 case FastSingle: 257 /*fast path for single-byte mode*/ 258 labelType = fastSingle(source, target, offsets, ByteMode); 259 break; 260 case SingleByteMode: 261 /* normal state machine for single-byte mode, minus handling for what fastSingleCovers */ 262 labelType = singleByteMode(source, target, offsets, ByteMode); 263 break; 264 case EndLoop: 265 endLoop(source, target, offsets); 266 break; 267 } 268 }else{ 269 switch(labelType){ 270 case FastSingle: 271 /*fast path for single-byte mode*/ 272 labelType = fastSingle(source, target, offsets, UnicodeMode); 273 break; 274 case SingleByteMode: 275 /* normal state machine for single-byte mode, minus handling for what fastSingleCovers */ 276 labelType = singleByteMode(source, target, offsets, UnicodeMode); 277 break; 278 case EndLoop: 279 endLoop(source, target, offsets); 280 break; 281 } 282 //LabelLoop = false; 283 } 284 } 285 return cr; 286 } 287 fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType)288 private int fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType){ 289 int label = 0; 290 if(modeType==ByteMode){ 291 292 if(state==readCommand){ 293 while(source.hasRemaining() && target.hasRemaining() && (b=(short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK)) >= 0x20){ 294 source.position(source.position()+1); 295 ++nextSourceIndex; 296 if(b <= 0x7f){ 297 /*Write US graphic character or DEL*/ 298 target.put((char)b); 299 if(offsets != null){ 300 offsets.put(sourceIndex); 301 } 302 }else{ 303 /*Write from dynamic window*/ 304 int c = data.toUDynamicOffsets[dynamicWindow] + (b&0x7f); 305 if(c <= 0xffff){ 306 target.put((char)c); 307 if(offsets != null){ 308 offsets.put(sourceIndex); 309 } 310 }else{ 311 /*Output surrogate pair */ 312 target.put((char)(0xd7c0 + (c>>10))); 313 if(target.hasRemaining()){ 314 target.put((char)(0xdc00 | (c&0x3ff))); 315 if(offsets != null){ 316 offsets.put(sourceIndex); 317 offsets.put(sourceIndex); 318 } 319 }else{ 320 /* target overflow */ 321 if(offsets != null){ 322 offsets.put(sourceIndex); 323 } 324 charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff)); 325 charErrorBufferLength = 1; 326 label = EndLoop; 327 cr = CoderResult.OVERFLOW; 328 return label; 329 } 330 } 331 } 332 sourceIndex = nextSourceIndex; 333 } 334 // label = SingleByteMode; 335 } 336 }else if(modeType==UnicodeMode){ 337 /* fast path for unicode mode */ 338 if(state == readCommand){ 339 while((source.position()+1)<source.limit() && target.hasRemaining() && (((b=source.get(source.position()))-UC0)&UConverterConstants.UNSIGNED_BYTE_MASK)>(Urs-UC0)){ 340 target.put((char)((b<<8)|(source.get(source.position()+1)&UConverterConstants.UNSIGNED_BYTE_MASK))); 341 if(offsets != null){ 342 offsets.put(sourceIndex); 343 } 344 sourceIndex = nextSourceIndex; 345 nextSourceIndex+=2; 346 source.position(source.position()+2); 347 } 348 } 349 } 350 label = SingleByteMode; 351 return label; 352 } 353 singleByteMode(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType)354 private int singleByteMode(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType){ 355 int label = SingleByteMode; 356 if(modeType == ByteMode){ 357 while(source.hasRemaining()){ 358 if(!target.hasRemaining()){ 359 cr = CoderResult.OVERFLOW; 360 label = EndLoop; 361 return label; 362 } 363 b = (short)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK); 364 ++nextSourceIndex; 365 switch(state){ 366 case readCommand: 367 /*redundant conditions are commented out */ 368 if(((1L<<b)&0x2601)!=0){ 369 target.put((char)b); 370 if(offsets != null){ 371 offsets.put(sourceIndex); 372 } 373 sourceIndex = nextSourceIndex; 374 label = FastSingle; 375 return label; 376 }else if(SC0 <= b){ 377 if(b<=SC7){ 378 dynamicWindow = (byte)(b-SC0); 379 sourceIndex = nextSourceIndex; 380 label = FastSingle; 381 return label; 382 }else /* if(SD0<=b && b<=SQ7)*/{ 383 dynamicWindow = (byte)(b - SD0); 384 state = defineOne; 385 } 386 }else if(/* SQ0<=b &&*/b <= SQ7){ 387 quoteWindow = (byte)(b - SQ0); 388 state = quoteOne; 389 }else if(b==SDX){ 390 state = definePairOne; 391 }else if(b==SQU){ 392 state = quotePairOne; 393 }else if(b==SCU){ 394 sourceIndex = nextSourceIndex; 395 isSingleByteMode = false; 396 label = FastSingle; 397 return label; 398 }else{ 399 /*callback (illegal)*/ 400 cr = CoderResult.malformedForLength(1); 401 toUBytesArray[0] = (byte)b; 402 toULength =1; 403 label = EndLoop; 404 return label; 405 } 406 407 /* Store the first byte of a multibyte sequence in toUByte[] */ 408 toUBytesArray[0] = (byte)b; 409 toULength = 1; 410 break; 411 case quotePairOne: 412 byteOne = b; 413 toUBytesArray[1] = (byte)b; 414 toULength = 2; 415 state = quotePairTwo; 416 break; 417 case quotePairTwo: 418 target.put((char)((byteOne<< 8) | b)); 419 if(offsets != null){ 420 offsets.put(sourceIndex); 421 } 422 sourceIndex = nextSourceIndex; 423 state = readCommand; 424 label = FastSingle; 425 return label; 426 case quoteOne: 427 if(b<0x80){ 428 /* all static offsets are in the BMP */ 429 target.put((char)(staticOffsets[quoteWindow] + b)); 430 if(offsets != null){ 431 offsets.put(sourceIndex); 432 } 433 }else { 434 /*write from dynamic window */ 435 int c = data.toUDynamicOffsets[quoteWindow] + (b&0x7f); 436 if(c<=0xffff){ 437 target.put((char)c); 438 if(offsets != null){ 439 offsets.put(sourceIndex); 440 } 441 }else { 442 /* output surrogate pair */ 443 target.put((char)(0xd7c0+(c>>10))); 444 if(target.hasRemaining()){ 445 target.put((char)(0xdc00 | (c&0x3ff))); 446 if(offsets != null){ 447 offsets.put(sourceIndex); 448 offsets.put(sourceIndex); 449 } 450 }else { 451 /* target overflow */ 452 if(offsets != null){ 453 offsets.put(sourceIndex); 454 } 455 charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff)); 456 charErrorBufferLength = 1; 457 label = EndLoop; 458 cr = CoderResult.OVERFLOW; 459 LabelLoop = false; 460 return label; 461 } 462 } 463 } 464 sourceIndex = nextSourceIndex; 465 state = readCommand; 466 label = FastSingle; 467 return label; 468 case definePairOne: 469 dynamicWindow = (byte)((b>>5)&7); 470 byteOne = (byte)(b&0x1f); 471 toUBytesArray[1] = (byte)b; 472 toULength = 2; 473 state = definePairTwo; 474 break; 475 case definePairTwo: 476 data.toUDynamicOffsets[dynamicWindow] = 0x10000 + (byteOne<<15L | b<<7L); 477 sourceIndex = nextSourceIndex; 478 state = readCommand; 479 label = FastSingle; 480 return label; 481 case defineOne: 482 if(b==0){ 483 /*callback (illegal)*/ 484 toUBytesArray[1] = (byte)b; 485 toULength =2; 486 label = EndLoop; 487 return label; 488 }else if(b<gapThreshold){ 489 data.toUDynamicOffsets[dynamicWindow] = b<<7L; 490 }else if(((b - gapThreshold)&UConverterConstants.UNSIGNED_BYTE_MASK)<(reservedStart - gapThreshold)){ 491 data.toUDynamicOffsets[dynamicWindow] = (b<<7L) + gapOffset; 492 }else if(b>=fixedThreshold){ 493 data.toUDynamicOffsets[dynamicWindow] = fixedOffsets[b-fixedThreshold]; 494 }else{ 495 /*callback (illegal)*/ 496 toUBytesArray[1] = (byte)b; 497 toULength =2; 498 label = EndLoop; 499 return label; 500 } 501 sourceIndex = nextSourceIndex; 502 state = readCommand; 503 label = FastSingle; 504 return label; 505 } 506 } 507 508 }else if(modeType==UnicodeMode){ 509 while(source.hasRemaining()){ 510 if(!target.hasRemaining()){ 511 cr = CoderResult.OVERFLOW; 512 label = EndLoop; 513 return label; 514 } 515 b = (short)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK); 516 ++nextSourceIndex; 517 switch(state){ 518 case readCommand: 519 if((short)((b -UC0)&UConverterConstants.UNSIGNED_BYTE_MASK)>(Urs - UC0)){ 520 byteOne = b; 521 toUBytesArray[0] = (byte)b; 522 toULength = 1; 523 state = quotePairTwo; 524 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UC7){ 525 dynamicWindow = (byte)(b - UC0); 526 sourceIndex = nextSourceIndex; 527 isSingleByteMode = true; 528 label = FastSingle; 529 return label; 530 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UD7){ 531 dynamicWindow = (byte)(b - UD0); 532 isSingleByteMode = true; 533 toUBytesArray[0] = (byte)b; 534 toULength = 1; 535 state = defineOne; 536 label = SingleByteMode; 537 return label; 538 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UDX){ 539 isSingleByteMode = true; 540 toUBytesArray[0] = (byte)b; 541 toULength = 1; 542 state = definePairOne; 543 label = SingleByteMode; 544 return label; 545 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UQU){ 546 toUBytesArray[0] = (byte)b; 547 toULength = 1; 548 state = quotePairOne; 549 }else { 550 /* callback (illegal)*/ 551 cr = CoderResult.malformedForLength(1); 552 toUBytesArray[0] = (byte)b; 553 toULength = 1; 554 label = EndLoop; 555 return label; 556 } 557 break; 558 case quotePairOne: 559 byteOne = b; 560 toUBytesArray[1] = (byte)b; 561 toULength = 2; 562 state = quotePairTwo; 563 break; 564 case quotePairTwo: 565 target.put((char)((byteOne<<8) | b)); 566 if(offsets != null){ 567 offsets.put(sourceIndex); 568 } 569 sourceIndex = nextSourceIndex; 570 state = readCommand; 571 label = FastSingle; 572 return label; 573 } 574 } 575 } 576 label = EndLoop; 577 return label; 578 } 579 endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets)580 private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){ 581 if(cr==CoderResult.OVERFLOW){ 582 state = readCommand; 583 }else if(state == readCommand){ 584 toULength = 0; 585 } 586 data.toUIsSingleByteMode = isSingleByteMode; 587 data.toUState = state; 588 data.toUQuoteWindow = quoteWindow; 589 data.toUDynamicWindow = dynamicWindow; 590 data.toUByteOne = byteOne; 591 LabelLoop = false; 592 } 593 } 594 595 class CharsetEncoderSCSU extends CharsetEncoderICU{ CharsetEncoderSCSU(CharsetICU cs)596 public CharsetEncoderSCSU(CharsetICU cs) { 597 super(cs, fromUSubstitution); 598 implReset(); 599 } 600 601 //private SCSUData data; implReset()602 protected void implReset() { 603 super.implReset(); 604 extraInfo.initialize(); 605 } 606 607 /* label values for supporting behavior similar to goto in C */ 608 private static final int Loop=0; 609 private static final int GetTrailUnicode=1; 610 private static final int OutputBytes=2; 611 private static final int EndLoop =3; 612 613 private int delta; 614 private int length; 615 616 ///variables of compression heuristics 617 private int offset; 618 private char lead, trail; 619 private int code; 620 private byte window; 621 622 //Get the state machine state 623 private boolean isSingleByteMode; 624 private byte dynamicWindow ; 625 private int currentOffset; 626 int c; 627 628 SCSUData data ; 629 630 //sourceIndex=-1 if the current character began in the previous buffer 631 private int sourceIndex ; 632 private int nextSourceIndex; 633 private int targetCapacity; 634 635 private boolean LabelLoop;//used to break the while loop 636 private boolean AfterGetTrail;// its value is set to true in order to ignore the code before getTrailSingle: 637 private boolean AfterGetTrailUnicode;// is value is set to true in order to ignore the code before getTrailUnicode: 638 639 CoderResult cr; 640 encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush)641 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 642 data = extraInfo; 643 cr = CoderResult.UNDERFLOW; 644 645 //Get the state machine state 646 isSingleByteMode = data.fromUIsSingleByteMode; 647 dynamicWindow = data.fromUDynamicWindow; 648 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 649 c = fromUChar32; 650 651 sourceIndex = c== 0 ? 0: -1 ; 652 nextSourceIndex = 0; 653 654 655 targetCapacity = target.limit()-target.position(); 656 657 //sourceIndex=-1 if the current character began in the previous buffer 658 sourceIndex = c== 0 ? 0: -1 ; 659 nextSourceIndex = 0; 660 661 int labelType = Loop; // set to Loop so that the code starts from loop: 662 LabelLoop = true; 663 AfterGetTrail = false; 664 AfterGetTrailUnicode = false; 665 666 while(LabelLoop){ 667 switch(labelType){ 668 case Loop: 669 labelType = loop(source, target, offsets); 670 break; 671 case GetTrailUnicode: 672 labelType = getTrailUnicode(source, target, offsets); 673 break; 674 case OutputBytes: 675 labelType = outputBytes(source, target, offsets); 676 break; 677 case EndLoop: 678 endLoop(source, target, offsets); 679 break; 680 } 681 } 682 return cr; 683 } 684 getWindow(int[] offsets)685 private byte getWindow(int[] offsets){ 686 int i; 687 for (i=0;i<8;i++){ 688 if(((c-offsets[i]) & UConverterConstants.UNSIGNED_INT_MASK) <= 0x7f){ 689 return (byte)i; 690 } 691 } 692 return -1; 693 } 694 isInOffsetWindowOrDirect(int offsetValue, int a)695 private boolean isInOffsetWindowOrDirect(int offsetValue, int a){ 696 return (a & UConverterConstants.UNSIGNED_INT_MASK)<=(offsetValue & UConverterConstants.UNSIGNED_INT_MASK)+0x7f & 697 ((a & UConverterConstants.UNSIGNED_INT_MASK)>=(offsetValue & UConverterConstants.UNSIGNED_INT_MASK) || 698 ((a & UConverterConstants.UNSIGNED_INT_MASK)<=0x7f && ((a & UConverterConstants.UNSIGNED_INT_MASK)>=0x20 699 || ((1L<<(a & UConverterConstants.UNSIGNED_INT_MASK))&0x2601)!=0))); 700 } 701 getNextDynamicWindow()702 private byte getNextDynamicWindow(){ 703 byte windowValue = data.windowUse[data.nextWindowUseIndex]; 704 if(++data.nextWindowUseIndex==8){ 705 data.nextWindowUseIndex=0; 706 } 707 return windowValue; 708 } 709 useDynamicWindow(byte windowValue)710 private void useDynamicWindow(byte windowValue){ 711 /*first find the index of the window*/ 712 int i,j; 713 i = data.nextWindowUseIndex; 714 do{ 715 if(--i<0){ 716 i=7; 717 } 718 }while(data.windowUse[i]!=windowValue); 719 720 /*now copy each window[i+1] to [i]*/ 721 j= i+1; 722 if(j==8){ 723 j=0; 724 } 725 while(j!=data.nextWindowUseIndex){ 726 data.windowUse[i] = data.windowUse[j]; 727 i=j; 728 if(++j==8){ 729 j=0; 730 } 731 } 732 733 /*finally, set the window into the most recently used index*/ 734 data.windowUse[i]= windowValue; 735 } 736 737 getDynamicOffset()738 private int getDynamicOffset(){ 739 int i; 740 for(i=0;i<7;++i){ 741 if(((c-fixedOffsets[i])&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){ 742 offset = fixedOffsets[i]; 743 return 0xf9+i; 744 } 745 } 746 if((c&UConverterConstants.UNSIGNED_INT_MASK)<0x80){ 747 /*No dynamic window for US-ASCII*/ 748 return -1; 749 }else if((c&UConverterConstants.UNSIGNED_INT_MASK)<0x3400 || ((c-0x10000)&UConverterConstants.UNSIGNED_INT_MASK)<(0x14000-0x10000) || 750 ((c-0x1d000)&UConverterConstants.UNSIGNED_INT_MASK)<=(0x1ffff-0x1d000)){ 751 /*This character is in the code range for a "small", i.e, reasonably windowable, script*/ 752 offset = c&0x7fffff80; 753 return (c>>7); 754 }else if(0xe000<=(c&UConverterConstants.UNSIGNED_INT_MASK) && (c&UConverterConstants.UNSIGNED_INT_MASK)!=0xfeff && (c&UConverterConstants.UNSIGNED_INT_MASK) < 0xfff0){ 755 /*for these characters we need to take the gapOffset into account*/ 756 offset=(c)&0x7fffff80; 757 return ((c-gapOffset)>>7); 758 }else{ 759 return -1; 760 } 761 } 762 loop(CharBuffer source, ByteBuffer target, IntBuffer offsets)763 private int loop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 764 int label = 0; 765 if(isSingleByteMode){ 766 if(c!=0 && targetCapacity>0 && !AfterGetTrail){ 767 label = getTrail(source, target, offsets); 768 return label; 769 } 770 /*state machine for single byte mode*/ 771 while(AfterGetTrail || source.hasRemaining()){ 772 if(targetCapacity<=0 && !AfterGetTrail){ 773 /*target is full*/ 774 cr = CoderResult.OVERFLOW; 775 label = EndLoop; 776 return label; 777 } 778 if(!AfterGetTrail){ 779 c = source.get(); 780 ++nextSourceIndex; 781 782 } 783 if(((c -0x20)&UConverterConstants.UNSIGNED_INT_MASK)<=0x5f && !AfterGetTrail){ 784 /*pass US-ASCII graphic character through*/ 785 target.put((byte)c); 786 if(offsets!=null){ 787 offsets.put(sourceIndex); 788 } 789 --targetCapacity; 790 }else if((c & UConverterConstants.UNSIGNED_INT_MASK)<0x20 && !AfterGetTrail){ 791 if(((1L<<(c & UConverterConstants.UNSIGNED_INT_MASK))&0x2601)!=0){ 792 /*CR/LF/TAB/NUL*/ 793 target.put((byte)c); 794 if(offsets!=null){ 795 offsets.put(sourceIndex); 796 } 797 --targetCapacity; 798 } else { 799 /*quote c0 control character*/ 800 c|=SQ0<<8; 801 length = 2; 802 label = OutputBytes; 803 return label; 804 } 805 } else if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f && !AfterGetTrail){ 806 /*use the current dynamic window*/ 807 target.put((byte)(delta|0x80)); 808 if(offsets!=null){ 809 offsets.put(sourceIndex); 810 } 811 --targetCapacity; 812 } else if(AfterGetTrail || UTF16.isSurrogate((char)c)){ 813 if(!AfterGetTrail){ 814 if(UTF16.isLeadSurrogate((char)c)){ 815 label = getTrail(source, target, offsets); 816 if(label==EndLoop){ 817 return label; 818 } 819 } else { 820 /*this is unmatched lead code unit (2nd Surrogate)*/ 821 /*callback(illegal)*/ 822 cr = CoderResult.malformedForLength(1); 823 label = EndLoop; 824 return label; 825 } 826 } 827 828 829 if(AfterGetTrail){ 830 AfterGetTrail = false; 831 } 832 833 /*Compress supplementary character U+10000...U+10ffff */ 834 if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){ 835 /*use the current dynamic window*/ 836 target.put((byte)(delta|0x80)); 837 if(offsets!=null){ 838 offsets.put(sourceIndex); 839 } 840 --targetCapacity; 841 } else if((window=getWindow(data.fromUDynamicOffsets))>=0){ 842 /*there is a dynamic window that contains this character, change to it*/ 843 dynamicWindow = window; 844 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 845 useDynamicWindow(dynamicWindow); 846 c = ((SC0+dynamicWindow)<<8 | (c-currentOffset)|0x80); 847 length = 2; 848 label = OutputBytes; 849 return label; 850 } else if((code=getDynamicOffset())>=0){ 851 /*might check if there are come character in this window to come */ 852 /*define an extended window with this character*/ 853 code-=0x200; 854 dynamicWindow=getNextDynamicWindow(); 855 currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset; 856 useDynamicWindow(dynamicWindow); 857 c = ((SDX<<24) | (dynamicWindow<<21)| 858 (code<<8)| (c- currentOffset) |0x80); 859 // c = (((SDX)<<25) | (dynamicWindow<<21)| 860 // (code<<8)| (c- currentOffset) |0x80 ); 861 length = 4; 862 label = OutputBytes; 863 return label; 864 } else { 865 /*change to unicode mode and output this (lead, trail) pair*/ 866 isSingleByteMode = false; 867 target.put((byte)SCU); 868 if(offsets!=null){ 869 offsets.put(sourceIndex); 870 } 871 --targetCapacity; 872 c = (lead<<16)|trail; 873 length = 4; 874 label = OutputBytes; 875 return label; 876 } 877 } else if((c&UConverterConstants.UNSIGNED_INT_MASK)<0xa0){ 878 /*quote C1 control character*/ 879 c = (c&0x7f) | (SQ0+1)<<8; /*SQ0+1 == SQ1*/ 880 length = 2; 881 label = OutputBytes; 882 return label; 883 } else if((c&UConverterConstants.UNSIGNED_INT_MASK)==0xfeff || (c&UConverterConstants.UNSIGNED_INT_MASK)>= 0xfff0){ 884 /*quote signature character = byte order mark and specials*/ 885 c |= SQU<<16; 886 length = 3; 887 label = OutputBytes; 888 return label; 889 } else { 890 /*compress all other BMP characters*/ 891 if((window=getWindow(data.fromUDynamicOffsets))>=0){ 892 /*there is a window defined that contains this character - switch to it or quote from it*/ 893 if(source.position()>=source.limit() || isInOffsetWindowOrDirect(data.fromUDynamicOffsets[window], source.get(source.position()))){ 894 /*change to dynamic window*/ 895 dynamicWindow = window; 896 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 897 useDynamicWindow(dynamicWindow); 898 c = ((SC0+window)<<8) | (c- currentOffset) | 0x80; 899 length = 2; 900 label = OutputBytes; 901 return label; 902 } else { 903 /*quote from dynamic window*/ 904 c = ((SQ0+window)<<8) | (c - data.fromUDynamicOffsets[window]) | 905 0x80; 906 length = 2; 907 label = OutputBytes; 908 return label; 909 } 910 } else if((window = getWindow(staticOffsets))>=0){ 911 /*quote from static window*/ 912 c = ((SQ0+window)<<8) | (c - staticOffsets[window]); 913 length = 2; 914 label = OutputBytes; 915 return label; 916 }else if((code=getDynamicOffset())>=0){ 917 /*define a dynamic window with this character*/ 918 dynamicWindow = getNextDynamicWindow(); 919 currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset; 920 useDynamicWindow(dynamicWindow); 921 c = ((SD0+dynamicWindow)<<16) | (code<<8)| 922 (c - currentOffset) | 0x80; 923 length = 3; 924 label = OutputBytes; 925 return label; 926 } else if(((int)((c-0x3400)&UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400) && (source.position()>=source.limit() || 927 ((int)((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_INT_MASK))< (0xd800 - 0x3400))){ 928 929 /* 930 * this character is not compressible (a BMP ideograph of similar) 931 * switch to Unicode mode if this is the last character in the block 932 * or there is at least one more ideograph following immediately 933 */ 934 isSingleByteMode = false; 935 c|=SCU<<16; 936 length =3; 937 label = OutputBytes; 938 return label; 939 } else { 940 /*quote Unicode*/ 941 c|=SQU<<16; 942 length = 3; 943 label = OutputBytes; 944 return label; 945 } 946 } 947 /*normal end of conversion : prepare for new character */ 948 c = 0; 949 sourceIndex = nextSourceIndex; 950 } 951 } else { 952 if(c!=0 && targetCapacity>0 && !AfterGetTrailUnicode){ 953 label = GetTrailUnicode; 954 return label; 955 } 956 957 /*state machine for Unicode*/ 958 /*unicodeByteMode*/ 959 while(AfterGetTrailUnicode || source.hasRemaining()){ 960 if(targetCapacity<=0 && !AfterGetTrailUnicode){ 961 /*target is full*/ 962 cr = CoderResult.OVERFLOW; 963 LabelLoop = false; 964 break; 965 } 966 if(!AfterGetTrailUnicode){ 967 c = source.get(); 968 ++nextSourceIndex; 969 } 970 971 if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400) && !AfterGetTrailUnicode){ 972 /*not compressible, write character directly */ 973 if(targetCapacity>=2){ 974 target.put((byte)(c>>8)); 975 target.put((byte)c); 976 if(offsets!=null){ 977 offsets.put(sourceIndex); 978 offsets.put(sourceIndex); 979 } 980 targetCapacity-=2; 981 } else { 982 length =2; 983 label = OutputBytes; 984 return label; 985 } 986 } else if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300*/&& !AfterGetTrailUnicode){ 987 /*compress BMP character if the following one is not an uncompressible ideograph*/ 988 if(!(source.hasRemaining() && (((source.get(source.position())-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400))){ 989 if(((((c-0x30)&UConverterConstants.UNSIGNED_INT_MASK))<10 || (((c-0x61)&UConverterConstants.UNSIGNED_INT_MASK))<26 990 || (((c-0x41)&UConverterConstants.UNSIGNED_INT_MASK))<26)){ 991 /*ASCII digit or letter*/ 992 isSingleByteMode = true; 993 c |=((UC0+dynamicWindow)<<8)|c; 994 length = 2; 995 label = OutputBytes; 996 return label; 997 } else if((window=getWindow(data.fromUDynamicOffsets))>=0){ 998 /*there is a dynamic window that contains this character, change to it*/ 999 isSingleByteMode = true; 1000 dynamicWindow = window; 1001 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 1002 useDynamicWindow(dynamicWindow); 1003 c = ((UC0+dynamicWindow)<<8) | (c- currentOffset) | 0x80; 1004 length = 2; 1005 label = OutputBytes; 1006 return label; 1007 } else if((code=getDynamicOffset())>=0){ 1008 /*define a dynamic window with this character*/ 1009 isSingleByteMode = true; 1010 dynamicWindow = getNextDynamicWindow(); 1011 currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset; 1012 useDynamicWindow(dynamicWindow); 1013 c = ((UD0+dynamicWindow)<<16) | (code<<8) 1014 |(c - currentOffset) | 0x80; 1015 length = 3; 1016 label = OutputBytes; 1017 return label; 1018 } 1019 } 1020 1021 /*don't know how to compress these character, just write it directly*/ 1022 length = 2; 1023 label = OutputBytes; 1024 return label; 1025 } else if(c<0xe000 && !AfterGetTrailUnicode){ 1026 label = GetTrailUnicode; 1027 return label; 1028 } else if (!AfterGetTrailUnicode){ 1029 /*quote to avoid SCSU tags*/ 1030 c|=UQU<<16; 1031 length = 3; 1032 label = OutputBytes; 1033 return label; 1034 } 1035 1036 if(AfterGetTrailUnicode){ 1037 AfterGetTrailUnicode = false; 1038 } 1039 /*normal end of conversion, prepare for a new character*/ 1040 c = 0; 1041 sourceIndex = nextSourceIndex; 1042 } 1043 } 1044 label = EndLoop; 1045 return label; 1046 } 1047 getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets)1048 private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 1049 lead = (char)c; 1050 int label = Loop; 1051 if(source.hasRemaining()){ 1052 /*test the following code unit*/ 1053 trail = source.get(source.position()); 1054 if(UTF16.isTrailSurrogate(trail)){ 1055 source.position(source.position()+1); 1056 ++nextSourceIndex; 1057 c = UCharacter.getCodePoint((char)c, trail); 1058 label = Loop; 1059 } else { 1060 /*this is unmatched lead code unit (1st Surrogate)*/ 1061 /*callback(illegal)*/ 1062 cr = CoderResult.malformedForLength(1); 1063 label = EndLoop; 1064 } 1065 }else { 1066 /*no more input*/ 1067 label = EndLoop; 1068 } 1069 AfterGetTrail = true; 1070 return label; 1071 } 1072 getTrailUnicode(CharBuffer source, ByteBuffer target, IntBuffer offsets)1073 private int getTrailUnicode(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 1074 int label = EndLoop; 1075 AfterGetTrailUnicode = true; 1076 /*c is surrogate*/ 1077 if(UTF16.isLeadSurrogate((char)c)){ 1078 // getTrailUnicode: 1079 lead = (char)c; 1080 if(source.hasRemaining()){ 1081 /*test the following code unit*/ 1082 trail = source.get(source.position()); 1083 if(UTF16.isTrailSurrogate(trail)){ 1084 source.get(); 1085 ++nextSourceIndex; 1086 c = UCharacter.getCodePoint((char)c, trail); 1087 /*convert this surrogate code point*/ 1088 /*exit this condition tree*/ 1089 } else { 1090 /*this is unmatched lead code unit(1st surrogate)*/ 1091 /*callback(illegal)*/ 1092 cr = CoderResult.malformedForLength(1); 1093 label = EndLoop; 1094 return label; 1095 } 1096 } else { 1097 /*no more input*/ 1098 label = EndLoop; 1099 return label; 1100 } 1101 } else { 1102 /*this is an unmatched trail code point (2nd surrogate)*/ 1103 /*callback (illegal)*/ 1104 cr = CoderResult.malformedForLength(1); 1105 label = EndLoop; 1106 return label; 1107 } 1108 1109 /*compress supplementary character*/ 1110 if((window=getWindow(data.fromUDynamicOffsets))>=0 && 1111 !(source.hasRemaining() && ((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_INT_MASK) < 1112 (0xd800 - 0x3400))){ 1113 /* 1114 * this is the dynamic window that contains this character and the following 1115 * character is not uncompressible, 1116 * change to the window 1117 */ 1118 isSingleByteMode = true; 1119 dynamicWindow = window; 1120 currentOffset = data.fromUDynamicOffsets[dynamicWindow]; 1121 useDynamicWindow(dynamicWindow); 1122 c = ((UC0+dynamicWindow)<<8 | (c-currentOffset) | 0x80); 1123 length = 2; 1124 label = OutputBytes; 1125 return label; 1126 } else if(source.hasRemaining() && lead == source.get(source.position()) && (code=getDynamicOffset())>=0){ 1127 /*two supplementary characters in (probably) the same window - define an extended one*/ 1128 isSingleByteMode = true; 1129 dynamicWindow = getNextDynamicWindow(); 1130 currentOffset = data.fromUDynamicOffsets[dynamicWindow] = offset; 1131 useDynamicWindow(dynamicWindow); 1132 c = (UDX<<24) | (dynamicWindow<<21) |(code<<8) |(c - currentOffset) | 0x80; 1133 length = 4; 1134 label = OutputBytes; 1135 return label; 1136 } else { 1137 /*don't know how to compress this character, just write it directly*/ 1138 c = (lead<<16)|trail; 1139 length = 4; 1140 label = OutputBytes; 1141 return label; 1142 } 1143 1144 } 1145 endLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets)1146 private void endLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 1147 /*set the converter state back to UConverter*/ 1148 data.fromUIsSingleByteMode = isSingleByteMode; 1149 data.fromUDynamicWindow = dynamicWindow; 1150 fromUChar32 = c; 1151 LabelLoop = false; 1152 } 1153 1154 @SuppressWarnings("fallthrough") outputBytes(CharBuffer source, ByteBuffer target, IntBuffer offsets)1155 private int outputBytes(CharBuffer source, ByteBuffer target, IntBuffer offsets){ 1156 int label; 1157 //int targetCapacity = target.limit()-target.position(); 1158 /*write the output character byte from c and length*/ 1159 /*from the first if in the loop we know that targetCapacity>0*/ 1160 if(length<=targetCapacity){ 1161 switch(length){ 1162 /*each branch falls through the next one*/ 1163 case 4: 1164 target.put((byte)(c>>24)); 1165 if(offsets!=null){ 1166 offsets.put(sourceIndex); 1167 } 1168 case 3: 1169 target.put((byte)(c>>16)); 1170 if(offsets!=null){ 1171 offsets.put(sourceIndex); 1172 } 1173 case 2: 1174 target.put((byte)(c>>8)); 1175 if(offsets!=null){ 1176 offsets.put(sourceIndex); 1177 } 1178 case 1: 1179 target.put((byte)c); 1180 if(offsets!=null){ 1181 offsets.put(sourceIndex); 1182 } 1183 default: 1184 /*will never occur*/ 1185 break; 1186 } 1187 targetCapacity-=length; 1188 1189 /*normal end of conversion: prepare for a new character*/ 1190 c = 0; 1191 sourceIndex = nextSourceIndex; 1192 label = Loop; 1193 return label; 1194 } else { 1195 ByteBuffer p = ByteBuffer.wrap(errorBuffer); 1196 /* 1197 * We actually do this backwards here: 1198 * In order to save an intermediate variable, we output 1199 * first to the overflow buffer what does not fit into the 1200 * regular target 1201 */ 1202 /* we know that 0<=targetCapacity<length<=4 */ 1203 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1204 length -= targetCapacity; 1205 switch(length){ 1206 /*each branch falls through the next one*/ 1207 case 4: 1208 p.put((byte)(c>>24)); 1209 case 3: 1210 p.put((byte)(c>>16)); 1211 case 2: 1212 p.put((byte)(c>>8)); 1213 case 1: 1214 p.put((byte)c); 1215 default: 1216 /*will never occur*/ 1217 break; 1218 } 1219 errorBufferLength = length; 1220 1221 /*now output what fits into the regular target*/ 1222 c>>=8*length; //length was reduced by targetCapacity 1223 switch(targetCapacity){ 1224 /*each branch falls through the next one*/ 1225 case 3: 1226 target.put((byte)(c>>16)); 1227 if(offsets!=null){ 1228 offsets.put(sourceIndex); 1229 } 1230 case 2: 1231 target.put((byte)(c>>8)); 1232 if(offsets!=null){ 1233 offsets.put(sourceIndex); 1234 } 1235 case 1: 1236 target.put((byte)c); 1237 if(offsets!=null){ 1238 offsets.put(sourceIndex); 1239 } 1240 default: 1241 break; 1242 } 1243 1244 /*target overflow*/ 1245 targetCapacity = 0; 1246 cr = CoderResult.OVERFLOW; 1247 c = 0; 1248 label = EndLoop; 1249 return label; 1250 } 1251 } 1252 1253 } 1254 newDecoder()1255 public CharsetDecoder newDecoder() { 1256 return new CharsetDecoderSCSU(this); 1257 } 1258 newEncoder()1259 public CharsetEncoder newEncoder() { 1260 return new CharsetEncoderSCSU(this); 1261 } 1262 getUnicodeSetImpl( UnicodeSet setFillIn, int which)1263 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ 1264 CharsetICU.getCompleteUnicodeSet(setFillIn); 1265 } 1266 1267 } 1268