1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2016, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.text; 9 10 /** 11 * A decompression engine implementing the Standard Compression Scheme 12 * for Unicode (SCSU) as outlined in <A 13 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical 14 * Report #6</A>. 15 * 16 * <P><STRONG>USAGE</STRONG></P> 17 * 18 * <P>The static methods on <TT>UnicodeDecompressor</TT> may be used in a 19 * straightforward manner to decompress simple strings:</P> 20 * 21 * <PRE> 22 * byte [] compressed = ... ; // get compressed bytes from somewhere 23 * String result = UnicodeDecompressor.decompress(compressed); 24 * </PRE> 25 * 26 * <P>The static methods have a fairly large memory footprint. 27 * For finer-grained control over memory usage, 28 * <TT>UnicodeDecompressor</TT> offers more powerful APIs allowing 29 * iterative decompression:</P> 30 * 31 * <PRE> 32 * // Decompress an array "bytes" of length "len" using a buffer of 512 chars 33 * // to the Writer "out" 34 * 35 * UnicodeDecompressor myDecompressor = new UnicodeDecompressor(); 36 * final static int BUFSIZE = 512; 37 * char [] charBuffer = new char [ BUFSIZE ]; 38 * int charsWritten = 0; 39 * int [] bytesRead = new int [1]; 40 * int totalBytesDecompressed = 0; 41 * int totalCharsWritten = 0; 42 * 43 * do { 44 * // do the decompression 45 * charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed, 46 * len, bytesRead, 47 * charBuffer, 0, BUFSIZE); 48 * 49 * // do something with the current set of chars 50 * out.write(charBuffer, 0, charsWritten); 51 * 52 * // update the no. of bytes decompressed 53 * totalBytesDecompressed += bytesRead[0]; 54 * 55 * // update the no. of chars written 56 * totalCharsWritten += charsWritten; 57 * 58 * } while(totalBytesDecompressed < len); 59 * 60 * myDecompressor.reset(); // reuse decompressor 61 * </PRE> 62 * 63 * <P>Decompression is performed according to the standard set forth in 64 * <A HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical 65 * Report #6</A></P> 66 * 67 * @see UnicodeCompressor 68 * 69 * @author Stephen F. Booth 70 * @stable ICU 2.4 71 */ 72 public final class UnicodeDecompressor implements SCSU 73 { 74 //========================== 75 // Instance variables 76 //========================== 77 78 /** Alias to current dynamic window */ 79 private int fCurrentWindow = 0; 80 81 /** Dynamic compression window offsets */ 82 private int [] fOffsets = new int [ NUMWINDOWS ]; 83 84 /** Current compression mode */ 85 private int fMode = SINGLEBYTEMODE; 86 87 /** Size of our internal buffer */ 88 private final static int BUFSIZE = 3; 89 90 /** Internal buffer for saving state */ 91 private byte [] fBuffer = new byte [BUFSIZE]; 92 93 /** Number of characters in our internal buffer */ 94 private int fBufferLength = 0; 95 96 97 /** 98 * Create a UnicodeDecompressor. 99 * Sets all windows to their default values. 100 * @see #reset 101 * @stable ICU 2.4 102 */ UnicodeDecompressor()103 public UnicodeDecompressor(){ 104 reset(); // initialize to defaults 105 } 106 107 /** 108 * Decompress a byte array into a String. 109 * @param buffer The byte array to decompress. 110 * @return A String containing the decompressed characters. 111 * @see #decompress(byte [], int, int) 112 * @stable ICU 2.4 113 */ decompress(byte [] buffer)114 public static String decompress(byte [] buffer){ 115 char [] buf = decompress(buffer, 0, buffer.length); 116 return new String(buf); 117 } 118 119 /** 120 * Decompress a byte array into a Unicode character array. 121 * @param buffer The byte array to decompress. 122 * @param start The start of the byte run to decompress. 123 * @param limit The limit of the byte run to decompress. 124 * @return A character array containing the decompressed bytes. 125 * @see #decompress(byte []) 126 * @stable ICU 2.4 127 */ decompress(byte [] buffer, int start, int limit)128 public static char [] decompress(byte [] buffer, int start, int limit) { 129 UnicodeDecompressor comp = new UnicodeDecompressor(); 130 131 // use a buffer we know will never overflow 132 // in the worst case, each byte will decompress 133 // to a surrogate pair (buffer must be at least 2 chars) 134 int len = Math.max(2, 2 * (limit - start)); 135 char [] temp = new char [len]; 136 137 int charCount = comp.decompress(buffer, start, limit, null, 138 temp, 0, len); 139 140 char [] result = new char [charCount]; 141 System.arraycopy(temp, 0, result, 0, charCount); 142 return result; 143 } 144 145 /** 146 * Decompress a byte array into a Unicode character array. 147 * 148 * This function will either completely fill the output buffer, 149 * or consume the entire input. 150 * 151 * @param byteBuffer The byte buffer to decompress. 152 * @param byteBufferStart The start of the byte run to decompress. 153 * @param byteBufferLimit The limit of the byte run to decompress. 154 * @param bytesRead A one-element array. If not null, on return 155 * the number of bytes read from byteBuffer. 156 * @param charBuffer A buffer to receive the decompressed data. 157 * This buffer must be at minimum two characters in size. 158 * @param charBufferStart The starting offset to which to write 159 * decompressed data. 160 * @param charBufferLimit The limiting offset for writing 161 * decompressed data. 162 * @return The number of Unicode characters written to charBuffer. 163 * @stable ICU 2.4 164 */ decompress(byte [] byteBuffer, int byteBufferStart, int byteBufferLimit, int [] bytesRead, char [] charBuffer, int charBufferStart, int charBufferLimit)165 public int decompress(byte [] byteBuffer, 166 int byteBufferStart, 167 int byteBufferLimit, 168 int [] bytesRead, 169 char [] charBuffer, 170 int charBufferStart, 171 int charBufferLimit) 172 { 173 // the current position in the source byte buffer 174 int bytePos = byteBufferStart; 175 176 // the current position in the target char buffer 177 int ucPos = charBufferStart; 178 179 // the current byte from the source buffer 180 int aByte = 0x00; 181 182 183 // charBuffer must be at least 2 chars in size 184 if(charBuffer.length < 2 || (charBufferLimit - charBufferStart) < 2) 185 throw new IllegalArgumentException("charBuffer.length < 2"); 186 187 // if our internal buffer isn't empty, flush its contents 188 // to the output buffer before doing any more decompression 189 if(fBufferLength > 0) { 190 191 int newBytes = 0; 192 193 // fill the buffer completely, to guarantee one full character 194 if(fBufferLength != BUFSIZE) { 195 newBytes = fBuffer.length - fBufferLength; 196 197 // verify there are newBytes bytes in byteBuffer 198 if(byteBufferLimit - byteBufferStart < newBytes) 199 newBytes = byteBufferLimit - byteBufferStart; 200 201 System.arraycopy(byteBuffer, byteBufferStart, 202 fBuffer, fBufferLength, newBytes); 203 } 204 205 // reset buffer length to 0 before recursive call 206 fBufferLength = 0; 207 208 // call self recursively to decompress the buffer 209 int count = decompress(fBuffer, 0, fBuffer.length, null, 210 charBuffer, charBufferStart, 211 charBufferLimit); 212 213 // update the positions into the arrays 214 ucPos += count; 215 bytePos += newBytes; 216 } 217 218 // the main decompression loop 219 mainLoop: 220 while(bytePos < byteBufferLimit && ucPos < charBufferLimit) { 221 switch(fMode) { 222 case SINGLEBYTEMODE: 223 // single-byte mode decompression loop 224 singleByteModeLoop: 225 while(bytePos < byteBufferLimit && ucPos < charBufferLimit) { 226 aByte = byteBuffer[bytePos++] & 0xFF; 227 switch(aByte) { 228 // All bytes from 0x80 through 0xFF are remapped 229 // to chars or surrogate pairs according to the 230 // currently active window 231 case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: 232 case 0x85: case 0x86: case 0x87: case 0x88: case 0x89: 233 case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: 234 case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: 235 case 0x94: case 0x95: case 0x96: case 0x97: case 0x98: 236 case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: 237 case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: 238 case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: 239 case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: 240 case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: 241 case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: 242 case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB: 243 case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0: 244 case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5: 245 case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA: 246 case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: 247 case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: 248 case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9: 249 case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE: 250 case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3: 251 case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8: 252 case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED: 253 case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2: 254 case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7: 255 case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC: 256 case 0xFD: case 0xFE: case 0xFF: 257 // For offsets <= 0xFFFF, convert to a single char 258 // by adding the window's offset and subtracting 259 // the generic compression offset 260 if(fOffsets[ fCurrentWindow ] <= 0xFFFF) { 261 charBuffer[ucPos++] = (char) 262 (aByte + fOffsets[ fCurrentWindow ] 263 - COMPRESSIONOFFSET); 264 } 265 // For offsets > 0x10000, convert to a surrogate pair by 266 // normBase = window's offset - 0x10000 267 // high surr. = 0xD800 + (normBase >> 10) 268 // low surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F) 269 else { 270 // make sure there is enough room to write 271 // both characters 272 // if not, save state and break out 273 if((ucPos + 1) >= charBufferLimit) { 274 --bytePos; 275 System.arraycopy(byteBuffer, bytePos, 276 fBuffer, 0, 277 byteBufferLimit - bytePos); 278 fBufferLength = byteBufferLimit - bytePos; 279 bytePos += fBufferLength; 280 break mainLoop; 281 } 282 283 int normalizedBase = fOffsets[ fCurrentWindow ] 284 - 0x10000; 285 charBuffer[ucPos++] = (char) 286 (0xD800 + (normalizedBase >> 10)); 287 charBuffer[ucPos++] = (char) 288 (0xDC00 + (normalizedBase & 0x3FF)+(aByte & 0x7F)); 289 } 290 break; 291 292 // bytes from 0x20 through 0x7F are treated as ASCII and 293 // are remapped to chars by padding the high byte 294 // (this is the same as quoting from static window 0) 295 // NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D) 296 // are treated as ASCII as well 297 case 0x00: case 0x09: case 0x0A: case 0x0D: 298 case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: 299 case 0x25: case 0x26: case 0x27: case 0x28: case 0x29: 300 case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E: 301 case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33: 302 case 0x34: case 0x35: case 0x36: case 0x37: case 0x38: 303 case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: 304 case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42: 305 case 0x43: case 0x44: case 0x45: case 0x46: case 0x47: 306 case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: 307 case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51: 308 case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: 309 case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B: 310 case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60: 311 case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: 312 case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A: 313 case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F: 314 case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: 315 case 0x75: case 0x76: case 0x77: case 0x78: case 0x79: 316 case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E: 317 case 0x7F: 318 charBuffer[ucPos++] = (char) aByte; 319 break; 320 321 // quote unicode 322 case SQUOTEU: 323 // verify we have two bytes following tag 324 // if not, save state and break out 325 if( (bytePos + 1) >= byteBufferLimit ) { 326 --bytePos; 327 System.arraycopy(byteBuffer, bytePos, 328 fBuffer, 0, 329 byteBufferLimit - bytePos); 330 fBufferLength = byteBufferLimit - bytePos; 331 bytePos += fBufferLength; 332 break mainLoop; 333 } 334 335 aByte = byteBuffer[bytePos++]; 336 charBuffer[ucPos++] = (char) 337 (aByte << 8 | (byteBuffer[bytePos++] & 0xFF)); 338 break; 339 340 // switch to Unicode mode 341 case SCHANGEU: 342 fMode = UNICODEMODE; 343 break singleByteModeLoop; 344 //break; 345 346 // handle all quote tags 347 case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3: 348 case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7: 349 // verify there is a byte following the tag 350 // if not, save state and break out 351 if(bytePos >= byteBufferLimit) { 352 --bytePos; 353 System.arraycopy(byteBuffer, bytePos, 354 fBuffer, 0, 355 byteBufferLimit - bytePos); 356 fBufferLength = byteBufferLimit - bytePos; 357 bytePos += fBufferLength; 358 break mainLoop; 359 } 360 361 // if the byte is in the range 0x00 - 0x7F, use 362 // static window n otherwise, use dynamic window n 363 int dByte = byteBuffer[bytePos++] & 0xFF; 364 charBuffer[ucPos++] = (char) 365 (dByte+ (dByte >= 0x00 && dByte < 0x80 366 ? sOffsets[aByte - SQUOTE0] 367 : (fOffsets[aByte - SQUOTE0] 368 - COMPRESSIONOFFSET))); 369 break; 370 371 // handle all change tags 372 case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3: 373 case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7: 374 fCurrentWindow = aByte - SCHANGE0; 375 break; 376 377 // handle all define tags 378 case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3: 379 case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7: 380 // verify there is a byte following the tag 381 // if not, save state and break out 382 if(bytePos >= byteBufferLimit) { 383 --bytePos; 384 System.arraycopy(byteBuffer, bytePos, 385 fBuffer, 0, 386 byteBufferLimit - bytePos); 387 fBufferLength = byteBufferLimit - bytePos; 388 bytePos += fBufferLength; 389 break mainLoop; 390 } 391 392 fCurrentWindow = aByte - SDEFINE0; 393 fOffsets[fCurrentWindow] = 394 sOffsetTable[byteBuffer[bytePos++] & 0xFF]; 395 break; 396 397 // handle define extended tag 398 case SDEFINEX: 399 // verify we have two bytes following tag 400 // if not, save state and break out 401 if((bytePos + 1) >= byteBufferLimit ) { 402 --bytePos; 403 System.arraycopy(byteBuffer, bytePos, 404 fBuffer, 0, 405 byteBufferLimit - bytePos); 406 fBufferLength = byteBufferLimit - bytePos; 407 bytePos += fBufferLength; 408 break mainLoop; 409 } 410 411 aByte = byteBuffer[bytePos++] & 0xFF; 412 fCurrentWindow = (aByte & 0xE0) >> 5; 413 fOffsets[fCurrentWindow] = 0x10000 + 414 (0x80 * (((aByte & 0x1F) << 8) 415 | (byteBuffer[bytePos++] & 0xFF))); 416 break; 417 418 // reserved, shouldn't happen 419 case SRESERVED: 420 break; 421 422 } // end switch 423 } // end while 424 break; 425 426 case UNICODEMODE: 427 // unicode mode decompression loop 428 unicodeModeLoop: 429 while(bytePos < byteBufferLimit && ucPos < charBufferLimit) { 430 aByte = byteBuffer[bytePos++] & 0xFF; 431 switch(aByte) { 432 // handle all define tags 433 case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3: 434 case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7: 435 // verify there is a byte following tag 436 // if not, save state and break out 437 if(bytePos >= byteBufferLimit ) { 438 --bytePos; 439 System.arraycopy(byteBuffer, bytePos, 440 fBuffer, 0, 441 byteBufferLimit - bytePos); 442 fBufferLength = byteBufferLimit - bytePos; 443 bytePos += fBufferLength; 444 break mainLoop; 445 } 446 447 fCurrentWindow = aByte - UDEFINE0; 448 fOffsets[fCurrentWindow] = 449 sOffsetTable[byteBuffer[bytePos++] & 0xFF]; 450 fMode = SINGLEBYTEMODE; 451 break unicodeModeLoop; 452 //break; 453 454 // handle define extended tag 455 case UDEFINEX: 456 // verify we have two bytes following tag 457 // if not, save state and break out 458 if((bytePos + 1) >= byteBufferLimit ) { 459 --bytePos; 460 System.arraycopy(byteBuffer, bytePos, 461 fBuffer, 0, 462 byteBufferLimit - bytePos); 463 fBufferLength = byteBufferLimit - bytePos; 464 bytePos += fBufferLength; 465 break mainLoop; 466 } 467 468 aByte = byteBuffer[bytePos++] & 0xFF; 469 fCurrentWindow = (aByte & 0xE0) >> 5; 470 fOffsets[fCurrentWindow] = 0x10000 + 471 (0x80 * (((aByte & 0x1F) << 8) 472 | (byteBuffer[bytePos++] & 0xFF))); 473 fMode = SINGLEBYTEMODE; 474 break unicodeModeLoop; 475 //break; 476 477 // handle all change tags 478 case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3: 479 case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7: 480 fCurrentWindow = aByte - UCHANGE0; 481 fMode = SINGLEBYTEMODE; 482 break unicodeModeLoop; 483 //break; 484 485 // quote unicode 486 case UQUOTEU: 487 // verify we have two bytes following tag 488 // if not, save state and break out 489 if(bytePos >= byteBufferLimit - 1) { 490 --bytePos; 491 System.arraycopy(byteBuffer, bytePos, 492 fBuffer, 0, 493 byteBufferLimit - bytePos); 494 fBufferLength = byteBufferLimit - bytePos; 495 bytePos += fBufferLength; 496 break mainLoop; 497 } 498 499 aByte = byteBuffer[bytePos++]; 500 charBuffer[ucPos++] = (char) 501 (aByte << 8 | (byteBuffer[bytePos++] & 0xFF)); 502 break; 503 504 default: 505 // verify there is a byte following tag 506 // if not, save state and break out 507 if(bytePos >= byteBufferLimit ) { 508 --bytePos; 509 System.arraycopy(byteBuffer, bytePos, 510 fBuffer, 0, 511 byteBufferLimit - bytePos); 512 fBufferLength = byteBufferLimit - bytePos; 513 bytePos += fBufferLength; 514 break mainLoop; 515 } 516 517 charBuffer[ucPos++] = (char) 518 (aByte << 8 | (byteBuffer[bytePos++] & 0xFF)); 519 break; 520 521 } // end switch 522 } // end while 523 break; 524 525 } // end switch( fMode ) 526 } // end while 527 528 // fill in output parameter 529 if(bytesRead != null) 530 bytesRead [0] = (bytePos - byteBufferStart); 531 532 // return # of chars written 533 return (ucPos - charBufferStart); 534 } 535 536 /** 537 * Reset the decompressor to its initial state. 538 * @stable ICU 2.4 539 */ reset()540 public void reset() 541 { 542 // reset dynamic windows 543 fOffsets[0] = 0x0080; // Latin-1 544 fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A 545 fOffsets[2] = 0x0400; // Cyrillic 546 fOffsets[3] = 0x0600; // Arabic 547 fOffsets[4] = 0x0900; // Devanagari 548 fOffsets[5] = 0x3040; // Hiragana 549 fOffsets[6] = 0x30A0; // Katakana 550 fOffsets[7] = 0xFF00; // Fullwidth ASCII 551 552 553 fCurrentWindow = 0; // Make current window Latin-1 554 fMode = SINGLEBYTEMODE; // Always start in single-byte mode 555 fBufferLength = 0; // Empty buffer 556 } 557 } 558