1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2016, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.text;
9 
10 /**
11 * A decompression engine implementing the Standard Compression Scheme
12 * for Unicode (SCSU) as outlined in <A
13 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
14 * Report #6</A>.
15 *
16 * <P><STRONG>USAGE</STRONG></P>
17 *
18 * <P>The static methods on <TT>UnicodeDecompressor</TT> may be used in a
19 * straightforward manner to decompress simple strings:</P>
20 *
21 * <PRE>
22 *  byte [] compressed = ... ; // get compressed bytes from somewhere
23 *  String result = UnicodeDecompressor.decompress(compressed);
24 * </PRE>
25 *
26 * <P>The static methods have a fairly large memory footprint.
27 * For finer-grained control over memory usage,
28 * <TT>UnicodeDecompressor</TT> offers more powerful APIs allowing
29 * iterative decompression:</P>
30 *
31 * <PRE>
32 *  // Decompress an array "bytes" of length "len" using a buffer of 512 chars
33 *  // to the Writer "out"
34 *
35 *  UnicodeDecompressor myDecompressor         = new UnicodeDecompressor();
36 *  final static int    BUFSIZE                = 512;
37 *  char []             charBuffer             = new char [ BUFSIZE ];
38 *  int                 charsWritten           = 0;
39 *  int []              bytesRead              = new int [1];
40 *  int                 totalBytesDecompressed = 0;
41 *  int                 totalCharsWritten      = 0;
42 *
43 *  do {
44 *    // do the decompression
45 *    charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed,
46 *                                             len, bytesRead,
47 *                                             charBuffer, 0, BUFSIZE);
48 *
49 *    // do something with the current set of chars
50 *    out.write(charBuffer, 0, charsWritten);
51 *
52 *    // update the no. of bytes decompressed
53 *    totalBytesDecompressed += bytesRead[0];
54 *
55 *    // update the no. of chars written
56 *    totalCharsWritten += charsWritten;
57 *
58 *  } while(totalBytesDecompressed &lt; len);
59 *
60 *  myDecompressor.reset(); // reuse decompressor
61 * </PRE>
62 *
63 * <P>Decompression is performed according to the standard set forth in
64 * <A HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
65 * Report #6</A></P>
66 *
67 * @see UnicodeCompressor
68 *
69 * @author Stephen F. Booth
70 * @stable ICU 2.4
71 */
72 public final class UnicodeDecompressor implements SCSU
73 {
74     //==========================
75     // Instance variables
76     //==========================
77 
78     /** Alias to current dynamic window */
79     private int       fCurrentWindow   = 0;
80 
81     /** Dynamic compression window offsets */
82     private int []    fOffsets         = new int [ NUMWINDOWS ];
83 
84     /** Current compression mode */
85     private int       fMode            = SINGLEBYTEMODE;
86 
87     /** Size of our internal buffer */
88     private final static int BUFSIZE   = 3;
89 
90     /** Internal buffer for saving state */
91     private byte []   fBuffer          = new byte [BUFSIZE];
92 
93     /** Number of characters in our internal buffer */
94     private int       fBufferLength    = 0;
95 
96 
97     /**
98      * Create a UnicodeDecompressor.
99      * Sets all windows to their default values.
100      * @see #reset
101      * @stable ICU 2.4
102      */
UnicodeDecompressor()103     public UnicodeDecompressor(){
104         reset();              // initialize to defaults
105     }
106 
107     /**
108      * Decompress a byte array into a String.
109      * @param buffer The byte array to decompress.
110      * @return A String containing the decompressed characters.
111      * @see #decompress(byte [], int, int)
112      * @stable ICU 2.4
113      */
decompress(byte [] buffer)114     public static String decompress(byte [] buffer){
115         char [] buf = decompress(buffer, 0, buffer.length);
116         return new String(buf);
117     }
118 
119     /**
120      * Decompress a byte array into a Unicode character array.
121      * @param buffer The byte array to decompress.
122      * @param start The start of the byte run to decompress.
123      * @param limit The limit of the byte run to decompress.
124      * @return A character array containing the decompressed bytes.
125      * @see #decompress(byte [])
126      * @stable ICU 2.4
127      */
decompress(byte [] buffer, int start, int limit)128     public static char [] decompress(byte [] buffer, int start, int limit) {
129         UnicodeDecompressor comp = new UnicodeDecompressor();
130 
131         // use a buffer we know will never overflow
132         // in the worst case, each byte will decompress
133         // to a surrogate pair (buffer must be at least 2 chars)
134         int len = Math.max(2, 2 * (limit - start));
135         char [] temp = new char [len];
136 
137         int charCount = comp.decompress(buffer, start, limit, null,
138                         temp, 0, len);
139 
140         char [] result = new char [charCount];
141         System.arraycopy(temp, 0, result, 0, charCount);
142         return result;
143     }
144 
145     /**
146      * Decompress a byte array into a Unicode character array.
147      *
148      * This function will either completely fill the output buffer,
149      * or consume the entire input.
150      *
151      * @param byteBuffer The byte buffer to decompress.
152      * @param byteBufferStart The start of the byte run to decompress.
153      * @param byteBufferLimit The limit of the byte run to decompress.
154      * @param bytesRead A one-element array.  If not null, on return
155      * the number of bytes read from byteBuffer.
156      * @param charBuffer A buffer to receive the decompressed data.
157      * This buffer must be at minimum two characters in size.
158      * @param charBufferStart The starting offset to which to write
159      * decompressed data.
160      * @param charBufferLimit The limiting offset for writing
161      * decompressed data.
162      * @return The number of Unicode characters written to charBuffer.
163      * @stable ICU 2.4
164      */
decompress(byte [] byteBuffer, int byteBufferStart, int byteBufferLimit, int [] bytesRead, char [] charBuffer, int charBufferStart, int charBufferLimit)165     public int decompress(byte []    byteBuffer,
166               int        byteBufferStart,
167               int        byteBufferLimit,
168               int []     bytesRead,
169               char []    charBuffer,
170               int        charBufferStart,
171               int        charBufferLimit)
172     {
173     // the current position in the source byte buffer
174     int bytePos      = byteBufferStart;
175 
176     // the current position in the target char buffer
177     int ucPos        = charBufferStart;
178 
179         // the current byte from the source buffer
180     int aByte        = 0x00;
181 
182 
183     // charBuffer must be at least 2 chars in size
184     if(charBuffer.length < 2 || (charBufferLimit - charBufferStart) < 2)
185         throw new IllegalArgumentException("charBuffer.length < 2");
186 
187     // if our internal buffer isn't empty, flush its contents
188     // to the output buffer before doing any more decompression
189     if(fBufferLength > 0) {
190 
191         int newBytes = 0;
192 
193         // fill the buffer completely, to guarantee one full character
194         if(fBufferLength != BUFSIZE) {
195         newBytes = fBuffer.length - fBufferLength;
196 
197         // verify there are newBytes bytes in byteBuffer
198         if(byteBufferLimit - byteBufferStart < newBytes)
199             newBytes = byteBufferLimit - byteBufferStart;
200 
201         System.arraycopy(byteBuffer, byteBufferStart,
202                  fBuffer, fBufferLength, newBytes);
203         }
204 
205         // reset buffer length to 0 before recursive call
206         fBufferLength = 0;
207 
208         // call self recursively to decompress the buffer
209         int count = decompress(fBuffer, 0, fBuffer.length, null,
210                    charBuffer, charBufferStart,
211                    charBufferLimit);
212 
213         // update the positions into the arrays
214         ucPos += count;
215         bytePos += newBytes;
216     }
217 
218         // the main decompression loop
219     mainLoop:
220     while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
221         switch(fMode) {
222         case SINGLEBYTEMODE:
223         // single-byte mode decompression loop
224         singleByteModeLoop:
225         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
226         aByte = byteBuffer[bytePos++] & 0xFF;
227         switch(aByte) {
228             // All bytes from 0x80 through 0xFF are remapped
229             // to chars or surrogate pairs according to the
230             // currently active window
231         case 0x80: case 0x81: case 0x82: case 0x83: case 0x84:
232         case 0x85: case 0x86: case 0x87: case 0x88: case 0x89:
233         case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E:
234         case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93:
235         case 0x94: case 0x95: case 0x96: case 0x97: case 0x98:
236         case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D:
237         case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2:
238         case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7:
239         case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC:
240         case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1:
241         case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6:
242         case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB:
243         case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0:
244         case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5:
245         case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA:
246         case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF:
247         case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4:
248         case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9:
249         case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE:
250         case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3:
251         case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8:
252         case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED:
253         case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2:
254         case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7:
255         case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC:
256         case 0xFD: case 0xFE: case 0xFF:
257             // For offsets <= 0xFFFF, convert to a single char
258             // by adding the window's offset and subtracting
259             // the generic compression offset
260             if(fOffsets[ fCurrentWindow ] <= 0xFFFF) {
261             charBuffer[ucPos++] = (char)
262                 (aByte + fOffsets[ fCurrentWindow ]
263                  - COMPRESSIONOFFSET);
264             }
265             // For offsets > 0x10000, convert to a surrogate pair by
266             // normBase = window's offset - 0x10000
267             // high surr. = 0xD800 + (normBase >> 10)
268             // low  surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F)
269             else {
270             // make sure there is enough room to write
271             // both characters
272             // if not, save state and break out
273             if((ucPos + 1) >= charBufferLimit) {
274                 --bytePos;
275                 System.arraycopy(byteBuffer, bytePos,
276                          fBuffer, 0,
277                          byteBufferLimit - bytePos);
278                 fBufferLength = byteBufferLimit - bytePos;
279                 bytePos += fBufferLength;
280                 break mainLoop;
281             }
282 
283             int normalizedBase = fOffsets[ fCurrentWindow ]
284                 - 0x10000;
285             charBuffer[ucPos++] = (char)
286                 (0xD800 + (normalizedBase >> 10));
287             charBuffer[ucPos++] = (char)
288                 (0xDC00 + (normalizedBase & 0x3FF)+(aByte & 0x7F));
289             }
290             break;
291 
292             // bytes from 0x20 through 0x7F are treated as ASCII and
293             // are remapped to chars by padding the high byte
294             // (this is the same as quoting from static window 0)
295             // NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D)
296             // are treated as ASCII as well
297         case 0x00: case 0x09: case 0x0A: case 0x0D:
298         case 0x20: case 0x21: case 0x22: case 0x23: case 0x24:
299         case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:
300         case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E:
301         case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33:
302         case 0x34: case 0x35: case 0x36: case 0x37: case 0x38:
303         case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D:
304         case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42:
305         case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
306         case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:
307         case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51:
308         case 0x52: case 0x53: case 0x54: case 0x55: case 0x56:
309         case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B:
310         case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60:
311         case 0x61: case 0x62: case 0x63: case 0x64: case 0x65:
312         case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A:
313         case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:
314         case 0x70: case 0x71: case 0x72: case 0x73: case 0x74:
315         case 0x75: case 0x76: case 0x77: case 0x78: case 0x79:
316         case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E:
317         case 0x7F:
318             charBuffer[ucPos++] = (char) aByte;
319             break;
320 
321             // quote unicode
322         case SQUOTEU:
323             // verify we have two bytes following tag
324             // if not, save state and break out
325             if( (bytePos + 1) >= byteBufferLimit ) {
326             --bytePos;
327             System.arraycopy(byteBuffer, bytePos,
328                      fBuffer, 0,
329                      byteBufferLimit - bytePos);
330             fBufferLength = byteBufferLimit - bytePos;
331             bytePos += fBufferLength;
332             break mainLoop;
333             }
334 
335             aByte = byteBuffer[bytePos++];
336             charBuffer[ucPos++] = (char)
337             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
338             break;
339 
340             // switch to Unicode mode
341         case SCHANGEU:
342             fMode = UNICODEMODE;
343             break singleByteModeLoop;
344             //break;
345 
346             // handle all quote tags
347         case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3:
348         case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7:
349             // verify there is a byte following the tag
350             // if not, save state and break out
351             if(bytePos >= byteBufferLimit) {
352             --bytePos;
353             System.arraycopy(byteBuffer, bytePos,
354                      fBuffer, 0,
355                      byteBufferLimit - bytePos);
356             fBufferLength = byteBufferLimit - bytePos;
357             bytePos += fBufferLength;
358             break mainLoop;
359             }
360 
361             // if the byte is in the range 0x00 - 0x7F, use
362             // static window n otherwise, use dynamic window n
363             int dByte = byteBuffer[bytePos++] & 0xFF;
364             charBuffer[ucPos++] = (char)
365             (dByte+ (dByte >= 0x00 && dByte < 0x80
366                  ? sOffsets[aByte - SQUOTE0]
367                  : (fOffsets[aByte - SQUOTE0]
368                     - COMPRESSIONOFFSET)));
369             break;
370 
371             // handle all change tags
372         case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3:
373         case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7:
374             fCurrentWindow = aByte - SCHANGE0;
375             break;
376 
377             // handle all define tags
378         case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3:
379         case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7:
380             // verify there is a byte following the tag
381             // if not, save state and break out
382             if(bytePos >= byteBufferLimit) {
383             --bytePos;
384             System.arraycopy(byteBuffer, bytePos,
385                      fBuffer, 0,
386                      byteBufferLimit - bytePos);
387             fBufferLength = byteBufferLimit - bytePos;
388             bytePos += fBufferLength;
389             break mainLoop;
390             }
391 
392             fCurrentWindow = aByte - SDEFINE0;
393             fOffsets[fCurrentWindow] =
394             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
395             break;
396 
397             // handle define extended tag
398         case SDEFINEX:
399             // verify we have two bytes following tag
400             // if not, save state and break out
401             if((bytePos + 1) >= byteBufferLimit ) {
402             --bytePos;
403             System.arraycopy(byteBuffer, bytePos,
404                      fBuffer, 0,
405                      byteBufferLimit - bytePos);
406             fBufferLength = byteBufferLimit - bytePos;
407             bytePos += fBufferLength;
408             break mainLoop;
409             }
410 
411             aByte = byteBuffer[bytePos++] & 0xFF;
412             fCurrentWindow = (aByte & 0xE0) >> 5;
413             fOffsets[fCurrentWindow] = 0x10000 +
414             (0x80 * (((aByte & 0x1F) << 8)
415                  | (byteBuffer[bytePos++] & 0xFF)));
416             break;
417 
418             // reserved, shouldn't happen
419         case SRESERVED:
420             break;
421 
422         } // end switch
423         } // end while
424         break;
425 
426         case UNICODEMODE:
427         // unicode mode decompression loop
428         unicodeModeLoop:
429         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
430         aByte = byteBuffer[bytePos++] & 0xFF;
431         switch(aByte) {
432             // handle all define tags
433         case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3:
434         case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7:
435             // verify there is a byte following tag
436             // if not, save state and break out
437             if(bytePos >= byteBufferLimit ) {
438             --bytePos;
439             System.arraycopy(byteBuffer, bytePos,
440                      fBuffer, 0,
441                      byteBufferLimit - bytePos);
442             fBufferLength = byteBufferLimit - bytePos;
443             bytePos += fBufferLength;
444             break mainLoop;
445             }
446 
447             fCurrentWindow = aByte - UDEFINE0;
448             fOffsets[fCurrentWindow] =
449             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
450             fMode = SINGLEBYTEMODE;
451             break unicodeModeLoop;
452             //break;
453 
454             // handle define extended tag
455         case UDEFINEX:
456             // verify we have two bytes following tag
457             // if not, save state and break out
458             if((bytePos + 1) >= byteBufferLimit ) {
459             --bytePos;
460             System.arraycopy(byteBuffer, bytePos,
461                      fBuffer, 0,
462                      byteBufferLimit - bytePos);
463             fBufferLength = byteBufferLimit - bytePos;
464             bytePos += fBufferLength;
465             break mainLoop;
466             }
467 
468             aByte = byteBuffer[bytePos++] & 0xFF;
469             fCurrentWindow = (aByte & 0xE0) >> 5;
470             fOffsets[fCurrentWindow] = 0x10000 +
471             (0x80 * (((aByte & 0x1F) << 8)
472                  | (byteBuffer[bytePos++] & 0xFF)));
473             fMode = SINGLEBYTEMODE;
474             break unicodeModeLoop;
475             //break;
476 
477             // handle all change tags
478         case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3:
479         case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7:
480             fCurrentWindow = aByte - UCHANGE0;
481             fMode = SINGLEBYTEMODE;
482             break unicodeModeLoop;
483             //break;
484 
485             // quote unicode
486         case UQUOTEU:
487             // verify we have two bytes following tag
488             // if not, save state and break out
489             if(bytePos >= byteBufferLimit  - 1) {
490             --bytePos;
491             System.arraycopy(byteBuffer, bytePos,
492                      fBuffer, 0,
493                      byteBufferLimit - bytePos);
494             fBufferLength = byteBufferLimit - bytePos;
495             bytePos += fBufferLength;
496             break mainLoop;
497             }
498 
499             aByte = byteBuffer[bytePos++];
500             charBuffer[ucPos++] = (char)
501             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
502             break;
503 
504         default:
505             // verify there is a byte following tag
506             // if not, save state and break out
507             if(bytePos >= byteBufferLimit ) {
508             --bytePos;
509             System.arraycopy(byteBuffer, bytePos,
510                      fBuffer, 0,
511                      byteBufferLimit - bytePos);
512             fBufferLength = byteBufferLimit - bytePos;
513             bytePos += fBufferLength;
514             break mainLoop;
515             }
516 
517             charBuffer[ucPos++] = (char)
518             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
519             break;
520 
521         } // end switch
522         } // end while
523         break;
524 
525         } // end switch( fMode )
526     } // end while
527 
528         // fill in output parameter
529     if(bytesRead != null)
530         bytesRead [0] = (bytePos - byteBufferStart);
531 
532         // return # of chars written
533     return (ucPos - charBufferStart);
534     }
535 
536     /**
537      * Reset the decompressor to its initial state.
538      * @stable ICU 2.4
539      */
reset()540     public void reset()
541     {
542         // reset dynamic windows
543         fOffsets[0] = 0x0080;    // Latin-1
544         fOffsets[1] = 0x00C0;    // Latin-1 Supplement + Latin Extended-A
545         fOffsets[2] = 0x0400;    // Cyrillic
546         fOffsets[3] = 0x0600;    // Arabic
547         fOffsets[4] = 0x0900;    // Devanagari
548         fOffsets[5] = 0x3040;    // Hiragana
549         fOffsets[6] = 0x30A0;    // Katakana
550         fOffsets[7] = 0xFF00;    // Fullwidth ASCII
551 
552 
553         fCurrentWindow  = 0;                // Make current window Latin-1
554         fMode           = SINGLEBYTEMODE;   // Always start in single-byte mode
555         fBufferLength   = 0;                // Empty buffer
556     }
557 }
558