1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2000-2003, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File writejava.c
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   01/11/02    Ram        Creation.
17 *******************************************************************************
18 */
19 #include <stdbool.h>
20 #include "rle.h"
/**
 * The ESCAPE character is used during run-length encoding of 16-bit
 * values.  It signals a run of identical chars.  A literal value equal
 * to ESCAPE is written as two consecutive ESCAPE units.
 */
static const uint16_t ESCAPE = 0xA5A5;

/**
 * The ESCAPE_BYTE character is used during run-length encoding of byte
 * values.  It signals a run of identical bytes.  A literal byte equal
 * to ESCAPE_BYTE is written as two consecutive ESCAPE_BYTE bytes.
 */
static const uint8_t ESCAPE_BYTE = (uint8_t)0xA5;
32 
33 /**
34  * Append a byte to the given StringBuffer, packing two bytes into each
35  * character.  The state parameter maintains intermediary data between
36  * calls.
37  * @param state A two-element array, with state[0] == 0 if this is the
38  * first byte of a pair, or state[0] != 0 if this is the second byte
39  * of a pair, in which case state[1] is the first byte.
40  */
41 static uint16_t*
appendEncodedByte(uint16_t * buffer,uint16_t * buffLimit,uint8_t value,uint8_t state[],UErrorCode * status)42 appendEncodedByte(uint16_t* buffer, uint16_t* buffLimit, uint8_t value, uint8_t state[],UErrorCode* status) {
43     if(!status || U_FAILURE(*status)){
44         return NULL;
45     }
46     if (state[0] != 0) {
47         uint16_t c = (uint16_t) ((state[1] << 8) | (((int32_t) value) & 0xFF));
48         if(buffer < buffLimit){
49             *buffer++ = c;
50         }else{
51             *status = U_BUFFER_OVERFLOW_ERROR;
52         }
53         state[0] = 0;
54         return buffer;
55     }
56     else {
57         state[0] = 1;
58         state[1] = value;
59         return buffer;
60     }
61 }
62 /**
63  * Encode a run, possibly a degenerate run (of < 4 values).
64  * @param length The length of the run; must be > 0 && <= 0xFF.
65  */
66 static uint16_t*
encodeRunByte(uint16_t * buffer,uint16_t * bufLimit,uint8_t value,int32_t length,uint8_t state[],UErrorCode * status)67 encodeRunByte(uint16_t* buffer,uint16_t* bufLimit, uint8_t value, int32_t length, uint8_t state[], UErrorCode* status) {
68     if(!status || U_FAILURE(*status)){
69         return NULL;
70     }
71     if (length < 4) {
72         int32_t j=0;
73         for (; j<length; ++j) {
74             if (value == ESCAPE_BYTE) {
75                 buffer = appendEncodedByte(buffer,bufLimit, ESCAPE_BYTE, state,status);
76             }
77             buffer = appendEncodedByte(buffer,bufLimit, value, state, status);
78         }
79     }
80     else {
81         if (length == ESCAPE_BYTE) {
82             if (value == ESCAPE_BYTE){
83                buffer =  appendEncodedByte(buffer, bufLimit,ESCAPE_BYTE, state,status);
84             }
85             buffer = appendEncodedByte(buffer,bufLimit, value, state, status);
86             --length;
87         }
88         buffer = appendEncodedByte(buffer,bufLimit, ESCAPE_BYTE, state,status);
89         buffer = appendEncodedByte(buffer,bufLimit, (char)length, state, status);
90         buffer = appendEncodedByte(buffer,bufLimit, value, state, status); /* Don't need to escape this value*/
91     }
92     return buffer;
93 }
94 
/**
 * Store value at *buffer and advance buffer when buffer is below bufLimit;
 * otherwise set *status to U_BUFFER_OVERFLOW_ERROR.  num is incremented in
 * either case.
 *
 * buffer and num must be modifiable lvalues.  buffer is evaluated twice,
 * so do not pass an expression with side effects.
 */
#define APPEND( buffer, bufLimit, value, num, status) UPRV_BLOCK_MACRO_BEGIN { \
    if((buffer)<(bufLimit)){                  \
        *(buffer)++=(value);                  \
    }else{                                    \
        *(status) = U_BUFFER_OVERFLOW_ERROR;  \
    }                                         \
    (num)++;                                  \
} UPRV_BLOCK_MACRO_END
103 
104 /**
105  * Encode a run, possibly a degenerate run (of < 4 values).
106  * @param length The length of the run; must be > 0 && <= 0xFFFF.
107  */
108 static uint16_t*
encodeRunShort(uint16_t * buffer,uint16_t * bufLimit,uint16_t value,int32_t length,UErrorCode * status)109 encodeRunShort(uint16_t* buffer,uint16_t* bufLimit, uint16_t value, int32_t length,UErrorCode* status) {
110     int32_t num=0;
111     if (length < 4) {
112         int j=0;
113         for (; j<length; ++j) {
114             if (value == (int32_t) ESCAPE){
115                 APPEND(buffer,bufLimit,ESCAPE, num, status);
116 
117             }
118             APPEND(buffer,bufLimit,value,num, status);
119         }
120     }
121     else {
122         if (length == (int32_t) ESCAPE) {
123             if (value == (int32_t) ESCAPE){
124                 APPEND(buffer,bufLimit,ESCAPE,num,status);
125 
126             }
127             APPEND(buffer,bufLimit,value,num,status);
128             --length;
129         }
130         APPEND(buffer,bufLimit,ESCAPE,num,status);
131         APPEND(buffer,bufLimit,(uint16_t) length, num,status);
132         APPEND(buffer,bufLimit,(uint16_t)value, num, status); /* Don't need to escape this value */
133     }
134     return buffer;
135 }
136 
137 /**
138  * Construct a string representing a char array.  Use run-length encoding.
139  * A character represents itself, unless it is the ESCAPE character.  Then
140  * the following notations are possible:
141  *   ESCAPE ESCAPE   ESCAPE literal
142  *   ESCAPE n c      n instances of character c
143  * Since an encoded run occupies 3 characters, we only encode runs of 4 or
144  * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
145  * If we encounter a run where n == ESCAPE, we represent this as:
146  *   c ESCAPE n-1 c
147  * The ESCAPE value is chosen so as not to collide with commonly
148  * seen values.
149  */
150 int32_t
usArrayToRLEString(const uint16_t * src,int32_t srcLen,uint16_t * buffer,int32_t bufLen,UErrorCode * status)151 usArrayToRLEString(const uint16_t* src,int32_t srcLen,uint16_t* buffer, int32_t bufLen,UErrorCode* status) {
152     uint16_t* bufLimit =  buffer+bufLen;
153     uint16_t* saveBuffer = buffer;
154     if(buffer < bufLimit){
155         *buffer++ =  (uint16_t)(srcLen>>16);
156         if(buffer<bufLimit){
157             uint16_t runValue = src[0];
158             int32_t runLength = 1;
159             int i=1;
160             *buffer++ = (uint16_t) srcLen;
161 
162             for (; i<srcLen; ++i) {
163                 uint16_t s = src[i];
164                 if (s == runValue && runLength < 0xFFFF){
165                     ++runLength;
166                 }else {
167                     buffer = encodeRunShort(buffer,bufLimit, (uint16_t)runValue, runLength,status);
168                     runValue = s;
169                     runLength = 1;
170                 }
171             }
172             buffer= encodeRunShort(buffer,bufLimit,(uint16_t)runValue, runLength,status);
173         }else{
174             *status = U_BUFFER_OVERFLOW_ERROR;
175         }
176     }else{
177         *status = U_BUFFER_OVERFLOW_ERROR;
178     }
179     return (int32_t)(buffer - saveBuffer);
180 }
181 
182 /**
183  * Construct a string representing a byte array.  Use run-length encoding.
184  * Two bytes are packed into a single char, with a single extra zero byte at
185  * the end if needed.  A byte represents itself, unless it is the
186  * ESCAPE_BYTE.  Then the following notations are possible:
187  *   ESCAPE_BYTE ESCAPE_BYTE   ESCAPE_BYTE literal
188  *   ESCAPE_BYTE n b           n instances of byte b
189  * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
190  * more bytes.  Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
191  * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
192  *   b ESCAPE_BYTE n-1 b
193  * The ESCAPE_BYTE value is chosen so as not to collide with commonly
194  * seen values.
195  */
196 int32_t
byteArrayToRLEString(const uint8_t * src,int32_t srcLen,uint16_t * buffer,int32_t bufLen,UErrorCode * status)197 byteArrayToRLEString(const uint8_t* src,int32_t srcLen, uint16_t* buffer,int32_t bufLen, UErrorCode* status) {
198     const uint16_t* saveBuf = buffer;
199     uint16_t* bufLimit =  buffer+bufLen;
200     if(buffer < bufLimit){
201         *buffer++ = ((uint16_t) (srcLen >> 16));
202 
203         if(buffer<bufLimit){
204             uint8_t runValue = src[0];
205             int runLength = 1;
206             uint8_t state[2]= {0};
207             int i=1;
208             *buffer++=((uint16_t) srcLen);
209             for (; i<srcLen; ++i) {
210                 uint8_t b = src[i];
211                 if (b == runValue && runLength < 0xFF){
212                     ++runLength;
213                 }
214                 else {
215                     buffer = encodeRunByte(buffer, bufLimit,runValue, runLength, state,status);
216                     runValue = b;
217                     runLength = 1;
218                 }
219             }
220             buffer = encodeRunByte(buffer,bufLimit, runValue, runLength, state, status);
221 
222             /* We must save the final byte, if there is one, by padding
223              * an extra zero.
224              */
225             if (state[0] != 0) {
226                 buffer = appendEncodedByte(buffer,bufLimit, 0, state ,status);
227             }
228         }else{
229             *status = U_BUFFER_OVERFLOW_ERROR;
230         }
231     }else{
232         *status = U_BUFFER_OVERFLOW_ERROR;
233     }
234     return (int32_t) (buffer - saveBuf);
235 }
236 
237 
238 /**
239  * Construct an array of shorts from a run-length encoded string.
240  */
241 int32_t
rleStringToUCharArray(uint16_t * src,int32_t srcLen,uint16_t * target,int32_t tgtLen,UErrorCode * status)242 rleStringToUCharArray(uint16_t* src, int32_t srcLen, uint16_t* target, int32_t tgtLen, UErrorCode* status) {
243     int32_t length = 0;
244     int32_t ai = 0;
245     int i=2;
246 
247     if(!status || U_FAILURE(*status)){
248         return 0;
249     }
250     /* the source is null terminated */
251     if(srcLen == -1){
252         srcLen = u_strlen(src);
253     }
254     if(srcLen <= 2){
255         return 2;
256     }
257     length = (((int32_t) src[0]) << 16) | ((int32_t) src[1]);
258 
259     if(target == NULL){
260         return length;
261     }
262     if(tgtLen < length){
263         *status = U_BUFFER_OVERFLOW_ERROR;
264         return length;
265     }
266 
267     for (; i<srcLen; ++i) {
268         uint16_t c = src[i];
269         if (c == ESCAPE) {
270             c = src[++i];
271             if (c == ESCAPE) {
272                 target[ai++] = c;
273             } else {
274                 int32_t runLength = (int32_t) c;
275                 uint16_t runValue = src[++i];
276                 int j=0;
277                 for (; j<runLength; ++j) {
278                     target[ai++] = runValue;
279                 }
280             }
281         }
282         else {
283             target[ai++] = c;
284         }
285     }
286 
287     if (ai != length){
288         *status = U_INTERNAL_PROGRAM_ERROR;
289     }
290 
291     return length;
292 }
293 
294 /**
295  * Construct an array of bytes from a run-length encoded string.
296  */
297 int32_t
rleStringToByteArray(uint16_t * src,int32_t srcLen,uint8_t * target,int32_t tgtLen,UErrorCode * status)298 rleStringToByteArray(uint16_t* src, int32_t srcLen, uint8_t* target, int32_t tgtLen, UErrorCode* status) {
299 
300     int32_t length = 0;
301     UBool nextChar = true;
302     uint16_t c = 0;
303     int32_t node = 0;
304     int32_t runLength = 0;
305     int32_t i = 2;
306     int32_t ai=0;
307 
308     if(!status || U_FAILURE(*status)){
309         return 0;
310     }
311     /* the source is null terminated */
312     if(srcLen == -1){
313         srcLen = u_strlen(src);
314     }
315     if(srcLen <= 2){
316         return 2;
317     }
318     length = (((int32_t) src[0]) << 16) | ((int32_t) src[1]);
319 
320     if(target == NULL){
321         return length;
322     }
323     if(tgtLen < length){
324         *status = U_BUFFER_OVERFLOW_ERROR;
325         return length;
326     }
327 
328     for (; ai<tgtLen; ) {
329        /* This part of the loop places the next byte into the local
330         * variable 'b' each time through the loop.  It keeps the
331         * current character in 'c' and uses the boolean 'nextChar'
332         * to see if we've taken both bytes out of 'c' yet.
333         */
334         uint8_t b;
335         if (nextChar) {
336             c = src[i++];
337             b = (uint8_t) (c >> 8);
338             nextChar = false;
339         }
340         else {
341             b = (uint8_t) (c & 0xFF);
342             nextChar = true;
343         }
344 
345        /* This part of the loop is a tiny state machine which handles
346         * the parsing of the run-length encoding.  This would be simpler
347         * if we could look ahead, but we can't, so we use 'node' to
348         * move between three nodes in the state machine.
349         */
350         switch (node) {
351         case 0:
352             /* Normal idle node */
353             if (b == ESCAPE_BYTE) {
354                 node = 1;
355             }
356             else {
357                 target[ai++] = b;
358             }
359             break;
360         case 1:
361            /* We have seen one ESCAPE_BYTE; we expect either a second
362             * one, or a run length and value.
363             */
364             if (b == ESCAPE_BYTE) {
365                 target[ai++] = ESCAPE_BYTE;
366                 node = 0;
367             }
368             else {
369                 runLength = b;
370                 node = 2;
371             }
372             break;
373         case 2:
374             {
375                 int j=0;
376                /* We have seen an ESCAPE_BYTE and length byte.  We interpret
377                 * the next byte as the value to be repeated.
378                 */
379                 for (; j<runLength; ++j){
380                     if(ai<tgtLen){
381                         target[ai++] = b;
382                     }else{
383                         *status = U_BUFFER_OVERFLOW_ERROR;
384                         return ai;
385                     }
386                 }
387                 node = 0;
388                 break;
389             }
390         }
391     }
392 
393     if (node != 0){
394         *status = U_INTERNAL_PROGRAM_ERROR;
395         /*("Bad run-length encoded byte array")*/
396         return 0;
397     }
398 
399 
400     if (i != srcLen){
401         /*("Excess data in RLE byte array string");*/
402         *status = U_INTERNAL_PROGRAM_ERROR;
403         return ai;
404     }
405 
406     return ai;
407 }
408 
409