1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LATINIME_BYTE_ARRAY_UTILS_H
18 #define LATINIME_BYTE_ARRAY_UTILS_H
19 
20 #include <cstdint>
21 
22 #include "defines.h"
23 
24 namespace latinime {
25 
26 /**
27  * Utility methods for reading byte arrays.
28  */
29 class ByteArrayUtils {
30  public:
31     /**
32      * Integer writing
33      *
34      * Each method write a corresponding size integer in a big endian manner.
35      */
writeUintAndAdvancePosition(uint8_t * const buffer,const uint32_t data,const int size,int * const pos)36     static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer,
37             const uint32_t data, const int size, int *const pos) {
38         // size must be in 1 to 4.
39         ASSERT(size >= 1 && size <= 4);
40         switch (size) {
41             case 1:
42                 ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos);
43                 return;
44             case 2:
45                 ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos);
46                 return;
47             case 3:
48                 ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos);
49                 return;
50             case 4:
51                 ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos);
52                 return;
53             default:
54                 break;
55         }
56     }
57 
58     /**
59      * Integer reading
60      *
61      * Each method read a corresponding size integer in a big endian manner.
62      */
readUint32(const uint8_t * const buffer,const int pos)63     static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) {
64         return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16)
65                 ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3];
66     }
67 
readUint24(const uint8_t * const buffer,const int pos)68     static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) {
69         return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2];
70     }
71 
readUint16(const uint8_t * const buffer,const int pos)72     static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) {
73         return (buffer[pos] << 8) ^ buffer[pos + 1];
74     }
75 
readUint8(const uint8_t * const buffer,const int pos)76     static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) {
77         return buffer[pos];
78     }
79 
readUint32AndAdvancePosition(const uint8_t * const buffer,int * const pos)80     static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition(
81             const uint8_t *const buffer, int *const pos) {
82         const uint32_t value = readUint32(buffer, *pos);
83         *pos += 4;
84         return value;
85     }
86 
readSint24AndAdvancePosition(const uint8_t * const buffer,int * const pos)87     static AK_FORCE_INLINE int readSint24AndAdvancePosition(
88             const uint8_t *const buffer, int *const pos) {
89         const uint8_t value = readUint8(buffer, *pos);
90         if (value < 0x80) {
91             return readUint24AndAdvancePosition(buffer, pos);
92         } else {
93             (*pos)++;
94             return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos));
95         }
96     }
97 
readUint24AndAdvancePosition(const uint8_t * const buffer,int * const pos)98     static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition(
99             const uint8_t *const buffer, int *const pos) {
100         const uint32_t value = readUint24(buffer, *pos);
101         *pos += 3;
102         return value;
103     }
104 
readUint16AndAdvancePosition(const uint8_t * const buffer,int * const pos)105     static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition(
106             const uint8_t *const buffer, int *const pos) {
107         const uint16_t value = readUint16(buffer, *pos);
108         *pos += 2;
109         return value;
110     }
111 
readUint8AndAdvancePosition(const uint8_t * const buffer,int * const pos)112     static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition(
113             const uint8_t *const buffer, int *const pos) {
114         return buffer[(*pos)++];
115     }
116 
readUint(const uint8_t * const buffer,const int size,const int pos)117     static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer,
118             const int size, const int pos) {
119         // size must be in 1 to 4.
120         ASSERT(size >= 1 && size <= 4);
121         switch (size) {
122             case 1:
123                 return ByteArrayUtils::readUint8(buffer, pos);
124             case 2:
125                 return ByteArrayUtils::readUint16(buffer, pos);
126             case 3:
127                 return ByteArrayUtils::readUint24(buffer, pos);
128             case 4:
129                 return ByteArrayUtils::readUint32(buffer, pos);
130             default:
131                 return 0;
132         }
133     }
134 
135     /**
136      * Code Point Reading
137      *
138      * 1 byte = bbbbbbbb match
139      * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
140      * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
141      *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
142      *       00011111 would be outside unicode.
143      * else: iso-latin-1 code
144      * This allows for the whole unicode range to be encoded, including chars outside of
145      * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
146      * characters which should never happen anyway (and still work, but take 3 bytes).
147      */
readCodePoint(const uint8_t * const buffer,const int pos)148     static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
149         int p = pos;
150         return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
151     }
152 
readCodePointAndAdvancePosition(const uint8_t * const buffer,const int * const codePointTable,int * const pos)153     static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
154             const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
155         /*
156          * codePointTable is an array to convert the most frequent characters in this dictionary to
157          * 1 byte code points. It is only made of the original code points of the most frequent
158          * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
159          * The original code points are restored by picking the code points at the indices of the
160          * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
161          */
162         const uint8_t firstByte = readUint8(buffer, *pos);
163         if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
164             if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
165                 *pos += 1;
166                 return NOT_A_CODE_POINT;
167             } else {
168                 return readUint24AndAdvancePosition(buffer, pos);
169             }
170         } else {
171             *pos += 1;
172             if (codePointTable) {
173                 return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
174             }
175             return firstByte;
176         }
177     }
178 
179     /**
180      * String (array of code points) Reading
181      *
182      * Reads code points until the terminator is found.
183      */
184     // Returns the length of the string.
readStringAndAdvancePosition(const uint8_t * const buffer,const int maxLength,const int * const codePointTable,int * const outBuffer,int * const pos)185     static int readStringAndAdvancePosition(const uint8_t *const buffer,
186             const int maxLength, const int *const codePointTable, int *const outBuffer,
187             int *const pos) {
188         int length = 0;
189         int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
190         while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
191             outBuffer[length++] = codePoint;
192             codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
193         }
194         return length;
195     }
196 
197     // Advances the position and returns the length of the string.
advancePositionToBehindString(const uint8_t * const buffer,const int maxLength,int * const pos)198     static int advancePositionToBehindString(
199             const uint8_t *const buffer, const int maxLength, int *const pos) {
200         int length = 0;
201         int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
202         while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
203             codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
204             length++;
205         }
206         return length;
207     }
208 
209     /**
210      * String (array of code points) Writing
211      */
writeCodePointsAndAdvancePosition(uint8_t * const buffer,const int * const codePoints,const int codePointCount,const bool writesTerminator,int * const pos)212     static void writeCodePointsAndAdvancePosition(uint8_t *const buffer,
213             const int *const codePoints, const int codePointCount, const bool writesTerminator,
214             int *const pos) {
215         for (int i = 0; i < codePointCount; ++i) {
216             const int codePoint = codePoints[i];
217             if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
218                 break;
219             } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
220                     || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
221                 // three bytes character.
222                 writeUint24AndAdvancePosition(buffer, codePoint, pos);
223             } else {
224                 // one byte character.
225                 writeUint8AndAdvancePosition(buffer, codePoint, pos);
226             }
227         }
228         if (writesTerminator) {
229             writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos);
230         }
231     }
232 
calculateRequiredByteCountToStoreCodePoints(const int * const codePoints,const int codePointCount,const bool writesTerminator)233     static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints,
234             const int codePointCount, const bool writesTerminator) {
235         int byteCount = 0;
236         for (int i = 0; i < codePointCount; ++i) {
237             const int codePoint = codePoints[i];
238             if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
239                 break;
240             } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
241                     || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
242                 // three bytes character.
243                 byteCount += 3;
244             } else {
245                 // one byte character.
246                 byteCount += 1;
247             }
248         }
249         if (writesTerminator) {
250             // The terminator is one byte.
251             byteCount += 1;
252         }
253         return byteCount;
254     }
255 
256  private:
257     DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
258 
259     static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE;
260     static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE;
261     static const uint8_t CHARACTER_ARRAY_TERMINATOR;
262 
writeUint32AndAdvancePosition(uint8_t * const buffer,const uint32_t data,int * const pos)263     static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
264             const uint32_t data, int *const pos) {
265         buffer[(*pos)++] = (data >> 24) & 0xFF;
266         buffer[(*pos)++] = (data >> 16) & 0xFF;
267         buffer[(*pos)++] = (data >> 8) & 0xFF;
268         buffer[(*pos)++] = data & 0xFF;
269     }
270 
writeUint24AndAdvancePosition(uint8_t * const buffer,const uint32_t data,int * const pos)271     static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer,
272             const uint32_t data, int *const pos) {
273         buffer[(*pos)++] = (data >> 16) & 0xFF;
274         buffer[(*pos)++] = (data >> 8) & 0xFF;
275         buffer[(*pos)++] = data & 0xFF;
276     }
277 
writeUint16AndAdvancePosition(uint8_t * const buffer,const uint16_t data,int * const pos)278     static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer,
279             const uint16_t data, int *const pos) {
280         buffer[(*pos)++] = (data >> 8) & 0xFF;
281         buffer[(*pos)++] = data & 0xFF;
282     }
283 
writeUint8AndAdvancePosition(uint8_t * const buffer,const uint8_t data,int * const pos)284     static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer,
285             const uint8_t data, int *const pos) {
286         buffer[(*pos)++] = data & 0xFF;
287     }
288 };
289 } // namespace latinime
290 #endif /* LATINIME_BYTE_ARRAY_UTILS_H */
291