1 /******************************************************************************
2 
3  @File         PVRTUnicode.cpp
4 
5  @Title        PVRTUnicode
6 
7  @Version       @Version
8 
9  @Copyright    Copyright (c) Imagination Technologies Limited.
10 
11  @Platform     All
12 
13  @Description  A small collection of functions used to decode Unicode formats to
14                individual code points.
15 
16 ******************************************************************************/
17 #include "PVRTUnicode.h"
18 #include <string.h>
19 
20 /****************************************************************************
21 ** Constants
22 ****************************************************************************/
23 const PVRTuint32 c_u32ReplChar = 0xFFFD;
24 
25 #define VALID_ASCII 0x80
26 #define TAIL_MASK 0x3F
27 #define BYTES_PER_TAIL 6
28 
29 #define UTF16_SURG_H_MARK 0xD800
30 #define UTF16_SURG_H_END  0xDBFF
31 #define UTF16_SURG_L_MARK 0xDC00
32 #define UTF16_SURG_L_END  0xDFFF
33 
34 #define UNICODE_NONCHAR_MARK 0xFDD0
35 #define UNICODE_NONCHAR_END  0xFDEF
36 #define UNICODE_RESERVED	 0xFFFE
37 #define UNICODE_MAX			 0x10FFFF
38 
39 #define MAX_LEN 0x8FFF
40 
41 /****************************************************************************
42 ** A table which allows quick lookup to determine the number of bytes of a
43 ** UTF8 code point.
44 ****************************************************************************/
45 const PVRTuint8 c_u8UTF8Lengths[256] =
46 {
47 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
48 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
53 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
54 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
55 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
56 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
58 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
59 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
60 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
61 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
62 	3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
63 };
64 
65 /****************************************************************************
66 ** A table which allows quick lookup to determine whether a UTF8 sequence
67 ** is 'overlong'.
68 ****************************************************************************/
69 const PVRTuint32 c_u32MinVals[4] =
70 {
71 	0x00000000,		// 0 tail bytes
72 	0x00000080,		// 1 tail bytes
73 	0x00000800,		// 2 tail bytes
74 	0x00010000,		// 3 tail bytes
75 };
76 
77 /*!***************************************************************************
78  @Function			CheckGenericUnicode
79  @Input				c32			A UTF32 character/Unicode code point
80  @Returns			Success or failure.
81  @Description		Checks that the decoded code point is valid.
82 *****************************************************************************/
CheckGenericUnicode(PVRTuint32 c32)83 static bool CheckGenericUnicode(PVRTuint32 c32)
84 {
85 	// Check that this value isn't a UTF16 surrogate mask.
86 	if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END)
87 		return false;
88 	// Check non-char values
89 	if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END)
90 		return false;
91 	// Check reserved values
92 	if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED)
93 		return false;
94 	// Check max value.
95 	if(c32 > UNICODE_MAX)
96 		return false;
97 
98 	return true;
99 }
100 
101 /*!***************************************************************************
102  @Function			PVRTUnicodeUTF8ToUTF32
103  @Input				pUTF8			A UTF8 string, which is null terminated.
104  @Output			aUTF32			An array of Unicode code points.
105  @Returns			Success or failure.
106  @Description		Decodes a UTF8-encoded string in to Unicode code points
107 					(UTF32). If pUTF8 is not null terminated, the results are
108 					undefined.
109 *****************************************************************************/
PVRTUnicodeUTF8ToUTF32(const PVRTuint8 * const pUTF8,CPVRTArray<PVRTuint32> & aUTF32)110 EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
111 {
112 	unsigned int uiTailLen, uiIndex;
113 	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
114 	PVRTuint32 c32;
115 
116 	const PVRTuint8* pC = pUTF8;
117 	while(*pC)
118 	{
119 		// Quick optimisation for ASCII characters
120 		while(*pC && *pC < VALID_ASCII)
121 		{
122 			aUTF32.Append(*pC++);
123 		}
124 		// Done
125 		if(!*pC)
126 			break;
127 
128 		c32 = *pC++;
129 		uiTailLen = c_u8UTF8Lengths[c32];
130 
131 		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
132 		// Also check to make sure the tail length is inside the provided buffer.
133 		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
134 			return PVR_OVERFLOW;
135 
136 		c32 &= (TAIL_MASK >> uiTailLen);	// Get the data out of the first byte. This depends on the length of the tail.
137 
138 		// Get the data out of each tail byte
139 		uiIndex = 0;
140 		while(uiIndex < uiTailLen)
141 		{
142 			if((pC[uiIndex] & 0xC0) != 0x80)
143 				return PVR_FAIL;		// Invalid tail byte!
144 
145 			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
146 			uiIndex++;
147 		}
148 
149 		pC += uiIndex;
150 
151 		// Check overlong values.
152 		if(c32 < c_u32MinVals[uiTailLen])
153 			return PVR_FAIL;
154 
155 		if(!CheckGenericUnicode(c32))
156 			return PVR_FAIL;
157 
158 		// OK
159 		aUTF32.Append(c32);
160 	}
161 
162 	return PVR_SUCCESS;
163 }
164 
165 /*!***************************************************************************
166  @Function			PVRTUnicodeUTF16ToUTF32
167  @Input				pUTF16			A UTF16 string, which is null terminated.
168  @Output			aUTF32			An array of Unicode code points.
169  @Returns			Success or failure.
170  @Description		Decodes a UTF16-encoded string in to Unicode code points
171 					(UTF32). If pUTF16 is not null terminated, the results are
172 					undefined.
173 *****************************************************************************/
PVRTUnicodeUTF16ToUTF32(const PVRTuint16 * const pUTF16,CPVRTArray<PVRTuint32> & aUTF32)174 EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
175 {
176 	const PVRTuint16* pC = pUTF16;
177 
178 	// Determine the number of shorts
179 	while(*++pC && (pC - pUTF16) < MAX_LEN);
180 	unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);
181 
182 	if(uiBufferLen == MAX_LEN)
183 		return PVR_OVERFLOW;		// Probably not NULL terminated.
184 
185 	// Reset to start.
186 	pC = pUTF16;
187 
188 	PVRTuint32 c32;
189 	while(*pC)
190 	{
191 		// Straight copy. We'll check for surrogate pairs next...
192 		c32 = *pC++;
193 
194 		// Check surrogate pair
195 		if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
196 		{
197 			// Make sure the next 2 bytes are in range...
198 			if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
199 				return PVR_OVERFLOW;
200 
201 			// Check that the next value is in the low surrogate range
202 			if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
203 				return PVR_FAIL;
204 
205 			// Decode
206 			c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
207 			pC++;
208 		}
209 
210 		if(!CheckGenericUnicode(c32))
211 			return PVR_FAIL;
212 
213 		// OK
214 		aUTF32.Append(c32);
215 	}
216 
217 	return PVR_SUCCESS;
218 }
219 
220 /*!***************************************************************************
221  @Function			PVRTUnicodeUTF8Length
222  @Input				pUTF8			A UTF8 string, which is null terminated.
223  @Returns			The length of the string, in Unicode code points.
224  @Description		Calculates the length of a UTF8 string. If pUTF8 is
225 					not null terminated, the results are undefined.
226 *****************************************************************************/
PVRTUnicodeUTF8Length(const PVRTuint8 * const pUTF8)227 unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
228 {
229 	const PVRTuint8* pC = pUTF8;
230 
231 	unsigned int charCount = 0;
232 	unsigned int mask;
233 	while(*pC)
234 	{
235 		// Quick optimisation for ASCII characters
236 		const PVRTuint8* pStart = pC;
237 		while(*pC && *pC < VALID_ASCII)
238 			pC++;
239 
240 		charCount += (unsigned int) (pC - pStart);
241 
242 		// Done
243 		if(!*pC)
244 			break;
245 
246 		mask = *pC & 0xF0;
247 		switch(mask)
248 		{
249 		case 0xF0: pC++;
250 		case 0xE0: pC++;
251 		case 0xC0: pC++;
252 			break;
253 		default:
254 			_ASSERT(!"Invalid tail byte!");
255 			return 0;
256 		}
257 
258 		pC++;
259 		charCount++;
260 	}
261 
262 	return charCount;
263 }
264 
265 /*!***************************************************************************
266  @Function			PVRTUnicodeUTF16Length
267  @Input				pUTF16			A UTF16 string, which is null terminated.
268  @Returns			The length of the string, in Unicode code points.
269  @Description		Calculates the length of a UTF16 string.
270 					If pUTF16 is not null terminated, the results are
271 					undefined.
272 *****************************************************************************/
PVRTUnicodeUTF16Length(const PVRTuint16 * const pUTF16)273 unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
274 {
275 	const PVRTuint16* pC = pUTF16;
276 	unsigned int charCount = 0;
277 	while(*pC && (pC - pUTF16) < MAX_LEN)
278 	{
279 		if(	pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
280 		 && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END)
281 		{
282 			pC += 2;
283 		}
284 		else
285 		{
286 			pC += 1;
287 		}
288 
289 		charCount++;
290 	}
291 
292 	return charCount;
293 }
294 
295 /*!***************************************************************************
296  @Function			PVRTUnicodeValidUTF8
297  @Input				pUTF8			A UTF8 string, which is null terminated.
298  @Returns			true or false
299  @Description		Checks whether the encoding of a UTF8 string is valid.
300 					If pUTF8 is not null terminated, the results are undefined.
301 *****************************************************************************/
PVRTUnicodeValidUTF8(const PVRTuint8 * const pUTF8)302 bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
303 {
304 	unsigned int uiTailLen, uiIndex;
305 	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
306 	const PVRTuint8* pC = pUTF8;
307 	while(*pC)
308 	{
309 		// Quick optimisation for ASCII characters
310 		while(*pC && *pC < VALID_ASCII)	pC++;
311 		// Done?
312 		if(!*pC)
313 			break;
314 
315 		PVRTuint32 c32 = *pC++;
316 		uiTailLen = c_u8UTF8Lengths[c32];
317 
318 		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
319 		// Also check to make sure the tail length is inside the provided buffer.
320 		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
321 			return false;
322 
323 		// Get the data out of each tail byte
324 		uiIndex = 0;
325 		while(uiIndex < uiTailLen)
326 		{
327 			if((pC[uiIndex] & 0xC0) != 0x80)
328 				return false;		// Invalid tail byte!
329 
330 			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
331 			uiIndex++;
332 		}
333 
334 		pC += uiIndex;
335 
336 		// Check overlong values.
337 		if(c32 < c_u32MinVals[uiTailLen])
338 			return false;
339 		if(!CheckGenericUnicode(c32))
340 			return false;
341 	}
342 
343 	return true;
344 }
345 
346 /*****************************************************************************
347  End of file (PVRTUnicode.cpp)
348 *****************************************************************************/
349 
350