1 /* Copyright 2016 The Chromium OS Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE file.
4  */
5 
6 #include <stdlib.h>
7 #include <stdint.h>
8 #include <sys/types.h>
9 
10 #ifdef CRAS_DBUS
11 #include <dbus/dbus.h>
12 #endif
13 
14 #include "cras_utf8.h"
15 #include "cras_util.h"
16 
/* Inclusive range of values permitted for a single byte of a sequence. */
typedef struct u8range {
	uint8_t min;
	uint8_t max;
} u8range_t;

/*
 * Each table below describes one family of well-formed multi-byte UTF-8
 * sequences. Entry 0 is the range of the lead byte, the following entries
 * are the ranges of the continuation bytes in order, and a { 0, 0 } entry
 * terminates the list.
 *
 * The restricted second-byte ranges are what reject ill-formed input:
 *   - lead bytes 0xc0 and 0xc1 are absent (overlong two-byte forms),
 *   - 0xe0 requires 0xa0..0xbf and 0xf0 requires 0x90..0xbf (overlong
 *     three- and four-byte forms),
 *   - 0xed is limited to 0x80..0x9f (UTF-16 surrogates),
 *   - 0xf4 is limited to 0x80..0x8f and 0xf5..0xff are absent (values
 *     above U+10FFFF).
 */
static const u8range_t kUTF8TwoByteSeq[] = {
	{ 0xc2, 0xdf },
	{ 0x80, 0xbf },
	{ 0, 0 }
};

static const u8range_t kUTF8ByteSeqE0[] = {
	{ 0xe0, 0xe0 },
	{ 0xa0, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 }
};

static const u8range_t kUTF8ByteSeqE1EC[] = {
	{ 0xe1, 0xec },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 }
};

static const u8range_t kUTF8ByteSeqED[] = {
	{ 0xed, 0xed },
	{ 0x80, 0x9f },
	{ 0x80, 0xbf },
	{ 0, 0 }
};

static const u8range_t kUTF8ByteSeqEEEF[] = {
	{ 0xee, 0xef },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 }
};

static const u8range_t kUTF8ByteSeqF0[] = {
	{ 0xf0, 0xf0 },
	{ 0x90, 0xbf },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 }
};

static const u8range_t kUTF8ByteSeqF1F3[] = {
	{ 0xf1, 0xf3 },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 }
};

static const u8range_t kUTF8ByteSeqF4[] = {
	{ 0xf4, 0xf4 },
	{ 0x80, 0x8f },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 }
};

/* Sentinel table: its lead-byte range has min == 0, which ends the search. */
static const u8range_t kUTF8NullRange[] = {
	{ 0, 0 }
};

typedef struct utf8seq {
	const u8range_t *ranges;
} utf8seq_t;

/* All accepted multi-byte sequence families, ended by kUTF8NullRange. */
static const utf8seq_t kUTF8Sequences[] = {
	{ kUTF8TwoByteSeq },
	{ kUTF8ByteSeqE0 },
	{ kUTF8ByteSeqE1EC },
	{ kUTF8ByteSeqED },
	{ kUTF8ByteSeqEEEF },
	{ kUTF8ByteSeqF0 },
	{ kUTF8ByteSeqF1F3 },
	{ kUTF8ByteSeqF4 },
	{ kUTF8NullRange }
};

/*
 * Looks up the sequence family that a non-ASCII lead byte belongs to.
 *
 * Returns a pointer to the first continuation-byte range of that family
 * (the list is terminated by a { 0, 0 } entry), or NULL when the byte
 * cannot start any well-formed sequence.
 */
static const u8range_t *utf8_continuation_ranges(uint8_t lead)
{
	const utf8seq_t *seq;

	for (seq = kUTF8Sequences; seq->ranges->min != 0; seq++) {
		if (lead >= seq->ranges->min && lead <= seq->ranges->max)
			return seq->ranges + 1;
	}

	return NULL;
}

/*
 * Checks whether a NUL-terminated string is well-formed UTF-8.
 *
 * A leading byte order mark needs no special handling: 0xef 0xbb 0xbf is
 * the regular three-byte encoding of U+FEFF and is accepted through the
 * 0xee..0xef table. (Matching the mark byte-by-byte on its own would let
 * incomplete marks such as a lone 0xef slip through as valid.)
 *
 * Args:
 *    string - The string to check. May be NULL.
 *    bad_pos - Optional. When not NULL it receives the byte offset at
 *        which checking stopped: the offset of the first offending byte,
 *        or the length of the string when the string is valid or ends in
 *        the middle of a sequence. Set to 0 when string is NULL.
 *
 * Returns:
 *    1 if the string is valid UTF-8, 0 otherwise (including NULL).
 */
int valid_utf8_string(const char *string, size_t *bad_pos)
{
	/* Next continuation range expected; NULL or a {0, 0} entry means
	 * that the next byte starts a new character. */
	const u8range_t *range = NULL;
	const char *pos = string;
	uint8_t byte;
	int ret = 1;

	if (!string) {
		/* Nothing was examined; do not do arithmetic on NULL. */
		if (bad_pos)
			*bad_pos = 0;
		return 0;
	}

	while ((byte = (uint8_t)*(pos++))) {
		if (range && range->min != 0) {
			/* Inside a sequence: only the expected continuation
			 * byte is acceptable. */
			if (byte < range->min || byte > range->max) {
				ret = 0;
				goto done;
			}
			range++;
			continue;
		}

		if (byte < 0x80) {
			/* ASCII character. */
			continue;
		}

		/* Start of a multi-byte sequence. */
		range = utf8_continuation_ranges(byte);
		if (!range) {
			/* This byte can never appear in UTF-8 here. */
			ret = 0;
			goto done;
		}
	}

	if (range && range->min != 0) {
		/* The string ended in the middle of a sequence. */
		ret = 0;
	}

done:
	/* pos was advanced past the byte that ended the scan. */
	if (bad_pos)
		*bad_pos = (size_t)(pos - string - 1);
	return ret;
}
182 
183 #ifdef CRAS_DBUS
184 /* Use the DBus implementation if available to ensure that the UTF-8
185  * sequences match those expected by the DBus implementation. */
186 
is_utf8_string(const char * string)187 int is_utf8_string(const char *string)
188 {
189 	return !!dbus_validate_utf8(string, NULL);
190 }
191 
192 #else
193 
/*
 * Built without DBus: use the validator implemented in this file.
 * Returns 1 if string is valid UTF-8, 0 otherwise.
 */
int is_utf8_string(const char *string)
{
	/* Only a yes/no answer is needed, so no error position is requested. */
	return valid_utf8_string(string, NULL);
}
197 
198 #endif
199