1 /* Copyright 2016 The Chromium OS Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE file.
4  */
5 
6 #include <stdlib.h>
7 #include <stdint.h>
8 #include <sys/types.h>
9 
10 #ifdef CRAS_DBUS
11 #include <dbus/dbus.h>
12 #endif
13 
14 #include "cras_utf8.h"
15 #include "cras_util.h"
16 
/* The three bytes of the UTF-8 byte order mark (the encoding of U+FEFF). */
static const uint8_t kUTF8ByteOrderMask[] = {
	0xEF,
	0xBB,
	0xBF,
};
18 
/* Inclusive range of values allowed for a single byte of a UTF-8 sequence.
 * An entry with min == 0 terminates a table of ranges. */
struct u8range {
	uint8_t min;
	uint8_t max;
};
typedef struct u8range u8range_t;

/*
 * Each table below describes one family of multi-byte sequences: the first
 * entry is the range of the lead byte, the following entries are the ranges
 * of the trailing bytes in order, and { 0, 0 } ends the table.
 */

/* Two bytes: U+0080 - U+07FF. */
static const u8range_t kUTF8TwoByteSeq[] = {
	{ 0xC2, 0xDF }, { 0x80, 0xBF }, { 0, 0 },
};

/* Three bytes, lead 0xE0: U+0800 - U+0FFF. */
static const u8range_t kUTF8ByteSeqE0[] = {
	{ 0xE0, 0xE0 }, { 0xA0, 0xBF }, { 0x80, 0xBF }, { 0, 0 },
};

/* Three bytes, lead 0xE1 - 0xEC: U+1000 - U+CFFF. */
static const u8range_t kUTF8ByteSeqE1EC[] = {
	{ 0xE1, 0xEC }, { 0x80, 0xBF }, { 0x80, 0xBF }, { 0, 0 },
};

/* Three bytes, lead 0xED: U+D000 - U+D7FF (stops before the surrogates). */
static const u8range_t kUTF8ByteSeqED[] = {
	{ 0xED, 0xED }, { 0x80, 0x9F }, { 0x80, 0xBF }, { 0, 0 },
};

/* Three bytes, lead 0xEE - 0xEF: U+E000 - U+FFFF. */
static const u8range_t kUTF8ByteSeqEEEF[] = {
	{ 0xEE, 0xEF }, { 0x80, 0xBF }, { 0x80, 0xBF }, { 0, 0 },
};

/* Four bytes, lead 0xF0: U+10000 - U+3FFFF. */
static const u8range_t kUTF8ByteSeqF0[] = {
	{ 0xF0, 0xF0 },
	{ 0x90, 0xBF },
	{ 0x80, 0xBF },
	{ 0x80, 0xBF },
	{ 0, 0 },
};

/* Four bytes, lead 0xF1 - 0xF3: U+40000 - U+FFFFF. */
static const u8range_t kUTF8ByteSeqF1F3[] = {
	{ 0xF1, 0xF3 },
	{ 0x80, 0xBF },
	{ 0x80, 0xBF },
	{ 0x80, 0xBF },
	{ 0, 0 },
};

/* Four bytes, lead 0xF4: U+100000 - U+10FFFF. */
static const u8range_t kUTF8ByteSeqF4[] = {
	{ 0xF4, 0xF4 },
	{ 0x80, 0x8F },
	{ 0x80, 0xBF },
	{ 0x80, 0xBF },
	{ 0, 0 },
};

/* Terminator-only table; marks the end of kUTF8Sequences. */
static const u8range_t kUTF8NullRange[] = {
	{ 0, 0 },
};

/* One supported family of multi-byte sequences. */
struct utf8seq {
	const u8range_t *ranges;
};
typedef struct utf8seq utf8seq_t;

/* Every supported sequence family, ending with the terminator-only table. */
static const utf8seq_t kUTF8Sequences[] = {
	{ kUTF8TwoByteSeq },
	{ kUTF8ByteSeqE0 },
	{ kUTF8ByteSeqE1EC },
	{ kUTF8ByteSeqED },
	{ kUTF8ByteSeqEEEF },
	{ kUTF8ByteSeqF0 },
	{ kUTF8ByteSeqF1F3 },
	{ kUTF8ByteSeqF4 },
	{ kUTF8NullRange },
};
84 
valid_utf8_string(const char * string,size_t * bad_pos)85 int valid_utf8_string(const char *string, size_t *bad_pos)
86 {
87 	int bom_chars = 0;
88 	uint8_t byte;
89 	const char *pos = string;
90 	int ret = 1;
91 	const utf8seq_t *seq = NULL;
92 	const u8range_t *range = NULL;
93 
94 	if (!pos) {
95 		ret = 0;
96 		goto error;
97 	}
98 
99 	while ((byte = (uint8_t) * (pos++))) {
100 		if (!range || range->min == 0) {
101 			if (byte < 128) {
102 				/* Ascii character. */
103 				continue;
104 			}
105 
106 			if (bom_chars < ARRAY_SIZE(kUTF8ByteOrderMask)) {
107 				if (byte == kUTF8ByteOrderMask[bom_chars]) {
108 					bom_chars++;
109 					continue;
110 				} else {
111 					/* Characters not matching BOM.
112 					 * Rewind and assume that there is
113 					 * no BOM. */
114 					bom_chars =
115 						ARRAY_SIZE(kUTF8ByteOrderMask);
116 					pos = string;
117 					continue;
118 				}
119 			}
120 
121 			/* Find the matching sequence of characters by
122 			 * matching the first character in the sequence.
123 			 */
124 			seq = kUTF8Sequences;
125 			while (seq->ranges->min != 0) {
126 				if (byte >= seq->ranges->min &&
127 				    byte <= seq->ranges->max) {
128 					/* Matching sequence. */
129 					break;
130 				}
131 				seq++;
132 			}
133 
134 			if (seq->ranges->min == 0) {
135 				/* Could not find a matching sequence. */
136 				ret = 0;
137 				goto error;
138 			}
139 
140 			/* Found the appropriate sequence. */
141 			range = seq->ranges + 1;
142 			continue;
143 		}
144 
145 		if (byte >= range->min && byte <= range->max) {
146 			range++;
147 			continue;
148 		}
149 
150 		/* This character doesn't belong in UTF8. */
151 		ret = 0;
152 		goto error;
153 	}
154 
155 	if (range && range->min != 0) {
156 		/* Stopped in the middle of a sequence. */
157 		ret = 0;
158 	}
159 
160 error:
161 	if (bad_pos)
162 		*bad_pos = pos - string - 1;
163 	return ret;
164 }
165 
166 #ifdef CRAS_DBUS
167 /* Use the DBus implementation if available to ensure that the UTF-8
168  * sequences match those expected by the DBus implementation. */
169 
/* Returns 1 if dbus_validate_utf8() accepts |string|, 0 otherwise.
 *
 * A NULL |string| is reported as invalid without calling into DBus, so that
 * this variant agrees with the non-DBus implementation, which also returns 0
 * for NULL. */
int is_utf8_string(const char *string)
{
	if (!string)
		return 0;

	return !!dbus_validate_utf8(string, NULL);
}
174 
175 #else
176 
/* Returns the result of valid_utf8_string() for |string|; the position of
 * any invalid byte is not requested. */
int is_utf8_string(const char *string)
{
	const int is_valid = valid_utf8_string(string, NULL);

	return is_valid;
}
181 
182 #endif
183