1 /* Copyright 2016 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6 #include <stdlib.h> 7 #include <stdint.h> 8 #include <sys/types.h> 9 10 #ifdef CRAS_DBUS 11 #include <dbus/dbus.h> 12 #endif 13 14 #include "cras_utf8.h" 15 #include "cras_util.h" 16 17 static const uint8_t kUTF8ByteOrderMask[3] = { 0xef, 0xbb, 0xbf }; 18 19 typedef struct u8range { 20 uint8_t min; 21 uint8_t max; 22 } u8range_t; 23 24 static const u8range_t kUTF8TwoByteSeq[] = { 25 { 0xc2, 0xdf }, 26 { 0x80, 0xbf }, 27 { 0, 0 } 28 }; 29 30 static const u8range_t kUTF8ByteSeqE0[] = { 31 { 0xe0, 0xe0 }, 32 { 0xa0, 0xbf }, 33 { 0x80, 0xbf }, 34 { 0, 0 } 35 }; 36 37 static const u8range_t kUTF8ByteSeqE1EC[] = { 38 { 0xe1, 0xec }, 39 { 0x80, 0xbf }, 40 { 0x80, 0xbf }, 41 { 0, 0 } 42 }; 43 44 static const u8range_t kUTF8ByteSeqED[] = { 45 { 0xed, 0xed }, 46 { 0x80, 0x9f }, 47 { 0x80, 0xbf }, 48 { 0, 0 } 49 }; 50 51 static const u8range_t kUTF8ByteSeqEEEF[] = { 52 { 0xee, 0xef }, 53 { 0x80, 0xbf }, 54 { 0x80, 0xbf }, 55 { 0, 0 } 56 }; 57 58 static const u8range_t kUTF8ByteSeqF0[] = { 59 { 0xf0, 0xf0 }, 60 { 0x90, 0xbf }, 61 { 0x80, 0xbf }, 62 { 0x80, 0xbf }, 63 { 0, 0 } 64 }; 65 66 static const u8range_t kUTF8ByteSeqF1F3[] = { 67 { 0xf1, 0xf3 }, 68 { 0x80, 0xbf }, 69 { 0x80, 0xbf }, 70 { 0x80, 0xbf }, 71 { 0, 0 } 72 }; 73 74 static const u8range_t kUTF8ByteSeqF4[] = { 75 { 0xf4, 0xf4 }, 76 { 0x80, 0x8f }, 77 { 0x80, 0xbf }, 78 { 0x80, 0xbf }, 79 { 0, 0 } 80 }; 81 82 static const u8range_t kUTF8NullRange[] = { 83 { 0, 0 } 84 }; 85 86 typedef struct utf8seq { 87 const u8range_t *ranges; 88 } utf8seq_t; 89 90 static const utf8seq_t kUTF8Sequences[] = { 91 { kUTF8TwoByteSeq }, 92 { kUTF8ByteSeqE0 }, 93 { kUTF8ByteSeqE1EC }, 94 { kUTF8ByteSeqED }, 95 { kUTF8ByteSeqEEEF }, 96 { kUTF8ByteSeqF0 }, 97 { kUTF8ByteSeqF1F3 }, 98 { kUTF8ByteSeqF4 }, 99 { kUTF8NullRange } 100 }; 101 102 int valid_utf8_string(const char *string, size_t *bad_pos) 103 { 104 int bom_chars = 0; 105 uint8_t byte; 106 const char *pos = string; 107 int ret = 1; 108 const utf8seq_t *seq = NULL; 109 const u8range_t *range = NULL; 110 111 if (!pos) { 112 ret = 0; 113 goto error; 114 } 115 116 while ((byte = (uint8_t)*(pos++))) { 117 if (!range || range->min == 0) { 118 if (byte < 128) { 119 /* Ascii character. */ 120 continue; 121 } 122 123 if (bom_chars < ARRAY_SIZE(kUTF8ByteOrderMask)) { 124 if (byte == kUTF8ByteOrderMask[bom_chars]) { 125 bom_chars++; 126 continue; 127 } else { 128 /* Characters not matching BOM. 129 * Rewind and assume that there is 130 * no BOM. */ 131 bom_chars = 132 ARRAY_SIZE(kUTF8ByteOrderMask); 133 pos = string; 134 continue; 135 } 136 } 137 138 /* Find the matching sequence of characters by 139 * matching the first character in the sequence. 140 */ 141 seq = kUTF8Sequences; 142 while (seq->ranges->min != 0) { 143 if (byte >= seq->ranges->min && 144 byte <= seq->ranges->max) { 145 /* Matching sequence. */ 146 break; 147 } 148 seq++; 149 } 150 151 if (seq->ranges->min == 0) { 152 /* Could not find a matching sequence. */ 153 ret = 0; 154 goto error; 155 } 156 157 /* Found the appropriate sequence. */ 158 range = seq->ranges + 1; 159 continue; 160 } 161 162 if (byte >= range->min && byte <= range->max) { 163 range++; 164 continue; 165 } 166 167 /* This character doesn't belong in UTF8. */ 168 ret = 0; 169 goto error; 170 } 171 172 if (range && range->min != 0) { 173 /* Stopped in the middle of a sequence. */ 174 ret = 0; 175 } 176 177 error: 178 if (bad_pos) 179 *bad_pos = pos - string - 1; 180 return ret; 181 } 182 183 #ifdef CRAS_DBUS 184 /* Use the DBus implementation if available to ensure that the UTF-8 185 * sequences match those expected by the DBus implementation. */ 186 187 int is_utf8_string(const char *string) 188 { 189 return !!dbus_validate_utf8(string, NULL); 190 } 191 192 #else 193 194 int is_utf8_string (const char *string) { 195 return valid_utf8_string(string, NULL); 196 } 197 198 #endif 199