1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <gtest/gtest.h>
18 #include <unicode/utf.h>
19 #include <cstdlib>
20
21 // src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null.
22 // Size is returned in an out parameter because gtest needs a void return for ASSERT to work.
ParseUnicode(uint16_t * buf,size_t buf_size,const char * src,size_t * result_size,size_t * offset)23 void ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size,
24 size_t* offset) {
25 size_t input_ix = 0;
26 size_t output_ix = 0;
27 bool seen_offset = false;
28
29 while (src[input_ix] != 0) {
30 switch (src[input_ix]) {
31 case '\'':
32 // single ASCII char
33 ASSERT_LT(src[input_ix], 0x80);
34 input_ix++;
35 ASSERT_NE(src[input_ix], 0);
36 ASSERT_LT(output_ix, buf_size);
37 buf[output_ix++] = (uint16_t)src[input_ix++];
38 ASSERT_EQ(src[input_ix], '\'');
39 input_ix++;
40 break;
41 case 'u':
42 case 'U': {
43 // Unicode codepoint in hex syntax
44 input_ix++;
45 ASSERT_EQ(src[input_ix], '+');
46 input_ix++;
47 char* endptr = (char*)src + input_ix;
48 unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16);
49 size_t num_hex_digits = endptr - (src + input_ix);
50 ASSERT_GE(num_hex_digits, 4u); // also triggers on invalid number syntax, digits = 0
51 ASSERT_LE(num_hex_digits, 6u);
52 ASSERT_LE(codepoint, 0x10FFFFu);
53 input_ix += num_hex_digits;
54 if (U16_LENGTH(codepoint) == 1) {
55 ASSERT_LE(output_ix + 1, buf_size);
56 buf[output_ix++] = codepoint;
57 } else {
58 // UTF-16 encoding
59 ASSERT_LE(output_ix + 2, buf_size);
60 buf[output_ix++] = U16_LEAD(codepoint);
61 buf[output_ix++] = U16_TRAIL(codepoint);
62 }
63 break;
64 }
65 case ' ':
66 input_ix++;
67 break;
68 case '|':
69 ASSERT_FALSE(seen_offset);
70 ASSERT_NE(offset, nullptr);
71 *offset = output_ix;
72 seen_offset = true;
73 input_ix++;
74 break;
75 default:
76 FAIL(); // unexpected character
77 }
78 }
79 ASSERT_NE(result_size, nullptr);
80 *result_size = output_ix;
81 ASSERT_TRUE(seen_offset || offset == nullptr);
82 }
83
// Checks that ParseUnicode handles a BMP code point, a supplementary code point (emitted as a
// surrogate pair), the "|" offset marker and a quoted ASCII character.
TEST(UnicodeUtils, parse) {
    const size_t BUF_SIZE = 256;
    uint16_t buf[BUF_SIZE] = {};
    // Initialized so that nothing indeterminate is ever read, even if parsing bails out early.
    size_t offset = 0;
    size_t size = 0;
    // ParseUnicode reports errors through fatal gtest assertions, which only return from
    // ParseUnicode itself. Stop this test as well in that case, since the outputs below
    // would not have been filled in.
    ASSERT_NO_FATAL_FAILURE(
            ParseUnicode(buf, BUF_SIZE, "U+000D U+1F431 | 'a'", &size, &offset));
    EXPECT_EQ(size, 4u);
    EXPECT_EQ(offset, 3u);
    EXPECT_EQ(buf[0], 0x000D);
    EXPECT_EQ(buf[1], 0xD83D);  // lead surrogate of U+1F431
    EXPECT_EQ(buf[2], 0xDC31);  // trail surrogate of U+1F431
    EXPECT_EQ(buf[3], 'a');
}
97