1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <gtest/gtest.h>
18 #include <unicode/utf.h>
19 #include <cstdlib>
20 
21 // src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null.
22 // Size is returned in an out parameter because gtest needs a void return for ASSERT to work.
ParseUnicode(uint16_t * buf,size_t buf_size,const char * src,size_t * result_size,size_t * offset)23 void ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size,
24         size_t* offset) {
25     size_t input_ix = 0;
26     size_t output_ix = 0;
27     bool seen_offset = false;
28 
29     while (src[input_ix] != 0) {
30         switch (src[input_ix]) {
31         case '\'':
32             // single ASCII char
33             ASSERT_LT(src[input_ix], 0x80);
34             input_ix++;
35             ASSERT_NE(src[input_ix], 0);
36             ASSERT_LT(output_ix, buf_size);
37             buf[output_ix++] = (uint16_t)src[input_ix++];
38             ASSERT_EQ(src[input_ix], '\'');
39             input_ix++;
40             break;
41         case 'u':
42         case 'U': {
43             // Unicode codepoint in hex syntax
44             input_ix++;
45             ASSERT_EQ(src[input_ix], '+');
46             input_ix++;
47             char* endptr = (char*)src + input_ix;
48             unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16);
49             size_t num_hex_digits = endptr - (src + input_ix);
50             ASSERT_GE(num_hex_digits, 4u);  // also triggers on invalid number syntax, digits = 0
51             ASSERT_LE(num_hex_digits, 6u);
52             ASSERT_LE(codepoint, 0x10FFFFu);
53             input_ix += num_hex_digits;
54             if (U16_LENGTH(codepoint) == 1) {
55                 ASSERT_LE(output_ix + 1, buf_size);
56                 buf[output_ix++] = codepoint;
57             } else {
58                 // UTF-16 encoding
59                 ASSERT_LE(output_ix + 2, buf_size);
60                 buf[output_ix++] = U16_LEAD(codepoint);
61                 buf[output_ix++] = U16_TRAIL(codepoint);
62             }
63             break;
64         }
65         case ' ':
66             input_ix++;
67             break;
68         case '|':
69             ASSERT_FALSE(seen_offset);
70             ASSERT_NE(offset, nullptr);
71             *offset = output_ix;
72             seen_offset = true;
73             input_ix++;
74             break;
75         default:
76             FAIL();  // unexpected character
77         }
78     }
79     ASSERT_NE(result_size, nullptr);
80     *result_size = output_ix;
81     ASSERT_TRUE(seen_offset || offset == nullptr);
82 }
83 
// Checks ParseUnicode on a string mixing a BMP code point, a supplementary code point (expected
// as a surrogate pair), the '|' offset marker and a quoted ASCII character.
TEST(UnicodeUtils, parse) {
    const size_t BUF_SIZE = 256;
    uint16_t buf[BUF_SIZE] = {};
    size_t offset = 0;
    size_t size = 0;
    // ParseUnicode reports malformed input through fatal gtest assertions, which only return from
    // ParseUnicode itself. Stop here in that case instead of checking outputs it never wrote.
    ASSERT_NO_FATAL_FAILURE(
            ParseUnicode(buf, BUF_SIZE, "U+000D U+1F431 | 'a'", &size, &offset));
    EXPECT_EQ(size, 4u);
    EXPECT_EQ(offset, 3u);
    EXPECT_EQ(buf[0], 0x000D);
    EXPECT_EQ(buf[1], 0xD83D);
    EXPECT_EQ(buf[2], 0xDC31);
    EXPECT_EQ(buf[3], 'a');
}
97