1 // Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
// Some UTF character sequences in this file were taken from
6 // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
7 
8 #include <gtest/gtest.h>
9 #include <stdio.h>
10 
11 extern "C" {
12 #include "cras_utf8.h"
13 }
14 
15 namespace {
16 
TEST(UTF8,ValidStress)17 TEST(UTF8, ValidStress) {
18   size_t pos;
19 
20   EXPECT_EQ(1, valid_utf8_string("The greek word 'kosme': "
21                                  "\xce\xba\xe1\xbd\xb9\xcf\x83\xce"
22                                  "\xbc\xce\xb5",
23                                  &pos));
24   EXPECT_EQ(35, pos);
25 
26   EXPECT_EQ(1, valid_utf8_string("Playback", &pos));
27   EXPECT_EQ(8, pos);
28 
29   EXPECT_EQ(1, valid_utf8_string("The Euro sign: \xe2\x82\xac", &pos));
30   EXPECT_EQ(18, pos);
31 
32   /* First possible sequence of a certain length. */
33   EXPECT_EQ(1, valid_utf8_string("\x01", &pos));
34   EXPECT_EQ(1, pos);
35   EXPECT_EQ(1, valid_utf8_string("\xc2\x80", &pos));
36   EXPECT_EQ(2, pos);
37   EXPECT_EQ(1, valid_utf8_string("\xe0\xa0\x80", &pos));
38   EXPECT_EQ(3, pos);
39   EXPECT_EQ(1, valid_utf8_string("\xe1\x80\x80", &pos));
40   EXPECT_EQ(3, pos);
41   EXPECT_EQ(1, valid_utf8_string("\xf0\x90\x80\x80", &pos));
42   EXPECT_EQ(4, pos);
43   EXPECT_EQ(1, valid_utf8_string("\xf1\x80\x80\x80", &pos));
44   EXPECT_EQ(4, pos);
45 
46   /* Last possible sequence of a certain length. */
47   EXPECT_EQ(1, valid_utf8_string("\x7f", &pos));
48   EXPECT_EQ(1, pos);
49   EXPECT_EQ(1, valid_utf8_string("\xdf\xbf", &pos));
50   EXPECT_EQ(2, pos);
51   EXPECT_EQ(1, valid_utf8_string("\xef\xbf\xbf", &pos));
52   EXPECT_EQ(3, pos);
53   EXPECT_EQ(1, valid_utf8_string("\xf4\x8f\xbf\xbf", &pos));
54   EXPECT_EQ(4, pos);
55 
56   /* Other boundary conditions. */
57   EXPECT_EQ(1, valid_utf8_string("\xed\x9f\xbf", &pos));
58   EXPECT_EQ(3, pos);
59   EXPECT_EQ(1, valid_utf8_string("\xee\x80\x80", &pos));
60   EXPECT_EQ(3, pos);
61   EXPECT_EQ(1, valid_utf8_string("\xef\xbf\xbd", &pos));
62   EXPECT_EQ(3, pos);
63   EXPECT_EQ(1, valid_utf8_string("\xf0\xbf\xbf\xbf", &pos));
64   EXPECT_EQ(4, pos);
65 
66   /* BOM sequence. */
67   EXPECT_EQ(1, valid_utf8_string("\xef\xbb\xbf", &pos));
68   EXPECT_EQ(3, pos);
69 
70   /* Valid UTF-8 that shouldn't appear in text; chose to allow
71    * these characters anyway. */
72   EXPECT_EQ(1, valid_utf8_string("U+FFFE: \xef\xbf\xbe", &pos));
73   EXPECT_EQ(11, pos);
74   EXPECT_EQ(1, valid_utf8_string("U+FDD0: \xef\xb7\x90", &pos));
75   EXPECT_EQ(11, pos);
76   EXPECT_EQ(1, valid_utf8_string("\xf0\x9f\xbf\xbe", &pos));
77   EXPECT_EQ(4, pos);
78 }
79 
TEST(UTF8,InvalidStress)80 TEST(UTF8, InvalidStress) {
81   size_t pos;
82 
83   /* Malformed continuation bytes. */
84   EXPECT_EQ(0, valid_utf8_string("\x80", &pos));
85   EXPECT_EQ(0, pos);
86   EXPECT_EQ(0, valid_utf8_string("\xbf", &pos));
87   EXPECT_EQ(0, pos);
88   EXPECT_EQ(0, valid_utf8_string("\x80\xbf", &pos));
89   EXPECT_EQ(0, pos);
90   EXPECT_EQ(0, valid_utf8_string("\xc2\x80\xbf", &pos));
91   EXPECT_EQ(2, pos);
92 
93   /* Lonely start characters. */
94   EXPECT_EQ(0, valid_utf8_string("\xc2 \xc3 \xc4 ", &pos));
95   EXPECT_EQ(1, pos);
96 
97   /* Out of range cases. */
98   EXPECT_EQ(0, valid_utf8_string("\xf4\x90\xbf\xbf", &pos));
99   EXPECT_EQ(1, pos);
100   EXPECT_EQ(0, valid_utf8_string(" \xf5\x80", &pos));
101   EXPECT_EQ(1, pos);
102   EXPECT_EQ(0, valid_utf8_string(" \xe0\x80\x80", &pos));
103   EXPECT_EQ(2, pos);
104   EXPECT_EQ(0, valid_utf8_string("\xf4\x80\x80\xcf", &pos));
105   EXPECT_EQ(3, pos);
106 
107   /* Stop in mid-sequence. */
108   EXPECT_EQ(0, valid_utf8_string("\xf4\x80", &pos));
109   EXPECT_EQ(2, pos);
110 
111   /* Bad characters. */
112   EXPECT_EQ(0, valid_utf8_string("\xff", &pos));
113   EXPECT_EQ(0, pos);
114   EXPECT_EQ(0, valid_utf8_string("\xfe", &pos));
115   EXPECT_EQ(0, pos);
116 
117   /* Overlong representations of ASCII characters. */
118   EXPECT_EQ(0, valid_utf8_string("This represents the / character with too"
119                                  "many bytes: \xe0\x80\xaf",
120                                  &pos));
121   EXPECT_EQ(53, pos);
122   EXPECT_EQ(0, valid_utf8_string("This represents the / character with too"
123                                  "many bytes: \xf0\x80\x80\xaf",
124                                  &pos));
125   EXPECT_EQ(53, pos);
126 
127   /* Should not be interpreted as the ASCII NUL character. */
128   EXPECT_EQ(0, valid_utf8_string("This represents the NUL character with too"
129                                  "many bytes: \xe0\x80\x80",
130                                  &pos));
131   EXPECT_EQ(55, pos);
132   EXPECT_EQ(0, valid_utf8_string("This represents the NUL character with too"
133                                  "many bytes: \xf0\x80\x80\x80",
134                                  &pos));
135   EXPECT_EQ(55, pos);
136 
137   /* Single UTF-16 surrogates. */
138   EXPECT_EQ(0, valid_utf8_string("\xed\xa0\x80", &pos));
139   EXPECT_EQ(1, pos);
140   EXPECT_EQ(0, valid_utf8_string("\xed\xad\xbf", &pos));
141   EXPECT_EQ(1, pos);
142   EXPECT_EQ(0, valid_utf8_string("\xed\xae\x80", &pos));
143   EXPECT_EQ(1, pos);
144   EXPECT_EQ(0, valid_utf8_string("\xed\xaf\xbf", &pos));
145   EXPECT_EQ(1, pos);
146   EXPECT_EQ(0, valid_utf8_string("\xed\xb0\x80", &pos));
147   EXPECT_EQ(1, pos);
148   EXPECT_EQ(0, valid_utf8_string("\xed\xbe\x80", &pos));
149   EXPECT_EQ(1, pos);
150   EXPECT_EQ(0, valid_utf8_string("\xed\xbf\xbf", &pos));
151   EXPECT_EQ(1, pos);
152 }
153 
154 }  //  namespace
155 
main(int argc,char ** argv)156 int main(int argc, char** argv) {
157   ::testing::InitGoogleTest(&argc, argv);
158   return RUN_ALL_TESTS();
159 }
160