1 // Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
// Some UTF character sequences in this file were taken from
6 // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
7 
8 #include <gtest/gtest.h>
9 #include <stdio.h>
10 
11 extern "C" {
12 #include "cras_utf8.h"
13 }
14 
15 namespace {
16 
/* Feeds a table of well-formed inputs to valid_utf8_string() and checks, for
 * each one, that the call returns 1 and that the position written through the
 * out-parameter matches the expected value.  For every entry in this table
 * the expected position equals the byte length of the input. */
TEST(UTF8, ValidStress) {
  struct ValidCase {
    const char *text;  /* Input handed to valid_utf8_string(). */
    size_t end_pos;    /* Position expected in the out-parameter. */
  };

  static const ValidCase kCases[] = {
      /* Mixed ASCII and multi-byte text. */
      {"The greek word 'kosme': "
       "\xce\xba\xe1\xbd\xb9\xcf\x83\xce"
       "\xbc\xce\xb5",
       35},
      {"Playback", 8},
      {"The Euro sign: \xe2\x82\xac", 18},

      /* First possible sequence of a certain length. */
      {"\x01", 1},
      {"\xc2\x80", 2},
      {"\xe0\xa0\x80", 3},
      {"\xe1\x80\x80", 3},
      {"\xf0\x90\x80\x80", 4},
      {"\xf1\x80\x80\x80", 4},

      /* Last possible sequence of a certain length. */
      {"\x7f", 1},
      {"\xdf\xbf", 2},
      {"\xef\xbf\xbf", 3},
      {"\xf4\x8f\xbf\xbf", 4},

      /* Other boundary conditions. */
      {"\xed\x9f\xbf", 3},
      {"\xee\x80\x80", 3},
      {"\xef\xbf\xbd", 3},
      {"\xf0\xbf\xbf\xbf", 4},

      /* BOM sequence. */
      {"\xef\xbb\xbf", 3},

      /* Valid UTF-8 that shouldn't appear in text; chose to allow
       * these characters anyway. */
      {"U+FFFE: \xef\xbf\xbe", 11},
      {"U+FDD0: \xef\xb7\x90", 11},
      {"\xf0\x9f\xbf\xbe", 4},
  };
  const size_t num_cases = sizeof(kCases) / sizeof(kCases[0]);

  /* One position variable is shared by every call. */
  size_t pos;

  for (size_t i = 0; i < num_cases; ++i) {
    /* The case index is streamed so a failure can be traced to its entry. */
    EXPECT_EQ(1, valid_utf8_string(kCases[i].text, &pos))
        << "valid case #" << i;
    EXPECT_EQ(kCases[i].end_pos, pos) << "valid case #" << i;
  }
}
78 
TEST(UTF8,InvalidStress)79 TEST(UTF8, InvalidStress) {
80   size_t pos;
81 
82   /* Malformed continuation bytes. */
83   EXPECT_EQ(0, valid_utf8_string("\x80", &pos));
84   EXPECT_EQ(0, pos);
85   EXPECT_EQ(0, valid_utf8_string("\xbf", &pos));
86   EXPECT_EQ(0, pos);
87   EXPECT_EQ(0, valid_utf8_string("\x80\xbf", &pos));
88   EXPECT_EQ(0, pos);
89   EXPECT_EQ(0, valid_utf8_string("\xc2\x80\xbf", &pos));
90   EXPECT_EQ(2, pos);
91 
92   /* Lonely start characters. */
93   EXPECT_EQ(0, valid_utf8_string("\xc2 \xc3 \xc4 ", &pos));
94   EXPECT_EQ(1, pos);
95 
96   /* Out of range cases. */
97   EXPECT_EQ(0, valid_utf8_string("\xf4\x90\xbf\xbf", &pos));
98   EXPECT_EQ(1, pos);
99   EXPECT_EQ(0, valid_utf8_string(" \xf5\x80", &pos));
100   EXPECT_EQ(1, pos);
101   EXPECT_EQ(0, valid_utf8_string(" \xe0\x80\x80", &pos));
102   EXPECT_EQ(2, pos);
103   EXPECT_EQ(0, valid_utf8_string("\xf4\x80\x80\xcf", &pos));
104   EXPECT_EQ(3, pos);
105 
106   /* Stop in mid-sequence. */
107   EXPECT_EQ(0, valid_utf8_string("\xf4\x80", &pos));
108   EXPECT_EQ(2, pos);
109 
110   /* Bad characters. */
111   EXPECT_EQ(0, valid_utf8_string("\xff", &pos));
112   EXPECT_EQ(0, pos);
113   EXPECT_EQ(0, valid_utf8_string("\xfe", &pos));
114   EXPECT_EQ(0, pos);
115 
116   /* Overlong representations of ASCII characters. */
117   EXPECT_EQ(0, valid_utf8_string("This represents the / character with too"
118                                  "many bytes: \xe0\x80\xaf", &pos));
119   EXPECT_EQ(53, pos);
120   EXPECT_EQ(0, valid_utf8_string("This represents the / character with too"
121                                  "many bytes: \xf0\x80\x80\xaf", &pos));
122   EXPECT_EQ(53, pos);
123 
124   /* Should not be interpreted as the ASCII NUL character. */
125   EXPECT_EQ(0, valid_utf8_string("This represents the NUL character with too"
126                                  "many bytes: \xe0\x80\x80", &pos));
127   EXPECT_EQ(55, pos);
128   EXPECT_EQ(0, valid_utf8_string("This represents the NUL character with too"
129                                  "many bytes: \xf0\x80\x80\x80", &pos));
130   EXPECT_EQ(55, pos);
131 
132   /* Single UTF-16 surrogates. */
133   EXPECT_EQ(0, valid_utf8_string("\xed\xa0\x80", &pos));
134   EXPECT_EQ(1, pos);
135   EXPECT_EQ(0, valid_utf8_string("\xed\xad\xbf", &pos));
136   EXPECT_EQ(1, pos);
137   EXPECT_EQ(0, valid_utf8_string("\xed\xae\x80", &pos));
138   EXPECT_EQ(1, pos);
139   EXPECT_EQ(0, valid_utf8_string("\xed\xaf\xbf", &pos));
140   EXPECT_EQ(1, pos);
141   EXPECT_EQ(0, valid_utf8_string("\xed\xb0\x80", &pos));
142   EXPECT_EQ(1, pos);
143   EXPECT_EQ(0, valid_utf8_string("\xed\xbe\x80", &pos));
144   EXPECT_EQ(1, pos);
145   EXPECT_EQ(0, valid_utf8_string("\xed\xbf\xbf", &pos));
146   EXPECT_EQ(1, pos);
147 }
148 
149 }  //  namespace
150 
main(int argc,char ** argv)151 int main(int argc, char **argv) {
152   ::testing::InitGoogleTest(&argc, argv);
153   return RUN_ALL_TESTS();
154 }
155