1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test prog.cc, compile.cc
6 
7 #include <string>
8 #include <vector>
9 #include "util/test.h"
10 #include "re2/regexp.h"
11 #include "re2/prog.h"
12 
// Flag definition: name "show", default "", with the help text below.
// NOTE(review): FLAGS_show is not referenced anywhere in the visible part of
// this file -- confirm whether a flag-driven dump path still exists elsewhere.
DEFINE_string(show, "", "regular expression to compile and dump");
14 
15 namespace re2 {
16 
17 // Simple input/output tests checking that
18 // the regexp compiles to the expected code.
19 // These are just to sanity check the basic implementation.
20 // The real confidence tests happen by testing the NFA/DFA
21 // that run the compiled code.
22 
// One table-driven case: a pattern paired with the program listing
// that compiling it is expected to produce.
struct Test {
  // Source text of the regular expression handed to the parser.
  const char* regexp;

  // Expected program dump, one instruction per '\n'-terminated line;
  // compared for exact equality against the actual dump.
  const char* code;
};
27 
28 static Test tests[] = {
29   { "a",
30     "1. byte [61-61] -> 2\n"
31     "2. match! 0\n" },
32   { "ab",
33     "1. byte [61-61] -> 2\n"
34     "2. byte [62-62] -> 3\n"
35     "3. match! 0\n" },
36   { "a|c",
37     "3. alt -> 1 | 2\n"
38     "1. byte [61-61] -> 4\n"
39     "2. byte [63-63] -> 4\n"
40     "4. match! 0\n" },
41   { "a|b",
42     "1. byte [61-62] -> 2\n"
43     "2. match! 0\n" },
44   { "[ab]",
45     "1. byte [61-62] -> 2\n"
46     "2. match! 0\n" },
47   { "a+",
48     "1. byte [61-61] -> 2\n"
49     "2. alt -> 1 | 3\n"
50     "3. match! 0\n" },
51   { "a+?",
52     "1. byte [61-61] -> 2\n"
53     "2. alt -> 3 | 1\n"
54     "3. match! 0\n" },
55   { "a*",
56     "2. alt -> 1 | 3\n"
57     "1. byte [61-61] -> 2\n"
58     "3. match! 0\n" },
59   { "a*?",
60     "2. alt -> 3 | 1\n"
61     "3. match! 0\n"
62     "1. byte [61-61] -> 2\n" },
63   { "a?",
64     "2. alt -> 1 | 3\n"
65     "1. byte [61-61] -> 3\n"
66     "3. match! 0\n" },
67   { "a??",
68     "2. alt -> 3 | 1\n"
69     "3. match! 0\n"
70     "1. byte [61-61] -> 3\n" },
71   { "a{4}",
72     "1. byte [61-61] -> 2\n"
73     "2. byte [61-61] -> 3\n"
74     "3. byte [61-61] -> 4\n"
75     "4. byte [61-61] -> 5\n"
76     "5. match! 0\n" },
77   { "(a)",
78     "2. capture 2 -> 1\n"
79     "1. byte [61-61] -> 3\n"
80     "3. capture 3 -> 4\n"
81     "4. match! 0\n" },
82   { "(?:a)",
83     "1. byte [61-61] -> 2\n"
84     "2. match! 0\n" },
85   { "",
86     "2. match! 0\n" },
87   { ".",
88     "3. alt -> 1 | 2\n"
89     "1. byte [00-09] -> 4\n"
90     "2. byte [0b-ff] -> 4\n"
91     "4. match! 0\n" },
92   { "[^ab]",
93     "5. alt -> 3 | 4\n"
94     "3. alt -> 1 | 2\n"
95     "4. byte [63-ff] -> 6\n"
96     "1. byte [00-09] -> 6\n"
97     "2. byte [0b-60] -> 6\n"
98     "6. match! 0\n" },
99   { "[Aa]",
100     "1. byte/i [61-61] -> 2\n"
101     "2. match! 0\n" },
102 };
103 
// Walks tests[], compiling each pattern and comparing the dumped
// program against the expected listing.  Every parse failure, compile
// failure or listing mismatch is logged and counted; the test passes
// only when the count stays at zero.
TEST(TestRegexpCompileToProg, Simple) {
  int nfail = 0;
  const re2::Test* const last = tests + arraysize(tests);

  for (const re2::Test* tc = tests; tc != last; ++tc) {
    Regexp* parsed =
        Regexp::Parse(tc->regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (parsed == NULL) {
      LOG(ERROR) << "Cannot parse: " << tc->regexp;
      ++nfail;
      continue;
    }

    Prog* compiled = parsed->CompileToProg(0);
    if (compiled != NULL) {
      // Compiling with argument 1 is required to fail.
      // NOTE(review): presumably the argument is a memory budget too
      // small for any program -- confirm against Regexp::CompileToProg.
      CHECK(parsed->CompileToProg(1) == NULL);

      string dump = compiled->Dump();
      if (dump != tc->code) {
        LOG(ERROR) << "Incorrect compiled code for: " << tc->regexp;
        LOG(ERROR) << "Want:\n" << tc->code;
        LOG(ERROR) << "Got:\n" << dump;
        ++nfail;
      }
      delete compiled;
    } else {
      LOG(ERROR) << "Cannot compile: " << tc->regexp;
      ++nfail;
    }

    // Drop the reference obtained from Parse on every path that got one.
    parsed->Decref();
  }

  EXPECT_EQ(nfail, 0);
}
134 
135 // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
136 // Once, erroneously split between 0x3f and 0x40 because it is
137 // a 6-bit boundary.
// Inclusive byte ranges, listed in ascending order.  The entries must
// tile 0x00-0xFF with no gaps or overlaps: the ByteRanges test below
// only checks bytes that fall inside some entry, so a gap would leave
// those bytes unverified.  (The third entry previously began at 0x10,
// which left 0x0B-0x0F outside every range.)
static struct UTF8ByteRange {
  int lo;  // First byte of the range (inclusive).
  int hi;  // Last byte of the range (inclusive).
} utf8ranges[] = {
  { 0x00, 0x09 },
  { 0x0A, 0x0A },
  { 0x0B, 0x7F },
  { 0x80, 0x8F },
  { 0x90, 0x9F },
  { 0xA0, 0xBF },
  { 0xC0, 0xC1 },
  { 0xC2, 0xDF },
  { 0xE0, 0xE0 },
  { 0xE1, 0xEF },
  { 0xF0, 0xF0 },
  { 0xF1, 0xF3 },
  { 0xF4, 0xF4 },
  { 0xF5, 0xFF },
};
157 
// Compiles "." (parsed with Regexp::PerlX only) and checks the program's
// bytemap against utf8ranges[]: the number of byte classes must equal the
// number of table entries, and every byte inside entry i must map to
// class i.
TEST(TestCompile, ByteRanges) {
  Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
  EXPECT_TRUE(re != NULL);
  if (re == NULL)
    return;  // Do not dereference NULL if the expectation is non-fatal.

  Prog* prog = re->CompileToProg(0);
  EXPECT_TRUE(prog != NULL);
  if (prog == NULL) {
    re->Decref();  // Still release the parsed regexp on this path.
    return;
  }

  // arraysize() yields an unsigned count; convert once so the comparisons
  // below are int-to-int.
  const int nranges = static_cast<int>(arraysize(utf8ranges));
  EXPECT_EQ(prog->bytemap_range(), nranges);
  for (int i = 0; i < nranges; i++)
    for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
      EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;

  delete prog;
  re->Decref();
}
170 
171 }  // namespace re2
172