1 // Copyright 2003-2009 Google Inc.  All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // This is a variant of PCRE's pcrecpp.cc, originally written at Google.
6 // The main changes are the addition of the HitLimit method and
7 // compilation as PCRE in namespace re2.
8 
9 #include <errno.h>
10 #include "util/util.h"
11 #include "util/flags.h"
12 #include "util/pcre.h"
13 
14 #define PCREPORT(level) LOG(level)
15 
16 // Default PCRE limits.
17 // Defaults chosen to allow a plausible amount of CPU and
18 // not exceed main thread stacks.  Note that other threads
19 // often have smaller stacks, and therefore tightening
20 // regexp_stack_limit may frequently be necessary.
21 DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)");
22 DEFINE_int32(regexp_match_limit, 1000000,
23              "default PCRE match limit (function calls)");
24 
25 namespace re2 {
26 
27 // Maximum number of args we can set
28 static const int kMaxArgs = 16;
29 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
30 
31 // Approximate size of a recursive invocation of PCRE's
32 // internal "match()" frame.  This varies depending on the
33 // compiler and architecture, of course, so the constant is
34 // just a conservative estimate.  To find the exact number,
35 // run regexp_unittest with --regexp_stack_limit=0 under
36 // a debugger and look at the frames when it crashes.
37 // The exact frame size was 656 in production on 2008/02/03.
38 static const int kPCREFrameSize = 700;
39 
40 // Special name for missing C++ arguments.
41 PCRE::Arg PCRE::no_more_args((void*)NULL);
42 
43 const PCRE::PartialMatchFunctor PCRE::PartialMatch = { };
44 const PCRE::FullMatchFunctor PCRE::FullMatch = { } ;
45 const PCRE::ConsumeFunctor PCRE::Consume = { };
46 const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { };
47 
48 // If a regular expression has no error, its error_ field points here
49 static const string empty_string;
50 
Init(const char * pattern,Option options,int match_limit,int stack_limit,bool report_errors)51 void PCRE::Init(const char* pattern, Option options, int match_limit,
52               int stack_limit, bool report_errors) {
53   pattern_ = pattern;
54   options_ = options;
55   match_limit_ = match_limit;
56   stack_limit_ = stack_limit;
57   hit_limit_ = false;
58   error_ = &empty_string;
59   report_errors_ = report_errors;
60   re_full_ = NULL;
61   re_partial_ = NULL;
62 
63   if (options & ~(EnabledCompileOptions | EnabledExecOptions)) {
64     error_ = new string("illegal regexp option");
65     PCREPORT(ERROR)
66         << "Error compiling '" << pattern << "': illegal regexp option";
67   } else {
68     re_partial_ = Compile(UNANCHORED);
69     if (re_partial_ != NULL) {
70       re_full_ = Compile(ANCHOR_BOTH);
71     }
72   }
73 }
74 
PCRE(const char * pattern)75 PCRE::PCRE(const char* pattern) {
76   Init(pattern, None, 0, 0, true);
77 }
PCRE(const char * pattern,Option option)78 PCRE::PCRE(const char* pattern, Option option) {
79   Init(pattern, option, 0, 0, true);
80 }
PCRE(const string & pattern)81 PCRE::PCRE(const string& pattern) {
82   Init(pattern.c_str(), None, 0, 0, true);
83 }
PCRE(const string & pattern,Option option)84 PCRE::PCRE(const string& pattern, Option option) {
85   Init(pattern.c_str(), option, 0, 0, true);
86 }
PCRE(const string & pattern,const PCRE_Options & re_option)87 PCRE::PCRE(const string& pattern, const PCRE_Options& re_option) {
88   Init(pattern.c_str(), re_option.option(), re_option.match_limit(),
89        re_option.stack_limit(), re_option.report_errors());
90 }
91 
PCRE(const char * pattern,const PCRE_Options & re_option)92 PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) {
93   Init(pattern, re_option.option(), re_option.match_limit(),
94        re_option.stack_limit(), re_option.report_errors());
95 }
96 
~PCRE()97 PCRE::~PCRE() {
98   if (re_full_ != NULL)         pcre_free(re_full_);
99   if (re_partial_ != NULL)      pcre_free(re_partial_);
100   if (error_ != &empty_string)  delete error_;
101 }
102 
Compile(Anchor anchor)103 pcre* PCRE::Compile(Anchor anchor) {
104   // Special treatment for anchoring.  This is needed because at
105   // runtime pcre only provides an option for anchoring at the
106   // beginning of a string.
107   //
108   // There are three types of anchoring we want:
109   //    UNANCHORED      Compile the original pattern, and use
110   //                    a pcre unanchored match.
111   //    ANCHOR_START    Compile the original pattern, and use
112   //                    a pcre anchored match.
113   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
114   //                    and use a pcre anchored match.
115 
116   const char* error;
117   int eoffset;
118   pcre* re;
119   if (anchor != ANCHOR_BOTH) {
120     re = pcre_compile(pattern_.c_str(),
121                       (options_ & EnabledCompileOptions),
122                       &error, &eoffset, NULL);
123   } else {
124     // Tack a '\z' at the end of PCRE.  Parenthesize it first so that
125     // the '\z' applies to all top-level alternatives in the regexp.
126     string wrapped = "(?:";  // A non-counting grouping operator
127     wrapped += pattern_;
128     wrapped += ")\\z";
129     re = pcre_compile(wrapped.c_str(),
130                       (options_ & EnabledCompileOptions),
131                       &error, &eoffset, NULL);
132   }
133   if (re == NULL) {
134     if (error_ == &empty_string) error_ = new string(error);
135     PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error;
136   }
137   return re;
138 }
139 
140 /***** Convenience interfaces *****/
141 
operator ()(const StringPiece & text,const PCRE & re,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const142 bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text,
143                                        const PCRE& re,
144                                        const Arg& a0,
145                                        const Arg& a1,
146                                        const Arg& a2,
147                                        const Arg& a3,
148                                        const Arg& a4,
149                                        const Arg& a5,
150                                        const Arg& a6,
151                                        const Arg& a7,
152                                        const Arg& a8,
153                                        const Arg& a9,
154                                        const Arg& a10,
155                                        const Arg& a11,
156                                        const Arg& a12,
157                                        const Arg& a13,
158                                        const Arg& a14,
159                                        const Arg& a15) const {
160   const Arg* args[kMaxArgs];
161   int n = 0;
162   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
163   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
164   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
165   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
166   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
167   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
168   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
169   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
170   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
171   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
172   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
173   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
174   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
175   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
176   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
177   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
178 done:
179 
180   int consumed;
181   int vec[kVecSize];
182   return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
183 }
184 
operator ()(const StringPiece & text,const PCRE & re,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const185 bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text,
186                                           const PCRE& re,
187                                           const Arg& a0,
188                                           const Arg& a1,
189                                           const Arg& a2,
190                                           const Arg& a3,
191                                           const Arg& a4,
192                                           const Arg& a5,
193                                           const Arg& a6,
194                                           const Arg& a7,
195                                           const Arg& a8,
196                                           const Arg& a9,
197                                           const Arg& a10,
198                                           const Arg& a11,
199                                           const Arg& a12,
200                                           const Arg& a13,
201                                           const Arg& a14,
202                                           const Arg& a15) const {
203   const Arg* args[kMaxArgs];
204   int n = 0;
205   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
206   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
207   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
208   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
209   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
210   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
211   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
212   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
213   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
214   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
215   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
216   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
217   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
218   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
219   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
220   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
221 done:
222 
223   int consumed;
224   int vec[kVecSize];
225   return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
226 }
227 
operator ()(StringPiece * input,const PCRE & pattern,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const228 bool PCRE::ConsumeFunctor::operator ()(StringPiece* input,
229                                      const PCRE& pattern,
230                                      const Arg& a0,
231                                      const Arg& a1,
232                                      const Arg& a2,
233                                      const Arg& a3,
234                                      const Arg& a4,
235                                      const Arg& a5,
236                                      const Arg& a6,
237                                      const Arg& a7,
238                                      const Arg& a8,
239                                      const Arg& a9,
240                                      const Arg& a10,
241                                      const Arg& a11,
242                                      const Arg& a12,
243                                      const Arg& a13,
244                                      const Arg& a14,
245                                      const Arg& a15) const {
246   const Arg* args[kMaxArgs];
247   int n = 0;
248   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
249   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
250   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
251   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
252   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
253   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
254   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
255   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
256   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
257   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
258   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
259   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
260   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
261   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
262   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
263   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
264 done:
265 
266   int consumed;
267   int vec[kVecSize];
268   if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
269                           args, n, vec, kVecSize)) {
270     input->remove_prefix(consumed);
271     return true;
272   } else {
273     return false;
274   }
275 }
276 
operator ()(StringPiece * input,const PCRE & pattern,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const277 bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input,
278                                             const PCRE& pattern,
279                                             const Arg& a0,
280                                             const Arg& a1,
281                                             const Arg& a2,
282                                             const Arg& a3,
283                                             const Arg& a4,
284                                             const Arg& a5,
285                                             const Arg& a6,
286                                             const Arg& a7,
287                                             const Arg& a8,
288                                             const Arg& a9,
289                                             const Arg& a10,
290                                             const Arg& a11,
291                                             const Arg& a12,
292                                             const Arg& a13,
293                                             const Arg& a14,
294                                             const Arg& a15) const {
295   const Arg* args[kMaxArgs];
296   int n = 0;
297   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
298   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
299   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
300   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
301   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
302   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
303   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
304   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
305   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
306   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
307   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
308   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
309   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
310   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
311   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
312   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
313 done:
314 
315   int consumed;
316   int vec[kVecSize];
317   if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
318                           args, n, vec, kVecSize)) {
319     input->remove_prefix(consumed);
320     return true;
321   } else {
322     return false;
323   }
324 }
325 
Replace(string * str,const PCRE & pattern,const StringPiece & rewrite)326 bool PCRE::Replace(string *str,
327                  const PCRE& pattern,
328                  const StringPiece& rewrite) {
329   int vec[kVecSize];
330   int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
331   if (matches == 0)
332     return false;
333 
334   string s;
335   if (!pattern.Rewrite(&s, rewrite, *str, vec, matches))
336     return false;
337 
338   assert(vec[0] >= 0);
339   assert(vec[1] >= 0);
340   str->replace(vec[0], vec[1] - vec[0], s);
341   return true;
342 }
343 
GlobalReplace(string * str,const PCRE & pattern,const StringPiece & rewrite)344 int PCRE::GlobalReplace(string *str,
345                       const PCRE& pattern,
346                       const StringPiece& rewrite) {
347   int count = 0;
348   int vec[kVecSize];
349   string out;
350   int start = 0;
351   bool last_match_was_empty_string = false;
352 
353   for (; start <= str->length();) {
354     // If the previous match was for the empty string, we shouldn't
355     // just match again: we'll match in the same way and get an
356     // infinite loop.  Instead, we do the match in a special way:
357     // anchored -- to force another try at the same position --
358     // and with a flag saying that this time, ignore empty matches.
359     // If this special match returns, that means there's a non-empty
360     // match at this position as well, and we can continue.  If not,
361     // we do what perl does, and just advance by one.
362     // Notice that perl prints '@@@' for this;
363     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
364     int matches;
365     if (last_match_was_empty_string) {
366       matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
367                                  vec, kVecSize);
368       if (matches <= 0) {
369         if (start < str->length())
370           out.push_back((*str)[start]);
371         start++;
372         last_match_was_empty_string = false;
373         continue;
374       }
375     } else {
376       matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
377       if (matches <= 0)
378         break;
379     }
380     int matchstart = vec[0], matchend = vec[1];
381     assert(matchstart >= start);
382     assert(matchend >= matchstart);
383 
384     out.append(*str, start, matchstart - start);
385     pattern.Rewrite(&out, rewrite, *str, vec, matches);
386     start = matchend;
387     count++;
388     last_match_was_empty_string = (matchstart == matchend);
389   }
390 
391   if (count == 0)
392     return 0;
393 
394   if (start < str->length())
395     out.append(*str, start, str->length() - start);
396   swap(out, *str);
397   return count;
398 }
399 
Extract(const StringPiece & text,const PCRE & pattern,const StringPiece & rewrite,string * out)400 bool PCRE::Extract(const StringPiece &text,
401                  const PCRE& pattern,
402                  const StringPiece &rewrite,
403                  string *out) {
404   int vec[kVecSize];
405   int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
406   if (matches == 0)
407     return false;
408   out->clear();
409   return pattern.Rewrite(out, rewrite, text, vec, matches);
410 }
411 
QuoteMeta(const StringPiece & unquoted)412 string PCRE::QuoteMeta(const StringPiece& unquoted) {
413   string result;
414   result.reserve(unquoted.size() << 1);
415 
416   // Escape any ascii character not in [A-Za-z_0-9].
417   //
418   // Note that it's legal to escape a character even if it has no
419   // special meaning in a regular expression -- so this function does
420   // that.  (This also makes it identical to the perl function of the
421   // same name except for the null-character special case;
422   // see `perldoc -f quotemeta`.)
423   for (int ii = 0; ii < unquoted.length(); ++ii) {
424     // Note that using 'isalnum' here raises the benchmark time from
425     // 32ns to 58ns:
426     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
427         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
428         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
429         unquoted[ii] != '_' &&
430         // If this is the part of a UTF8 or Latin1 character, we need
431         // to copy this byte without escaping.  Experimentally this is
432         // what works correctly with the regexp library.
433         !(unquoted[ii] & 128)) {
434       if (unquoted[ii] == '\0') {  // Special handling for null chars.
435         // Can't use "\\0" since the next character might be a digit.
436         result += "\\x00";
437         continue;
438       }
439       result += '\\';
440     }
441     result += unquoted[ii];
442   }
443 
444   return result;
445 }
446 
447 /***** Actual matching and rewriting code *****/
448 
HitLimit()449 bool PCRE::HitLimit() {
450   return hit_limit_;
451 }
452 
ClearHitLimit()453 void PCRE::ClearHitLimit() {
454   hit_limit_ = 0;
455 }
456 
TryMatch(const StringPiece & text,int startpos,Anchor anchor,bool empty_ok,int * vec,int vecsize) const457 int PCRE::TryMatch(const StringPiece& text,
458                  int startpos,
459                  Anchor anchor,
460                  bool empty_ok,
461                  int *vec,
462                  int vecsize) const {
463   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
464   if (re == NULL) {
465     PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
466     return 0;
467   }
468 
469   int match_limit = match_limit_;
470   if (match_limit <= 0) {
471     match_limit = FLAGS_regexp_match_limit;
472   }
473 
474   int stack_limit = stack_limit_;
475   if (stack_limit <= 0) {
476     stack_limit = FLAGS_regexp_stack_limit;
477   }
478 
479   pcre_extra extra = { 0 };
480   if (match_limit > 0) {
481     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
482     extra.match_limit = match_limit;
483   }
484   if (stack_limit > 0) {
485     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
486     extra.match_limit_recursion = stack_limit / kPCREFrameSize;
487   }
488 
489   int options = 0;
490   if (anchor != UNANCHORED)
491     options |= PCRE_ANCHORED;
492   if (!empty_ok)
493     options |= PCRE_NOTEMPTY;
494 
495   int rc = pcre_exec(re,              // The regular expression object
496                      &extra,
497                      (text.data() == NULL) ? "" : text.data(),
498                      text.size(),
499                      startpos,
500                      options,
501                      vec,
502                      vecsize);
503 
504   // Handle errors
505   if (rc == 0) {
506     // pcre_exec() returns 0 as a special case when the number of
507     // capturing subpatterns exceeds the size of the vector.
508     // When this happens, there is a match and the output vector
509     // is filled, but we miss out on the positions of the extra subpatterns.
510     rc = vecsize / 2;
511   } else if (rc < 0) {
512     switch (rc) {
513       case PCRE_ERROR_NOMATCH:
514         return 0;
515       case PCRE_ERROR_MATCHLIMIT:
516         // Writing to hit_limit is not safe if multiple threads
517         // are using the PCRE, but the flag is only intended
518         // for use by unit tests anyway, so we let it go.
519         hit_limit_ = true;
520         PCREPORT(WARNING) << "Exceeded match limit of " << match_limit
521                         << " when matching '" << pattern_ << "'"
522                         << " against text that is " << text.size() << " bytes.";
523         return 0;
524       case PCRE_ERROR_RECURSIONLIMIT:
525         // See comment about hit_limit above.
526         hit_limit_ = true;
527         PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit
528                         << " when matching '" << pattern_ << "'"
529                         << " against text that is " << text.size() << " bytes.";
530         return 0;
531       default:
532         // There are other return codes from pcre.h :
533         // PCRE_ERROR_NULL           (-2)
534         // PCRE_ERROR_BADOPTION      (-3)
535         // PCRE_ERROR_BADMAGIC       (-4)
536         // PCRE_ERROR_UNKNOWN_NODE   (-5)
537         // PCRE_ERROR_NOMEMORY       (-6)
538         // PCRE_ERROR_NOSUBSTRING    (-7)
539         // ...
540         PCREPORT(ERROR) << "Unexpected return code: " << rc
541                       << " when matching '" << pattern_ << "'"
542                       << ", re=" << re
543                       << ", text=" << text
544                       << ", vec=" << vec
545                       << ", vecsize=" << vecsize;
546         return 0;
547     }
548   }
549 
550   return rc;
551 }
552 
DoMatchImpl(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const * args,int n,int * vec,int vecsize) const553 bool PCRE::DoMatchImpl(const StringPiece& text,
554                      Anchor anchor,
555                      int* consumed,
556                      const Arg* const* args,
557                      int n,
558                      int* vec,
559                      int vecsize) const {
560   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
561   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
562   assert(matches >= 0);  // TryMatch never returns negatives
563   if (matches == 0)
564     return false;
565 
566   *consumed = vec[1];
567 
568   if (n == 0 || args == NULL) {
569     // We are not interested in results
570     return true;
571   }
572   if (NumberOfCapturingGroups() < n) {
573     // PCRE has fewer capturing groups than number of arg pointers passed in
574     return false;
575   }
576 
577   // If we got here, we must have matched the whole pattern.
578   // We do not need (can not do) any more checks on the value of 'matches' here
579   // -- see the comment for TryMatch.
580   for (int i = 0; i < n; i++) {
581     const int start = vec[2*(i+1)];
582     const int limit = vec[2*(i+1)+1];
583     if (!args[i]->Parse(text.data() + start, limit-start)) {
584       // TODO: Should we indicate what the error was?
585       return false;
586     }
587   }
588 
589   return true;
590 }
591 
DoMatch(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const args[],int n) const592 bool PCRE::DoMatch(const StringPiece& text,
593                  Anchor anchor,
594                  int* consumed,
595                  const Arg* const args[],
596                  int n) const {
597   assert(n >= 0);
598   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
599                                        // (as for kVecSize)
600   int *vec = new int[vecsize];
601   bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
602   delete[] vec;
603   return b;
604 }
605 
Rewrite(string * out,const StringPiece & rewrite,const StringPiece & text,int * vec,int veclen) const606 bool PCRE::Rewrite(string *out, const StringPiece &rewrite,
607                  const StringPiece &text, int *vec, int veclen) const {
608   int number_of_capturing_groups = NumberOfCapturingGroups();
609   for (const char *s = rewrite.data(), *end = s + rewrite.size();
610        s < end; s++) {
611     int c = *s;
612     if (c == '\\') {
613       c = *++s;
614       if (isdigit(c)) {
615         int n = (c - '0');
616         if (n >= veclen) {
617           if (n <= number_of_capturing_groups) {
618             // unmatched optional capturing group. treat
619             // its value as empty string; i.e., nothing to append.
620           } else {
621             PCREPORT(ERROR) << "requested group " << n
622                           << " in regexp " << rewrite.data();
623             return false;
624           }
625         }
626         int start = vec[2 * n];
627         if (start >= 0)
628           out->append(text.data() + start, vec[2 * n + 1] - start);
629       } else if (c == '\\') {
630         out->push_back('\\');
631       } else {
632         PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data();
633         return false;
634       }
635     } else {
636       out->push_back(c);
637     }
638   }
639   return true;
640 }
641 
CheckRewriteString(const StringPiece & rewrite,string * error) const642 bool PCRE::CheckRewriteString(const StringPiece& rewrite, string* error) const {
643   int max_token = -1;
644   for (const char *s = rewrite.data(), *end = s + rewrite.size();
645        s < end; s++) {
646     int c = *s;
647     if (c != '\\') {
648       continue;
649     }
650     if (++s == end) {
651       *error = "Rewrite schema error: '\\' not allowed at end.";
652       return false;
653     }
654     c = *s;
655     if (c == '\\') {
656       continue;
657     }
658     if (!isdigit(c)) {
659       *error = "Rewrite schema error: "
660                "'\\' must be followed by a digit or '\\'.";
661       return false;
662     }
663     int n = (c - '0');
664     if (max_token < n) {
665       max_token = n;
666     }
667   }
668 
669   if (max_token > NumberOfCapturingGroups()) {
670     SStringPrintf(error, "Rewrite schema requests %d matches, "
671                   "but the regexp only has %d parenthesized subexpressions.",
672                   max_token, NumberOfCapturingGroups());
673     return false;
674   }
675   return true;
676 }
677 
678 
679 // Return the number of capturing subpatterns, or -1 if the
680 // regexp wasn't valid on construction.
NumberOfCapturingGroups() const681 int PCRE::NumberOfCapturingGroups() const {
682   if (re_partial_ == NULL) return -1;
683 
684   int result;
685   CHECK(pcre_fullinfo(re_partial_,       // The regular expression object
686                       NULL,              // We did not study the pattern
687                       PCRE_INFO_CAPTURECOUNT,
688                       &result) == 0);
689   return result;
690 }
691 
692 
693 /***** Parsers for various types *****/
694 
parse_null(const char * str,int n,void * dest)695 bool PCRE::Arg::parse_null(const char* str, int n, void* dest) {
696   // We fail if somebody asked us to store into a non-NULL void* pointer
697   return (dest == NULL);
698 }
699 
parse_string(const char * str,int n,void * dest)700 bool PCRE::Arg::parse_string(const char* str, int n, void* dest) {
701   if (dest == NULL) return true;
702   reinterpret_cast<string*>(dest)->assign(str, n);
703   return true;
704 }
705 
parse_stringpiece(const char * str,int n,void * dest)706 bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) {
707   if (dest == NULL) return true;
708   reinterpret_cast<StringPiece*>(dest)->set(str, n);
709   return true;
710 }
711 
parse_char(const char * str,int n,void * dest)712 bool PCRE::Arg::parse_char(const char* str, int n, void* dest) {
713   if (n != 1) return false;
714   if (dest == NULL) return true;
715   *(reinterpret_cast<char*>(dest)) = str[0];
716   return true;
717 }
718 
parse_uchar(const char * str,int n,void * dest)719 bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) {
720   if (n != 1) return false;
721   if (dest == NULL) return true;
722   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
723   return true;
724 }
725 
726 // Largest number spec that we are willing to parse
727 static const int kMaxNumberLength = 32;
728 
729 // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1
730 // PCREQUIPCRES "n > 0"
731 // Copies "str" into "buf" and null-terminates if necessary.
732 // Returns one of:
733 //      a. "str" if no termination is needed
734 //      b. "buf" if the string was copied and null-terminated
735 //      c. "" if the input was invalid and has no hope of being parsed
TerminateNumber(char * buf,const char * str,int n)736 static const char* TerminateNumber(char* buf, const char* str, int n) {
737   if ((n > 0) && isspace(*str)) {
738     // We are less forgiving than the strtoxxx() routines and do not
739     // allow leading spaces.
740     return "";
741   }
742 
743   // See if the character right after the input text may potentially
744   // look like a digit.
745   if (isdigit(str[n]) ||
746       ((str[n] >= 'a') && (str[n] <= 'f')) ||
747       ((str[n] >= 'A') && (str[n] <= 'F'))) {
748     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
749     memcpy(buf, str, n);
750     buf[n] = '\0';
751     return buf;
752   } else {
753     // We can parse right out of the supplied string, so return it.
754     return str;
755   }
756 }
757 
parse_long_radix(const char * str,int n,void * dest,int radix)758 bool PCRE::Arg::parse_long_radix(const char* str,
759                                int n,
760                                void* dest,
761                                int radix) {
762   if (n == 0) return false;
763   char buf[kMaxNumberLength+1];
764   str = TerminateNumber(buf, str, n);
765   char* end;
766   errno = 0;
767   long r = strtol(str, &end, radix);
768   if (end != str + n) return false;   // Leftover junk
769   if (errno) return false;
770   if (dest == NULL) return true;
771   *(reinterpret_cast<long*>(dest)) = r;
772   return true;
773 }
774 
parse_ulong_radix(const char * str,int n,void * dest,int radix)775 bool PCRE::Arg::parse_ulong_radix(const char* str,
776                                 int n,
777                                 void* dest,
778                                 int radix) {
779   if (n == 0) return false;
780   char buf[kMaxNumberLength+1];
781   str = TerminateNumber(buf, str, n);
782   if (str[0] == '-') {
783    // strtoul() will silently accept negative numbers and parse
784    // them.  This module is more strict and treats them as errors.
785    return false;
786   }
787 
788   char* end;
789   errno = 0;
790   unsigned long r = strtoul(str, &end, radix);
791   if (end != str + n) return false;   // Leftover junk
792   if (errno) return false;
793   if (dest == NULL) return true;
794   *(reinterpret_cast<unsigned long*>(dest)) = r;
795   return true;
796 }
797 
parse_short_radix(const char * str,int n,void * dest,int radix)798 bool PCRE::Arg::parse_short_radix(const char* str,
799                                 int n,
800                                 void* dest,
801                                 int radix) {
802   long r;
803   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
804   if ((short)r != r) return false;       // Out of range
805   if (dest == NULL) return true;
806   *(reinterpret_cast<short*>(dest)) = r;
807   return true;
808 }
809 
parse_ushort_radix(const char * str,int n,void * dest,int radix)810 bool PCRE::Arg::parse_ushort_radix(const char* str,
811                                  int n,
812                                  void* dest,
813                                  int radix) {
814   unsigned long r;
815   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
816   if ((ushort)r != r) return false;                      // Out of range
817   if (dest == NULL) return true;
818   *(reinterpret_cast<unsigned short*>(dest)) = r;
819   return true;
820 }
821 
parse_int_radix(const char * str,int n,void * dest,int radix)822 bool PCRE::Arg::parse_int_radix(const char* str,
823                               int n,
824                               void* dest,
825                               int radix) {
826   long r;
827   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
828   if ((int)r != r) return false;         // Out of range
829   if (dest == NULL) return true;
830   *(reinterpret_cast<int*>(dest)) = r;
831   return true;
832 }
833 
parse_uint_radix(const char * str,int n,void * dest,int radix)834 bool PCRE::Arg::parse_uint_radix(const char* str,
835                                int n,
836                                void* dest,
837                                int radix) {
838   unsigned long r;
839   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
840   if ((uint)r != r) return false;                       // Out of range
841   if (dest == NULL) return true;
842   *(reinterpret_cast<unsigned int*>(dest)) = r;
843   return true;
844 }
845 
parse_longlong_radix(const char * str,int n,void * dest,int radix)846 bool PCRE::Arg::parse_longlong_radix(const char* str,
847                                    int n,
848                                    void* dest,
849                                    int radix) {
850   if (n == 0) return false;
851   char buf[kMaxNumberLength+1];
852   str = TerminateNumber(buf, str, n);
853   char* end;
854   errno = 0;
855   int64 r = strtoll(str, &end, radix);
856   if (end != str + n) return false;   // Leftover junk
857   if (errno) return false;
858   if (dest == NULL) return true;
859   *(reinterpret_cast<int64*>(dest)) = r;
860   return true;
861 }
862 
parse_ulonglong_radix(const char * str,int n,void * dest,int radix)863 bool PCRE::Arg::parse_ulonglong_radix(const char* str,
864                                     int n,
865                                     void* dest,
866                                     int radix) {
867   if (n == 0) return false;
868   char buf[kMaxNumberLength+1];
869   str = TerminateNumber(buf, str, n);
870   if (str[0] == '-') {
871     // strtoull() will silently accept negative numbers and parse
872     // them.  This module is more strict and treats them as errors.
873     return false;
874   }
875   char* end;
876   errno = 0;
877   uint64 r = strtoull(str, &end, radix);
878   if (end != str + n) return false;   // Leftover junk
879   if (errno) return false;
880   if (dest == NULL) return true;
881   *(reinterpret_cast<uint64*>(dest)) = r;
882   return true;
883 }
884 
parse_double(const char * str,int n,void * dest)885 bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
886   if (n == 0) return false;
887   static const int kMaxLength = 200;
888   char buf[kMaxLength];
889   if (n >= kMaxLength) return false;
890   memcpy(buf, str, n);
891   buf[n] = '\0';
892   errno = 0;
893   char* end;
894   double r = strtod(buf, &end);
895   if (end != buf + n) {
896 #ifdef COMPILER_MSVC
897     // Microsoft's strtod() doesn't handle inf and nan, so we have to
898     // handle it explicitly.  Speed is not important here because this
899     // code is only called in unit tests.
900     bool pos = true;
901     const char* i = buf;
902     if ('-' == *i) {
903       pos = false;
904       ++i;
905     } else if ('+' == *i) {
906       ++i;
907     }
908     if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) {
909       r = numeric_limits<double>::infinity();
910       if (!pos)
911         r = -r;
912     } else if (0 == stricmp(i, "nan")) {
913       r = numeric_limits<double>::quiet_NaN();
914     } else {
915       return false;
916     }
917 #else
918     return false;   // Leftover junk
919 #endif
920   }
921   if (errno) return false;
922   if (dest == NULL) return true;
923   *(reinterpret_cast<double*>(dest)) = r;
924   return true;
925 }
926 
parse_float(const char * str,int n,void * dest)927 bool PCRE::Arg::parse_float(const char* str, int n, void* dest) {
928   double r;
929   if (!parse_double(str, n, &r)) return false;
930   if (dest == NULL) return true;
931   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
932   return true;
933 }
934 
935 
936 #define DEFINE_INTEGER_PARSERS(name)                                        \
937   bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) {          \
938     return parse_##name##_radix(str, n, dest, 10);                          \
939   }                                                                         \
940   bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
941     return parse_##name##_radix(str, n, dest, 16);                          \
942   }                                                                         \
943   bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
944     return parse_##name##_radix(str, n, dest, 8);                           \
945   }                                                                         \
946   bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
947     return parse_##name##_radix(str, n, dest, 0);                           \
948   }
949 
950 DEFINE_INTEGER_PARSERS(short);
951 DEFINE_INTEGER_PARSERS(ushort);
952 DEFINE_INTEGER_PARSERS(int);
953 DEFINE_INTEGER_PARSERS(uint);
954 DEFINE_INTEGER_PARSERS(long);
955 DEFINE_INTEGER_PARSERS(ulong);
956 DEFINE_INTEGER_PARSERS(longlong);
957 DEFINE_INTEGER_PARSERS(ulonglong);
958 
959 #undef DEFINE_INTEGER_PARSERS
960 
961 }  // namespace re2
962