1 // Copyright (c) 2010, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // Author: Sanjay Ghemawat
31 
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35 
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <ctype.h>
39 #include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
40 #include <string.h>      /* for memcpy */
41 #include <assert.h>
42 #include <errno.h>
43 #include <string>
44 #include <algorithm>
45 
46 #include "pcrecpp_internal.h"
47 #include "pcre.h"
48 #include "pcrecpp.h"
49 #include "pcre_stringpiece.h"
50 
51 
52 namespace pcrecpp {
53 
54 // Maximum number of args we can set
55 static const int kMaxArgs = 16;
56 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
57 
58 // Special object that stands-in for no argument
59 Arg RE::no_arg((void*)NULL);
60 
61 // This is for ABI compatibility with old versions of pcre (pre-7.6),
62 // which defined a global no_arg variable instead of putting it in the
63 // RE class.  This works on GCC >= 3, at least.  It definitely works
64 // for ELF, but may not for other object formats (Mach-O, for
65 // instance, does not support aliases.)  We could probably have a more
66 // inclusive test if we ever needed it.  (Note that not only the
67 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
68 // gnu-specific.)
69 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
70 # define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
71 # define ULP_AS_STRING_INTERNAL(x)   #x
72 # define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
73 extern Arg no_arg
74   __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
75 #endif
76 
77 // If a regular expression has no error, its error_ field points here
78 static const string empty_string;
79 
80 // If the user doesn't ask for any options, we just use this one
81 static RE_Options default_options;
82 
Init(const string & pat,const RE_Options * options)83 void RE::Init(const string& pat, const RE_Options* options) {
84   pattern_ = pat;
85   if (options == NULL) {
86     options_ = default_options;
87   } else {
88     options_ = *options;
89   }
90   error_ = &empty_string;
91   re_full_ = NULL;
92   re_partial_ = NULL;
93 
94   re_partial_ = Compile(UNANCHORED);
95   if (re_partial_ != NULL) {
96     re_full_ = Compile(ANCHOR_BOTH);
97   }
98 }
99 
Cleanup()100 void RE::Cleanup() {
101   if (re_full_ != NULL)         (*pcre_free)(re_full_);
102   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
103   if (error_ != &empty_string)  delete error_;
104 }
105 
106 
~RE()107 RE::~RE() {
108   Cleanup();
109 }
110 
111 
Compile(Anchor anchor)112 pcre* RE::Compile(Anchor anchor) {
113   // First, convert RE_Options into pcre options
114   int pcre_options = 0;
115   pcre_options = options_.all_options();
116 
117   // Special treatment for anchoring.  This is needed because at
118   // runtime pcre only provides an option for anchoring at the
119   // beginning of a string (unless you use offset).
120   //
121   // There are three types of anchoring we want:
122   //    UNANCHORED      Compile the original pattern, and use
123   //                    a pcre unanchored match.
124   //    ANCHOR_START    Compile the original pattern, and use
125   //                    a pcre anchored match.
126   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
127   //                    and use a pcre anchored match.
128 
129   const char* compile_error;
130   int eoffset;
131   pcre* re;
132   if (anchor != ANCHOR_BOTH) {
133     re = pcre_compile(pattern_.c_str(), pcre_options,
134                       &compile_error, &eoffset, NULL);
135   } else {
136     // Tack a '\z' at the end of RE.  Parenthesize it first so that
137     // the '\z' applies to all top-level alternatives in the regexp.
138     string wrapped = "(?:";  // A non-counting grouping operator
139     wrapped += pattern_;
140     wrapped += ")\\z";
141     re = pcre_compile(wrapped.c_str(), pcre_options,
142                       &compile_error, &eoffset, NULL);
143   }
144   if (re == NULL) {
145     if (error_ == &empty_string) error_ = new string(compile_error);
146   }
147   return re;
148 }
149 
150 /***** Matching interfaces *****/
151 
FullMatch(const StringPiece & text,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const152 bool RE::FullMatch(const StringPiece& text,
153                    const Arg& ptr1,
154                    const Arg& ptr2,
155                    const Arg& ptr3,
156                    const Arg& ptr4,
157                    const Arg& ptr5,
158                    const Arg& ptr6,
159                    const Arg& ptr7,
160                    const Arg& ptr8,
161                    const Arg& ptr9,
162                    const Arg& ptr10,
163                    const Arg& ptr11,
164                    const Arg& ptr12,
165                    const Arg& ptr13,
166                    const Arg& ptr14,
167                    const Arg& ptr15,
168                    const Arg& ptr16) const {
169   const Arg* args[kMaxArgs];
170   int n = 0;
171   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
172   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
173   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
174   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
175   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
176   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
177   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
178   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
179   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
180   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
181   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
182   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
183   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
184   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
185   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
186   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
187  done:
188 
189   int consumed;
190   int vec[kVecSize];
191   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
192 }
193 
PartialMatch(const StringPiece & text,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const194 bool RE::PartialMatch(const StringPiece& text,
195                       const Arg& ptr1,
196                       const Arg& ptr2,
197                       const Arg& ptr3,
198                       const Arg& ptr4,
199                       const Arg& ptr5,
200                       const Arg& ptr6,
201                       const Arg& ptr7,
202                       const Arg& ptr8,
203                       const Arg& ptr9,
204                       const Arg& ptr10,
205                       const Arg& ptr11,
206                       const Arg& ptr12,
207                       const Arg& ptr13,
208                       const Arg& ptr14,
209                       const Arg& ptr15,
210                       const Arg& ptr16) const {
211   const Arg* args[kMaxArgs];
212   int n = 0;
213   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
214   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
215   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
216   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
217   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
218   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
219   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
220   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
221   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
222   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
223   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
224   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
225   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
226   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
227   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
228   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
229  done:
230 
231   int consumed;
232   int vec[kVecSize];
233   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
234 }
235 
Consume(StringPiece * input,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const236 bool RE::Consume(StringPiece* input,
237                  const Arg& ptr1,
238                  const Arg& ptr2,
239                  const Arg& ptr3,
240                  const Arg& ptr4,
241                  const Arg& ptr5,
242                  const Arg& ptr6,
243                  const Arg& ptr7,
244                  const Arg& ptr8,
245                  const Arg& ptr9,
246                  const Arg& ptr10,
247                  const Arg& ptr11,
248                  const Arg& ptr12,
249                  const Arg& ptr13,
250                  const Arg& ptr14,
251                  const Arg& ptr15,
252                  const Arg& ptr16) const {
253   const Arg* args[kMaxArgs];
254   int n = 0;
255   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
256   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
257   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
258   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
259   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
260   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
261   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
262   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
263   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
264   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
265   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
266   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
267   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
268   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
269   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
270   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
271  done:
272 
273   int consumed;
274   int vec[kVecSize];
275   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
276                   args, n, vec, kVecSize)) {
277     input->remove_prefix(consumed);
278     return true;
279   } else {
280     return false;
281   }
282 }
283 
FindAndConsume(StringPiece * input,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const284 bool RE::FindAndConsume(StringPiece* input,
285                         const Arg& ptr1,
286                         const Arg& ptr2,
287                         const Arg& ptr3,
288                         const Arg& ptr4,
289                         const Arg& ptr5,
290                         const Arg& ptr6,
291                         const Arg& ptr7,
292                         const Arg& ptr8,
293                         const Arg& ptr9,
294                         const Arg& ptr10,
295                         const Arg& ptr11,
296                         const Arg& ptr12,
297                         const Arg& ptr13,
298                         const Arg& ptr14,
299                         const Arg& ptr15,
300                         const Arg& ptr16) const {
301   const Arg* args[kMaxArgs];
302   int n = 0;
303   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
304   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
305   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
306   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
307   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
308   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
309   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
310   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
311   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
312   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
313   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
314   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
315   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
316   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
317   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
318   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
319  done:
320 
321   int consumed;
322   int vec[kVecSize];
323   if (DoMatchImpl(*input, UNANCHORED, &consumed,
324                   args, n, vec, kVecSize)) {
325     input->remove_prefix(consumed);
326     return true;
327   } else {
328     return false;
329   }
330 }
331 
Replace(const StringPiece & rewrite,string * str) const332 bool RE::Replace(const StringPiece& rewrite,
333                  string *str) const {
334   int vec[kVecSize];
335   int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
336   if (matches == 0)
337     return false;
338 
339   string s;
340   if (!Rewrite(&s, rewrite, *str, vec, matches))
341     return false;
342 
343   assert(vec[0] >= 0);
344   assert(vec[1] >= 0);
345   str->replace(vec[0], vec[1] - vec[0], s);
346   return true;
347 }
348 
349 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
350 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
351 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
352 
NewlineMode(int pcre_options)353 static int NewlineMode(int pcre_options) {
354   // TODO: if we can make it threadsafe, cache this var
355   int newline_mode = 0;
356   /* if (newline_mode) return newline_mode; */  // do this once it's cached
357   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
358                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
359     newline_mode = (pcre_options &
360                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
361                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
362   } else {
363     int newline;
364     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
365     if (newline == 10)
366       newline_mode = PCRE_NEWLINE_LF;
367     else if (newline == 13)
368       newline_mode = PCRE_NEWLINE_CR;
369     else if (newline == 3338)
370       newline_mode = PCRE_NEWLINE_CRLF;
371     else if (newline == -1)
372       newline_mode = PCRE_NEWLINE_ANY;
373     else if (newline == -2)
374       newline_mode = PCRE_NEWLINE_ANYCRLF;
375     else
376       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
377   }
378   return newline_mode;
379 }
380 
GlobalReplace(const StringPiece & rewrite,string * str) const381 int RE::GlobalReplace(const StringPiece& rewrite,
382                       string *str) const {
383   int count = 0;
384   int vec[kVecSize];
385   string out;
386   int start = 0;
387   bool last_match_was_empty_string = false;
388 
389   while (start <= static_cast<int>(str->length())) {
390     // If the previous match was for the empty string, we shouldn't
391     // just match again: we'll match in the same way and get an
392     // infinite loop.  Instead, we do the match in a special way:
393     // anchored -- to force another try at the same position --
394     // and with a flag saying that this time, ignore empty matches.
395     // If this special match returns, that means there's a non-empty
396     // match at this position as well, and we can continue.  If not,
397     // we do what perl does, and just advance by one.
398     // Notice that perl prints '@@@' for this;
399     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
400     int matches;
401     if (last_match_was_empty_string) {
402       matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
403       if (matches <= 0) {
404         int matchend = start + 1;     // advance one character.
405         // If the current char is CR and we're in CRLF mode, skip LF too.
406         // Note it's better to call pcre_fullinfo() than to examine
407         // all_options(), since options_ could have changed bewteen
408         // compile-time and now, but this is simpler and safe enough.
409         // Modified by PH to add ANY and ANYCRLF.
410         if (matchend < static_cast<int>(str->length()) &&
411             (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
412             (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
413              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
414              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
415           matchend++;
416         }
417         // We also need to advance more than one char if we're in utf8 mode.
418 #ifdef SUPPORT_UTF8
419         if (options_.utf8()) {
420           while (matchend < static_cast<int>(str->length()) &&
421                  ((*str)[matchend] & 0xc0) == 0x80)
422             matchend++;
423         }
424 #endif
425         if (start < static_cast<int>(str->length()))
426           out.append(*str, start, matchend - start);
427         start = matchend;
428         last_match_was_empty_string = false;
429         continue;
430       }
431     } else {
432       matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
433       if (matches <= 0)
434         break;
435     }
436     int matchstart = vec[0], matchend = vec[1];
437     assert(matchstart >= start);
438     assert(matchend >= matchstart);
439     out.append(*str, start, matchstart - start);
440     Rewrite(&out, rewrite, *str, vec, matches);
441     start = matchend;
442     count++;
443     last_match_was_empty_string = (matchstart == matchend);
444   }
445 
446   if (count == 0)
447     return 0;
448 
449   if (start < static_cast<int>(str->length()))
450     out.append(*str, start, str->length() - start);
451   swap(out, *str);
452   return count;
453 }
454 
Extract(const StringPiece & rewrite,const StringPiece & text,string * out) const455 bool RE::Extract(const StringPiece& rewrite,
456                  const StringPiece& text,
457                  string *out) const {
458   int vec[kVecSize];
459   int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
460   if (matches == 0)
461     return false;
462   out->erase();
463   return Rewrite(out, rewrite, text, vec, matches);
464 }
465 
QuoteMeta(const StringPiece & unquoted)466 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
467   string result;
468 
469   // Escape any ascii character not in [A-Za-z_0-9].
470   //
471   // Note that it's legal to escape a character even if it has no
472   // special meaning in a regular expression -- so this function does
473   // that.  (This also makes it identical to the perl function of the
474   // same name; see `perldoc -f quotemeta`.)  The one exception is
475   // escaping NUL: rather than doing backslash + NUL, like perl does,
476   // we do '\0', because pcre itself doesn't take embedded NUL chars.
477   for (int ii = 0; ii < unquoted.size(); ++ii) {
478     // Note that using 'isalnum' here raises the benchmark time from
479     // 32ns to 58ns:
480     if (unquoted[ii] == '\0') {
481       result += "\\0";
482     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
483                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
484                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
485                unquoted[ii] != '_' &&
486                // If this is the part of a UTF8 or Latin1 character, we need
487                // to copy this byte without escaping.  Experimentally this is
488                // what works correctly with the regexp library.
489                !(unquoted[ii] & 128)) {
490       result += '\\';
491       result += unquoted[ii];
492     } else {
493       result += unquoted[ii];
494     }
495   }
496 
497   return result;
498 }
499 
500 /***** Actual matching and rewriting code *****/
501 
TryMatch(const StringPiece & text,int startpos,Anchor anchor,bool empty_ok,int * vec,int vecsize) const502 int RE::TryMatch(const StringPiece& text,
503                  int startpos,
504                  Anchor anchor,
505                  bool empty_ok,
506                  int *vec,
507                  int vecsize) const {
508   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
509   if (re == NULL) {
510     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
511     return 0;
512   }
513 
514   pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
515   if (options_.match_limit() > 0) {
516     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
517     extra.match_limit = options_.match_limit();
518   }
519   if (options_.match_limit_recursion() > 0) {
520     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
521     extra.match_limit_recursion = options_.match_limit_recursion();
522   }
523 
524   // int options = 0;
525   // Changed by PH as a result of bugzilla #1288
526   int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
527 
528   if (anchor != UNANCHORED)
529     options |= PCRE_ANCHORED;
530   if (!empty_ok)
531     options |= PCRE_NOTEMPTY;
532 
533   int rc = pcre_exec(re,              // The regular expression object
534                      &extra,
535                      (text.data() == NULL) ? "" : text.data(),
536                      text.size(),
537                      startpos,
538                      options,
539                      vec,
540                      vecsize);
541 
542   // Handle errors
543   if (rc == PCRE_ERROR_NOMATCH) {
544     return 0;
545   } else if (rc < 0) {
546     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
547     //        re, pattern_.c_str());
548     return 0;
549   } else if (rc == 0) {
550     // pcre_exec() returns 0 as a special case when the number of
551     // capturing subpatterns exceeds the size of the vector.
552     // When this happens, there is a match and the output vector
553     // is filled, but we miss out on the positions of the extra subpatterns.
554     rc = vecsize / 2;
555   }
556 
557   return rc;
558 }
559 
DoMatchImpl(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const * args,int n,int * vec,int vecsize) const560 bool RE::DoMatchImpl(const StringPiece& text,
561                      Anchor anchor,
562                      int* consumed,
563                      const Arg* const* args,
564                      int n,
565                      int* vec,
566                      int vecsize) const {
567   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
568   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
569   assert(matches >= 0);  // TryMatch never returns negatives
570   if (matches == 0)
571     return false;
572 
573   *consumed = vec[1];
574 
575   if (n == 0 || args == NULL) {
576     // We are not interested in results
577     return true;
578   }
579 
580   if (NumberOfCapturingGroups() < n) {
581     // RE has fewer capturing groups than number of arg pointers passed in
582     return false;
583   }
584 
585   // If we got here, we must have matched the whole pattern.
586   // We do not need (can not do) any more checks on the value of 'matches' here
587   // -- see the comment for TryMatch.
588   for (int i = 0; i < n; i++) {
589     const int start = vec[2*(i+1)];
590     const int limit = vec[2*(i+1)+1];
591     if (!args[i]->Parse(text.data() + start, limit-start)) {
592       // TODO: Should we indicate what the error was?
593       return false;
594     }
595   }
596 
597   return true;
598 }
599 
DoMatch(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const args[],int n) const600 bool RE::DoMatch(const StringPiece& text,
601                  Anchor anchor,
602                  int* consumed,
603                  const Arg* const args[],
604                  int n) const {
605   assert(n >= 0);
606   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
607                                        // (as for kVecSize)
608   int space[21];   // use stack allocation for small vecsize (common case)
609   int* vec = vecsize <= 21 ? space : new int[vecsize];
610   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
611   if (vec != space) delete [] vec;
612   return retval;
613 }
614 
Rewrite(string * out,const StringPiece & rewrite,const StringPiece & text,int * vec,int veclen) const615 bool RE::Rewrite(string *out, const StringPiece &rewrite,
616                  const StringPiece &text, int *vec, int veclen) const {
617   for (const char *s = rewrite.data(), *end = s + rewrite.size();
618        s < end; s++) {
619     int c = *s;
620     if (c == '\\') {
621       c = *++s;
622       if (isdigit(c)) {
623         int n = (c - '0');
624         if (n >= veclen) {
625           //fprintf(stderr, requested group %d in regexp %.*s\n",
626           //        n, rewrite.size(), rewrite.data());
627           return false;
628         }
629         int start = vec[2 * n];
630         if (start >= 0)
631           out->append(text.data() + start, vec[2 * n + 1] - start);
632       } else if (c == '\\') {
633         *out += '\\';
634       } else {
635         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
636         //        rewrite.size(), rewrite.data());
637         return false;
638       }
639     } else {
640       *out += c;
641     }
642   }
643   return true;
644 }
645 
646 // Return the number of capturing subpatterns, or -1 if the
647 // regexp wasn't valid on construction.
NumberOfCapturingGroups() const648 int RE::NumberOfCapturingGroups() const {
649   if (re_partial_ == NULL) return -1;
650 
651   int result;
652   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
653                                   NULL,         // We did not study the pattern
654                                   PCRE_INFO_CAPTURECOUNT,
655                                   &result);
656   assert(pcre_retval == 0);
657   return result;
658 }
659 
660 /***** Parsers for various types *****/
661 
parse_null(const char * str,int n,void * dest)662 bool Arg::parse_null(const char* str, int n, void* dest) {
663   (void)str;
664   (void)n;
665   // We fail if somebody asked us to store into a non-NULL void* pointer
666   return (dest == NULL);
667 }
668 
parse_string(const char * str,int n,void * dest)669 bool Arg::parse_string(const char* str, int n, void* dest) {
670   if (dest == NULL) return true;
671   reinterpret_cast<string*>(dest)->assign(str, n);
672   return true;
673 }
674 
parse_stringpiece(const char * str,int n,void * dest)675 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
676   if (dest == NULL) return true;
677   reinterpret_cast<StringPiece*>(dest)->set(str, n);
678   return true;
679 }
680 
parse_char(const char * str,int n,void * dest)681 bool Arg::parse_char(const char* str, int n, void* dest) {
682   if (n != 1) return false;
683   if (dest == NULL) return true;
684   *(reinterpret_cast<char*>(dest)) = str[0];
685   return true;
686 }
687 
parse_uchar(const char * str,int n,void * dest)688 bool Arg::parse_uchar(const char* str, int n, void* dest) {
689   if (n != 1) return false;
690   if (dest == NULL) return true;
691   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
692   return true;
693 }
694 
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the n-byte number at "str" for strtol()/strtoul(), which stop
// at the first character that cannot belong to the number rather than
// after n bytes.
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// Copies "str" into "buf" and null-terminates if necessary.
// Returns one of:
//      a. "str" if no termination is needed
//      b. "buf" if the string was copied and null-terminated
//      c. "" if the input was invalid and has no hope of being parsed
static const char* TerminateNumber(char* buf, const char* str, int n) {
  // The <ctype.h> classifiers are only defined for unsigned char values
  // (and EOF), so bytes are converted before being classified.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // Look at the character right after the input text.
  const unsigned char next = static_cast<unsigned char>(str[n]);

  // It may potentially look like a digit, in which case the conversion
  // routine would read past the end of the number.
  const bool next_is_digit_like = isdigit(next) ||
                                  ((next >= 'a') && (next <= 'f')) ||
                                  ((next >= 'A') && (next <= 'F'));

  // With radix 16 or 0, strtol()/strtoul() accept a "0x"/"0X" prefix, so
  // a number "0" directly followed by 'x' and a hex digit would also be
  // read past its end.  Terminate in that case too, but only when the
  // text fits: longer input followed by 'x' keeps being parsed in place.
  const bool next_is_hex_marker = (next == 'x') || (next == 'X');

  const bool must_terminate =
      next_is_digit_like || (next_is_hex_marker && n <= kMaxNumberLength);
  if (!must_terminate) {
    // We can parse right out of the supplied string, so return it.
    return str;
  }

  if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
  memcpy(buf, str, n);
  buf[n] = '\0';
  return buf;
}
726 
parse_long_radix(const char * str,int n,void * dest,int radix)727 bool Arg::parse_long_radix(const char* str,
728                            int n,
729                            void* dest,
730                            int radix) {
731   if (n == 0) return false;
732   char buf[kMaxNumberLength+1];
733   str = TerminateNumber(buf, str, n);
734   char* end;
735   errno = 0;
736   long r = strtol(str, &end, radix);
737   if (end != str + n) return false;   // Leftover junk
738   if (errno) return false;
739   if (dest == NULL) return true;
740   *(reinterpret_cast<long*>(dest)) = r;
741   return true;
742 }
743 
parse_ulong_radix(const char * str,int n,void * dest,int radix)744 bool Arg::parse_ulong_radix(const char* str,
745                             int n,
746                             void* dest,
747                             int radix) {
748   if (n == 0) return false;
749   char buf[kMaxNumberLength+1];
750   str = TerminateNumber(buf, str, n);
751   if (str[0] == '-') return false;    // strtoul() on a negative number?!
752   char* end;
753   errno = 0;
754   unsigned long r = strtoul(str, &end, radix);
755   if (end != str + n) return false;   // Leftover junk
756   if (errno) return false;
757   if (dest == NULL) return true;
758   *(reinterpret_cast<unsigned long*>(dest)) = r;
759   return true;
760 }
761 
parse_short_radix(const char * str,int n,void * dest,int radix)762 bool Arg::parse_short_radix(const char* str,
763                             int n,
764                             void* dest,
765                             int radix) {
766   long r;
767   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
768   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
769   if (dest == NULL) return true;
770   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
771   return true;
772 }
773 
parse_ushort_radix(const char * str,int n,void * dest,int radix)774 bool Arg::parse_ushort_radix(const char* str,
775                              int n,
776                              void* dest,
777                              int radix) {
778   unsigned long r;
779   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
780   if (r > USHRT_MAX) return false;                      // Out of range
781   if (dest == NULL) return true;
782   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
783   return true;
784 }
785 
parse_int_radix(const char * str,int n,void * dest,int radix)786 bool Arg::parse_int_radix(const char* str,
787                           int n,
788                           void* dest,
789                           int radix) {
790   long r;
791   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
792   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
793   if (dest == NULL) return true;
794   *(reinterpret_cast<int*>(dest)) = r;
795   return true;
796 }
797 
parse_uint_radix(const char * str,int n,void * dest,int radix)798 bool Arg::parse_uint_radix(const char* str,
799                            int n,
800                            void* dest,
801                            int radix) {
802   unsigned long r;
803   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
804   if (r > UINT_MAX) return false;                       // Out of range
805   if (dest == NULL) return true;
806   *(reinterpret_cast<unsigned int*>(dest)) = r;
807   return true;
808 }
809 
parse_longlong_radix(const char * str,int n,void * dest,int radix)810 bool Arg::parse_longlong_radix(const char* str,
811                                int n,
812                                void* dest,
813                                int radix) {
814 #ifndef HAVE_LONG_LONG
815   return false;
816 #else
817   if (n == 0) return false;
818   char buf[kMaxNumberLength+1];
819   str = TerminateNumber(buf, str, n);
820   char* end;
821   errno = 0;
822 #if defined HAVE_STRTOQ
823   long long r = strtoq(str, &end, radix);
824 #elif defined HAVE_STRTOLL
825   long long r = strtoll(str, &end, radix);
826 #elif defined HAVE__STRTOI64
827   long long r = _strtoi64(str, &end, radix);
828 #elif defined HAVE_STRTOIMAX
829   long long r = strtoimax(str, &end, radix);
830 #else
831 #error parse_longlong_radix: cannot convert input to a long-long
832 #endif
833   if (end != str + n) return false;   // Leftover junk
834   if (errno) return false;
835   if (dest == NULL) return true;
836   *(reinterpret_cast<long long*>(dest)) = r;
837   return true;
838 #endif   /* HAVE_LONG_LONG */
839 }
840 
parse_ulonglong_radix(const char * str,int n,void * dest,int radix)841 bool Arg::parse_ulonglong_radix(const char* str,
842                                 int n,
843                                 void* dest,
844                                 int radix) {
845 #ifndef HAVE_UNSIGNED_LONG_LONG
846   return false;
847 #else
848   if (n == 0) return false;
849   char buf[kMaxNumberLength+1];
850   str = TerminateNumber(buf, str, n);
851   if (str[0] == '-') return false;    // strtoull() on a negative number?!
852   char* end;
853   errno = 0;
854 #if defined HAVE_STRTOQ
855   unsigned long long r = strtouq(str, &end, radix);
856 #elif defined HAVE_STRTOLL
857   unsigned long long r = strtoull(str, &end, radix);
858 #elif defined HAVE__STRTOI64
859   unsigned long long r = _strtoui64(str, &end, radix);
860 #elif defined HAVE_STRTOIMAX
861   unsigned long long r = strtoumax(str, &end, radix);
862 #else
863 #error parse_ulonglong_radix: cannot convert input to a long-long
864 #endif
865   if (end != str + n) return false;   // Leftover junk
866   if (errno) return false;
867   if (dest == NULL) return true;
868   *(reinterpret_cast<unsigned long long*>(dest)) = r;
869   return true;
870 #endif   /* HAVE_UNSIGNED_LONG_LONG */
871 }
872 
parse_double(const char * str,int n,void * dest)873 bool Arg::parse_double(const char* str, int n, void* dest) {
874   if (n == 0) return false;
875   static const int kMaxLength = 200;
876   char buf[kMaxLength];
877   if (n >= kMaxLength) return false;
878   memcpy(buf, str, n);
879   buf[n] = '\0';
880   errno = 0;
881   char* end;
882   double r = strtod(buf, &end);
883   if (end != buf + n) return false;   // Leftover junk
884   if (errno) return false;
885   if (dest == NULL) return true;
886   *(reinterpret_cast<double*>(dest)) = r;
887   return true;
888 }
889 
parse_float(const char * str,int n,void * dest)890 bool Arg::parse_float(const char* str, int n, void* dest) {
891   double r;
892   if (!parse_double(str, n, &r)) return false;
893   if (dest == NULL) return true;
894   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
895   return true;
896 }
897 
898 
// DEFINE_INTEGER_PARSERS(name) emits the four fixed-radix Arg parsers for
// the integer type tag "name".  Each one simply forwards to
// Arg::parse_<name>_radix() with a hard-wired radix:
//
//   Arg::parse_<name>         -> radix 10
//   Arg::parse_<name>_hex     -> radix 16
//   Arg::parse_<name>_octal   -> radix 8
//   Arg::parse_<name>_cradix  -> radix 0
//
// Radix 0 is handed unchanged to the underlying strtoxxx()-style routine,
// which for the standard C conversions means the base is inferred from
// the text's prefix ("0x" hex, leading "0" octal, otherwise decimal).
#define DEFINE_INTEGER_PARSERS(name)                                    \
  bool Arg::parse_##name(const char* str, int n, void* dest) {          \
    return parse_##name##_radix(str, n, dest, 10);                      \
  }                                                                     \
  bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
    return parse_##name##_radix(str, n, dest, 16);                      \
  }                                                                     \
  bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
    return parse_##name##_radix(str, n, dest, 8);                       \
  }                                                                     \
  bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
    return parse_##name##_radix(str, n, dest, 0);                       \
  }

// Instantiate the parsers once per integer type tag.
DEFINE_INTEGER_PARSERS(short)      /*                                   */
DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */

// The helper macro is not needed past this point.
#undef DEFINE_INTEGER_PARSERS
923 
924 }   // namespace pcrecpp
925