/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/
4  
5  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.
7  
8                         Written by Philip Hazel
9             Copyright (c) 1997-2014 University of Cambridge
10  
11  -----------------------------------------------------------------------------
12  Redistribution and use in source and binary forms, with or without
13  modification, are permitted provided that the following conditions are met:
14  
15      * Redistributions of source code must retain the above copyright notice,
16        this list of conditions and the following disclaimer.
17  
18      * Redistributions in binary form must reproduce the above copyright
19        notice, this list of conditions and the following disclaimer in the
20        documentation and/or other materials provided with the distribution.
21  
22      * Neither the name of the University of Cambridge nor the names of its
23        contributors may be used to endorse or promote products derived from
24        this software without specific prior written permission.
25  
26  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  POSSIBILITY OF SUCH DAMAGE.
37  -----------------------------------------------------------------------------
38  */
39  
40  
/* This module is a wrapper that provides a POSIX API to the underlying PCRE
functions. */
43  
44  
45  #ifdef HAVE_CONFIG_H
46  #include "config.h"
47  #endif
48  
49  
50  /* Ensure that the PCREPOSIX_EXP_xxx macros are set appropriately for
51  compiling these functions. This must come before including pcreposix.h, where
52  they are set for an application (using these functions) if they have not
53  previously been set. */
54  
55  #if defined(_WIN32) && !defined(PCRE_STATIC)
56  #  define PCREPOSIX_EXP_DECL extern __declspec(dllexport)
57  #  define PCREPOSIX_EXP_DEFN __declspec(dllexport)
58  #endif
59  
60  /* We include pcre.h before pcre_internal.h so that the PCRE library functions
61  are declared as "import" for Windows by defining PCRE_EXP_DECL as "import".
62  This is needed even though pcre_internal.h itself includes pcre.h, because it
63  does so after it has set PCRE_EXP_DECL to "export" if it is not already set. */
64  
65  #include "pcre.h"
66  #include "pcre_internal.h"
67  #include "pcreposix.h"
68  
69  
70  /* Table to translate PCRE compile time error codes into POSIX error codes. */
71  
72  static const int eint[] = {
73    0,           /* no error */
74    REG_EESCAPE, /* \ at end of pattern */
75    REG_EESCAPE, /* \c at end of pattern */
76    REG_EESCAPE, /* unrecognized character follows \ */
77    REG_BADBR,   /* numbers out of order in {} quantifier */
78    /* 5 */
79    REG_BADBR,   /* number too big in {} quantifier */
80    REG_EBRACK,  /* missing terminating ] for character class */
81    REG_ECTYPE,  /* invalid escape sequence in character class */
82    REG_ERANGE,  /* range out of order in character class */
83    REG_BADRPT,  /* nothing to repeat */
84    /* 10 */
85    REG_BADRPT,  /* operand of unlimited repeat could match the empty string */
86    REG_ASSERT,  /* internal error: unexpected repeat */
87    REG_BADPAT,  /* unrecognized character after (? */
88    REG_BADPAT,  /* POSIX named classes are supported only within a class */
89    REG_EPAREN,  /* missing ) */
90    /* 15 */
91    REG_ESUBREG, /* reference to non-existent subpattern */
92    REG_INVARG,  /* erroffset passed as NULL */
93    REG_INVARG,  /* unknown option bit(s) set */
94    REG_EPAREN,  /* missing ) after comment */
95    REG_ESIZE,   /* parentheses nested too deeply */
96    /* 20 */
97    REG_ESIZE,   /* regular expression too large */
98    REG_ESPACE,  /* failed to get memory */
99    REG_EPAREN,  /* unmatched parentheses */
100    REG_ASSERT,  /* internal error: code overflow */
101    REG_BADPAT,  /* unrecognized character after (?< */
102    /* 25 */
103    REG_BADPAT,  /* lookbehind assertion is not fixed length */
104    REG_BADPAT,  /* malformed number or name after (?( */
105    REG_BADPAT,  /* conditional group contains more than two branches */
106    REG_BADPAT,  /* assertion expected after (?( */
107    REG_BADPAT,  /* (?R or (?[+-]digits must be followed by ) */
108    /* 30 */
109    REG_ECTYPE,  /* unknown POSIX class name */
110    REG_BADPAT,  /* POSIX collating elements are not supported */
111    REG_INVARG,  /* this version of PCRE is not compiled with PCRE_UTF8 support */
112    REG_BADPAT,  /* spare error */
113    REG_BADPAT,  /* character value in \x{} or \o{} is too large */
114    /* 35 */
115    REG_BADPAT,  /* invalid condition (?(0) */
116    REG_BADPAT,  /* \C not allowed in lookbehind assertion */
117    REG_EESCAPE, /* PCRE does not support \L, \l, \N, \U, or \u */
118    REG_BADPAT,  /* number after (?C is > 255 */
119    REG_BADPAT,  /* closing ) for (?C expected */
120    /* 40 */
121    REG_BADPAT,  /* recursive call could loop indefinitely */
122    REG_BADPAT,  /* unrecognized character after (?P */
123    REG_BADPAT,  /* syntax error in subpattern name (missing terminator) */
124    REG_BADPAT,  /* two named subpatterns have the same name */
125    REG_BADPAT,  /* invalid UTF-8 string */
126    /* 45 */
127    REG_BADPAT,  /* support for \P, \p, and \X has not been compiled */
128    REG_BADPAT,  /* malformed \P or \p sequence */
129    REG_BADPAT,  /* unknown property name after \P or \p */
130    REG_BADPAT,  /* subpattern name is too long (maximum 32 characters) */
131    REG_BADPAT,  /* too many named subpatterns (maximum 10,000) */
132    /* 50 */
133    REG_BADPAT,  /* repeated subpattern is too long */
134    REG_BADPAT,  /* octal value is greater than \377 (not in UTF-8 mode) */
135    REG_BADPAT,  /* internal error: overran compiling workspace */
136    REG_BADPAT,  /* internal error: previously-checked referenced subpattern not found */
137    REG_BADPAT,  /* DEFINE group contains more than one branch */
138    /* 55 */
139    REG_BADPAT,  /* repeating a DEFINE group is not allowed */
140    REG_INVARG,  /* inconsistent NEWLINE options */
141    REG_BADPAT,  /* \g is not followed followed by an (optionally braced) non-zero number */
142    REG_BADPAT,  /* a numbered reference must not be zero */
143    REG_BADPAT,  /* an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT) */
144    /* 60 */
145    REG_BADPAT,  /* (*VERB) not recognized */
146    REG_BADPAT,  /* number is too big */
147    REG_BADPAT,  /* subpattern name expected */
148    REG_BADPAT,  /* digit expected after (?+ */
149    REG_BADPAT,  /* ] is an invalid data character in JavaScript compatibility mode */
150    /* 65 */
151    REG_BADPAT,  /* different names for subpatterns of the same number are not allowed */
152    REG_BADPAT,  /* (*MARK) must have an argument */
153    REG_INVARG,  /* this version of PCRE is not compiled with PCRE_UCP support */
154    REG_BADPAT,  /* \c must be followed by an ASCII character */
155    REG_BADPAT,  /* \k is not followed by a braced, angle-bracketed, or quoted name */
156    /* 70 */
157    REG_BADPAT,  /* internal error: unknown opcode in find_fixedlength() */
158    REG_BADPAT,  /* \N is not supported in a class */
159    REG_BADPAT,  /* too many forward references */
160    REG_BADPAT,  /* disallowed UTF-8/16/32 code point (>= 0xd800 && <= 0xdfff) */
161    REG_BADPAT,  /* invalid UTF-16 string (should not occur) */
162    /* 75 */
163    REG_BADPAT,  /* overlong MARK name */
164    REG_BADPAT,  /* character value in \u.... sequence is too large */
165    REG_BADPAT,  /* invalid UTF-32 string (should not occur) */
166    REG_BADPAT,  /* setting UTF is disabled by the application */
167    REG_BADPAT,  /* non-hex character in \\x{} (closing brace missing?) */
168    /* 80 */
169    REG_BADPAT,  /* non-octal character in \o{} (closing brace missing?) */
170    REG_BADPAT,  /* missing opening brace after \o */
171    REG_BADPAT,  /* parentheses too deeply nested */
172    REG_BADPAT,  /* invalid range in character class */
173    REG_BADPAT,  /* group name must start with a non-digit */
174    /* 85 */
175    REG_BADPAT,  /* parentheses too deeply nested (stack check) */
176    REG_BADPAT   /* missing digits in \x{} or \o{} */
177  };
178  
/* Table of texts corresponding to POSIX error codes. regerror() indexes this
table directly with the error code it is given, so the order of the entries
has to follow the numeric order of the REG_xxx values named alongside them. */

static const char *const pstring[] = {
  "",                                /* Dummy for value 0 */
  "internal error",                  /* REG_ASSERT */
  "invalid repeat counts in {}",     /* BADBR      */
  "pattern error",                   /* BADPAT     */
  "? * + invalid",                   /* BADRPT     */
  "unbalanced {}",                   /* EBRACE     */
  "unbalanced []",                   /* EBRACK     */
  "collation error - not relevant",  /* ECOLLATE   */
  "bad class",                       /* ECTYPE     */
  "bad escape sequence",             /* EESCAPE    */
  "empty expression",                /* EMPTY      */
  "unbalanced ()",                   /* EPAREN     */
  "bad range inside []",             /* ERANGE     */
  "expression too big",              /* ESIZE      */
  "failed to get memory",            /* ESPACE     */
  "bad back reference",              /* ESUBREG    */
  "bad argument",                    /* INVARG     */
  "match failed"                     /* NOMATCH    */
};
201  
202  
203  
204  
205  /*************************************************
206  *          Translate error code to string        *
207  *************************************************/
208  
209  PCREPOSIX_EXP_DEFN size_t PCRE_CALL_CONVENTION
regerror(int errcode,const regex_t * preg,char * errbuf,size_t errbuf_size)210  regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
211  {
212  const char *message, *addmessage;
213  size_t length, addlength;
214  
215  message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))?
216    "unknown error code" : pstring[errcode];
217  length = strlen(message) + 1;
218  
219  addmessage = " at offset ";
220  addlength = (preg != NULL && (int)preg->re_erroffset != -1)?
221    strlen(addmessage) + 6 : 0;
222  
223  if (errbuf_size > 0)
224    {
225    if (addlength > 0 && errbuf_size >= length + addlength)
226      sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset);
227    else
228      {
229      strncpy(errbuf, message, errbuf_size - 1);
230      errbuf[errbuf_size-1] = 0;
231      }
232    }
233  
234  return length + addlength;
235  }
236  
237  
238  
239  
240  /*************************************************
241  *           Free store held by a regex           *
242  *************************************************/
243  
244  PCREPOSIX_EXP_DEFN void PCRE_CALL_CONVENTION
regfree(regex_t * preg)245  regfree(regex_t *preg)
246  {
247  (PUBL(free))(preg->re_pcre);
248  }
249  
250  
251  
252  
253  /*************************************************
254  *            Compile a regular expression        *
255  *************************************************/
256  
257  /*
258  Arguments:
259    preg        points to a structure for recording the compiled expression
260    pattern     the pattern to compile
261    cflags      compilation flags
262  
263  Returns:      0 on success
264                various non-zero codes on failure
265  */
266  
267  PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION
regcomp(regex_t * preg,const char * pattern,int cflags)268  regcomp(regex_t *preg, const char *pattern, int cflags)
269  {
270  const char *errorptr;
271  int erroffset;
272  int errorcode;
273  int options = 0;
274  int re_nsub = 0;
275  
276  if ((cflags & REG_ICASE) != 0)    options |= PCRE_CASELESS;
277  if ((cflags & REG_NEWLINE) != 0)  options |= PCRE_MULTILINE;
278  if ((cflags & REG_DOTALL) != 0)   options |= PCRE_DOTALL;
279  if ((cflags & REG_NOSUB) != 0)    options |= PCRE_NO_AUTO_CAPTURE;
280  if ((cflags & REG_UTF8) != 0)     options |= PCRE_UTF8;
281  if ((cflags & REG_UCP) != 0)      options |= PCRE_UCP;
282  if ((cflags & REG_UNGREEDY) != 0) options |= PCRE_UNGREEDY;
283  
284  preg->re_pcre = pcre_compile2(pattern, options, &errorcode, &errorptr,
285    &erroffset, NULL);
286  preg->re_erroffset = erroffset;
287  
288  /* Safety: if the error code is too big for the translation vector (which
289  should not happen, but we all make mistakes), return REG_BADPAT. */
290  
291  if (preg->re_pcre == NULL)
292    {
293    return (errorcode < (int)(sizeof(eint)/sizeof(const int)))?
294      eint[errorcode] : REG_BADPAT;
295    }
296  
297  (void)pcre_fullinfo((const pcre *)preg->re_pcre, NULL, PCRE_INFO_CAPTURECOUNT,
298    &re_nsub);
299  preg->re_nsub = (size_t)re_nsub;
300  return 0;
301  }
302  
303  
304  
305  
306  /*************************************************
307  *              Match a regular expression        *
308  *************************************************/
309  
310  /* Unfortunately, PCRE requires 3 ints of working space for each captured
311  substring, so we have to get and release working store instead of just using
312  the POSIX structures as was done in earlier releases when PCRE needed only 2
313  ints. However, if the number of possible capturing brackets is small, use a
314  block of store on the stack, to reduce the use of malloc/free. The threshold is
315  in a macro that can be changed at configure time.
316  
317  If REG_NOSUB was specified at compile time, the PCRE_NO_AUTO_CAPTURE flag will
318  be set. When this is the case, the nmatch and pmatch arguments are ignored, and
319  the only result is yes/no/error. */
320  
321  PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION
regexec(const regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)322  regexec(const regex_t *preg, const char *string, size_t nmatch,
323    regmatch_t pmatch[], int eflags)
324  {
325  int rc, so, eo;
326  int options = 0;
327  int *ovector = NULL;
328  int small_ovector[POSIX_MALLOC_THRESHOLD * 3];
329  BOOL allocated_ovector = FALSE;
330  BOOL nosub =
331    (REAL_PCRE_OPTIONS((const pcre *)preg->re_pcre) & PCRE_NO_AUTO_CAPTURE) != 0;
332  
333  if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL;
334  if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL;
335  if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE_NOTEMPTY;
336  
337  ((regex_t *)preg)->re_erroffset = (size_t)(-1);  /* Only has meaning after compile */
338  
339  /* When no string data is being returned, or no vector has been passed in which
340  to put it, ensure that nmatch is zero. Otherwise, ensure the vector for holding
341  the return data is large enough. */
342  
343  if (nosub || pmatch == NULL) nmatch = 0;
344  
345  else if (nmatch > 0)
346    {
347    if (nmatch <= POSIX_MALLOC_THRESHOLD)
348      {
349      ovector = &(small_ovector[0]);
350      }
351    else
352      {
353      if (nmatch > INT_MAX/(sizeof(int) * 3)) return REG_ESPACE;
354      ovector = (int *)malloc(sizeof(int) * nmatch * 3);
355      if (ovector == NULL) return REG_ESPACE;
356      allocated_ovector = TRUE;
357      }
358    }
359  
360  /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings.
361  The man page from OS X says "REG_STARTEND affects only the location of the
362  string, not how it is matched". That is why the "so" value is used to bump the
363  start location rather than being passed as a PCRE "starting offset". */
364  
365  if ((eflags & REG_STARTEND) != 0)
366    {
367    so = pmatch[0].rm_so;
368    eo = pmatch[0].rm_eo;
369    }
370  else
371    {
372    so = 0;
373    eo = (int)strlen(string);
374    }
375  
376  rc = pcre_exec((const pcre *)preg->re_pcre, NULL, string + so, (eo - so),
377    0, options, ovector, (int)(nmatch * 3));
378  
379  if (rc == 0) rc = (int)nmatch;    /* All captured slots were filled in */
380  
381  /* Successful match */
382  
383  if (rc >= 0)
384    {
385    size_t i;
386    if (!nosub)
387      {
388      for (i = 0; i < (size_t)rc; i++)
389        {
390        pmatch[i].rm_so = ovector[i*2];
391        pmatch[i].rm_eo = ovector[i*2+1];
392        }
393      if (allocated_ovector) free(ovector);
394      for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
395      }
396    return 0;
397    }
398  
399  /* Unsuccessful match */
400  
401  if (allocated_ovector) free(ovector);
402  switch(rc)
403    {
404  /* ========================================================================== */
405    /* These cases are never obeyed. This is a fudge that causes a compile-time
406    error if the vector eint, which is indexed by compile-time error number, is
407    not the correct length. It seems to be the only way to do such a check at
408    compile time, as the sizeof() operator does not work in the C preprocessor.
409    As all the PCRE_ERROR_xxx values are negative, we can use 0 and 1. */
410  
411    case 0:
412    case (sizeof(eint)/sizeof(int) == ERRCOUNT):
413    return REG_ASSERT;
414  /* ========================================================================== */
415  
416    case PCRE_ERROR_NOMATCH: return REG_NOMATCH;
417    case PCRE_ERROR_NULL: return REG_INVARG;
418    case PCRE_ERROR_BADOPTION: return REG_INVARG;
419    case PCRE_ERROR_BADMAGIC: return REG_INVARG;
420    case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT;
421    case PCRE_ERROR_NOMEMORY: return REG_ESPACE;
422    case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE;
423    case PCRE_ERROR_BADUTF8: return REG_INVARG;
424    case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG;
425    case PCRE_ERROR_BADMODE: return REG_INVARG;
426    default: return REG_ASSERT;
427    }
428  }
429  
/* End of pcreposix.c */