• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*************************************************
2  *      Perl-Compatible Regular Expressions       *
3  *************************************************/
4  
5  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.
7  
8                         Written by Philip Hazel
9       Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016 University of Cambridge
11  
12  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without
14  modification, are permitted provided that the following conditions are met:
15  
16      * Redistributions of source code must retain the above copyright notice,
17        this list of conditions and the following disclaimer.
18  
19      * Redistributions in binary form must reproduce the above copyright
20        notice, this list of conditions and the following disclaimer in the
21        documentation and/or other materials provided with the distribution.
22  
23      * Neither the name of the University of Cambridge nor the names of its
24        contributors may be used to endorse or promote products derived from
25        this software without specific prior written permission.
26  
27  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37  POSSIBILITY OF SUCH DAMAGE.
38  -----------------------------------------------------------------------------
39  */
40  
/* This module contains an internal function that is used to match an extended
class. It is used by pcre2_auto_possessify() and by both pcre2_match() and
pcre2_dfa_match(). */
44  
45  
46  #ifdef HAVE_CONFIG_H
47  #include "config.h"
48  #endif
49  
50  
51  #include "pcre2_internal.h"
52  
53  /*************************************************
54  *       Match character against an XCLASS        *
55  *************************************************/
56  
57  /* This function is called to match a character against an extended class that
58  might contain codepoints above 255 and/or Unicode properties.
59  
60  Arguments:
61    c           the character
62    data        points to the flag code unit of the XCLASS data
63    utf         TRUE if in UTF mode
64  
65  Returns:      TRUE if character matches, else FALSE
66  */
67  
68  BOOL
PRIV(xclass)69  PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf)
70  {
71  PCRE2_UCHAR t;
72  BOOL negated = (*data & XCL_NOT) != 0;
73  
74  #if PCRE2_CODE_UNIT_WIDTH == 8
75  /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
76  utf = TRUE;
77  #endif
78  
79  /* Code points < 256 are matched against a bitmap, if one is present. If not,
80  we still carry on, because there may be ranges that start below 256 in the
81  additional data. */
82  
83  if (c < 256)
84    {
85    if ((*data & XCL_HASPROP) == 0)
86      {
87      if ((*data & XCL_MAP) == 0) return negated;
88      return (((uint8_t *)(data + 1))[c/8] & (1 << (c&7))) != 0;
89      }
90    if ((*data & XCL_MAP) != 0 &&
91      (((uint8_t *)(data + 1))[c/8] & (1 << (c&7))) != 0)
92      return !negated; /* char found */
93    }
94  
95  /* First skip the bit map if present. Then match against the list of Unicode
96  properties or large chars or ranges that end with a large char. We won't ever
97  encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */
98  
99  if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(PCRE2_UCHAR);
100  
101  while ((t = *data++) != XCL_END)
102    {
103    uint32_t x, y;
104    if (t == XCL_SINGLE)
105      {
106  #ifdef SUPPORT_UNICODE
107      if (utf)
108        {
109        GETCHARINC(x, data); /* macro generates multiple statements */
110        }
111      else
112  #endif
113      x = *data++;
114      if (c == x) return !negated;
115      }
116    else if (t == XCL_RANGE)
117      {
118  #ifdef SUPPORT_UNICODE
119      if (utf)
120        {
121        GETCHARINC(x, data); /* macro generates multiple statements */
122        GETCHARINC(y, data); /* macro generates multiple statements */
123        }
124      else
125  #endif
126        {
127        x = *data++;
128        y = *data++;
129        }
130      if (c >= x && c <= y) return !negated;
131      }
132  
133  #ifdef SUPPORT_UNICODE
134    else  /* XCL_PROP & XCL_NOTPROP */
135      {
136      const ucd_record *prop = GET_UCD(c);
137      BOOL isprop = t == XCL_PROP;
138  
139      switch(*data)
140        {
141        case PT_ANY:
142        if (isprop) return !negated;
143        break;
144  
145        case PT_LAMP:
146        if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
147             prop->chartype == ucp_Lt) == isprop) return !negated;
148        break;
149  
150        case PT_GC:
151        if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
152          return !negated;
153        break;
154  
155        case PT_PC:
156        if ((data[1] == prop->chartype) == isprop) return !negated;
157        break;
158  
159        case PT_SC:
160        if ((data[1] == prop->script) == isprop) return !negated;
161        break;
162  
163        case PT_ALNUM:
164        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
165             PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
166          return !negated;
167        break;
168  
169        /* Perl space used to exclude VT, but from Perl 5.18 it is included,
170        which means that Perl space and POSIX space are now identical. PCRE
171        was changed at release 8.34. */
172  
173        case PT_SPACE:    /* Perl space */
174        case PT_PXSPACE:  /* POSIX space */
175        switch(c)
176          {
177          HSPACE_CASES:
178          VSPACE_CASES:
179          if (isprop) return !negated;
180          break;
181  
182          default:
183          if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
184            return !negated;
185          break;
186          }
187        break;
188  
189        case PT_WORD:
190        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
191             PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
192               == isprop)
193          return !negated;
194        break;
195  
196        case PT_UCNC:
197        if (c < 0xa0)
198          {
199          if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
200               c == CHAR_GRAVE_ACCENT) == isprop)
201            return !negated;
202          }
203        else
204          {
205          if ((c < 0xd800 || c > 0xdfff) == isprop)
206            return !negated;
207          }
208        break;
209  
210        /* The following three properties can occur only in an XCLASS, as there
211        is no \p or \P coding for them. */
212  
213        /* Graphic character. Implement this as not Z (space or separator) and
214        not C (other), except for Cf (format) with a few exceptions. This seems
215        to be what Perl does. The exceptional characters are:
216  
217        U+061C           Arabic Letter Mark
218        U+180E           Mongolian Vowel Separator
219        U+2066 - U+2069  Various "isolate"s
220        */
221  
222        case PT_PXGRAPH:
223        if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
224              (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
225                (prop->chartype == ucp_Cf &&
226                  c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
227           )) == isprop)
228          return !negated;
229        break;
230  
231        /* Printable character: same as graphic, with the addition of Zs, i.e.
232        not Zl and not Zp, and U+180E. */
233  
234        case PT_PXPRINT:
235        if ((prop->chartype != ucp_Zl &&
236             prop->chartype != ucp_Zp &&
237              (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
238                (prop->chartype == ucp_Cf &&
239                  c != 0x061c && (c < 0x2066 || c > 0x2069))
240           )) == isprop)
241          return !negated;
242        break;
243  
244        /* Punctuation: all Unicode punctuation, plus ASCII characters that
245        Unicode treats as symbols rather than punctuation, for Perl
246        compatibility (these are $+<=>^`|~). */
247  
248        case PT_PXPUNCT:
249        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
250              (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
251          return !negated;
252        break;
253  
254        /* This should never occur, but compilers may mutter if there is no
255        default. */
256  
257        default:
258        return FALSE;
259        }
260  
261      data += 2;
262      }
263  #else
264    (void)utf;  /* Avoid compiler warning */
265  #endif  /* SUPPORT_UNICODE */
266    }
267  
268  return negated;   /* char did not match */
269  }
270  
271  /* End of pcre2_xclass.c */
272