1 /* uwildmat.c is reused from libinn - https://launchpad.net/ubuntu/+source/inn2/2.5.4-1
2
3 This provides wild card matching originally used in InterNetNews and is
4 described in https://tools.ietf.org/html/rfc3977#section-4
5
6 INN licence:
7 INN as a whole and all code contained in it not otherwise marked with
8 different licenses and/or copyrights is covered by the following copyright
9 and license:
10
11 Copyright (c) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
12 2013, 2014 by Internet Systems Consortium, Inc. ("ISC")
13 Copyright (c) 1991, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
14 2002, 2003 by The Internet Software Consortium and Rich Salz
15
16 This code is derived from software contributed to the Internet Software
17 Consortium by Rich Salz.
18
19 Permission to use, copy, modify, and distribute this software for any
20 purpose with or without fee is hereby granted, provided that the above
21 copyright notice and this permission notice appear in all copies.
22
23 THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
24 REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
26 SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30
31 */
32
33 /* $Id: uwildmat.c 8918 2010-01-22 23:28:28Z iulius $
34 **
35 ** wildmat pattern matching with Unicode UTF-8 extensions.
36 **
37 ** Do shell-style pattern matching for ?, \, [], and * characters. Might not
38 ** be robust in face of malformed patterns; e.g., "foo[a-" could cause a
39 ** segmentation violation. It is 8-bit clean. (Robustness hopefully fixed
40 ** July 2000; all malformed patterns should now just fail to match anything.)
41 **
42 ** Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
43 ** Rich $alz is now <rsalz@osf.org>.
44 **
45 ** April, 1991: Replaced mutually-recursive calls with in-line code for the
46 ** star character.
47 **
48 ** Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.
49 ** This can greatly speed up failing wildcard patterns. For example:
50 **
51 ** pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*
52 ** text 1: -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1
53 ** text 2: -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1
54 **
55 ** Text 1 matches with 51 calls, while text 2 fails with 54 calls. Without
56 ** the ABORT code, it takes 22310 calls to fail. Ugh. The following
57 ** explanation is from Lars:
58 **
59 ** The precondition that must be fulfilled is that DoMatch will consume at
60 ** least one character in text. This is true if *p is neither '*' nor '\0'.)
61 ** The last return has ABORT instead of false to avoid quadratic behaviour in
62 ** cases like pattern "*a*b*c*d" with text "abcxxxxx". With false, each
63 ** star-loop has to run to the end of the text; with ABORT only the last one
64 ** does.
65 **
66 ** Once the control of one instance of DoMatch enters the star-loop, that
67 ** instance will return either true or ABORT, and any calling instance will
68 ** therefore return immediately after (without calling recursively again).
69 ** In effect, only one star-loop is ever active. It would be possible to
70 ** modify the code to maintain this context explicitly, eliminating all
71 ** recursive calls at the cost of some complication and loss of clarity (and
72 ** the ABORT stuff seems to be unclear enough by itself). I think it would
73 ** be unwise to try to get this into a released version unless you have a
74 ** good test data base to try it out on.
75 **
76 ** June, 1991: Robert Elz <kre@munnari.oz.au> added minus and close bracket
77 ** handling for character sets.
78 **
79 ** July, 2000: Largely rewritten by Russ Allbery <rra@stanford.edu> to add
80 ** support for ',', '!', and optionally '@' to the core wildmat routine.
81 ** Broke the character class matching into a separate function for clarity
82 ** since it's infrequently used in practice, and added some simple lookahead
83 ** to significantly decrease the recursive calls in the '*' matching code.
84 ** Added support for UTF-8 as the default character set for any high-bit
85 ** characters.
86 **
87 ** For more information on UTF-8, see RFC 3629.
88 **
89 ** Please note that this file is intentionally written so that conditionally
90 ** executed expressions are on separate lines from the condition to
91 ** facilitate analysis of the coverage of the test suite using purecov.
92 ** Please preserve this. As of March 11, 2001, purecov reports that the
93 ** accompanying test suite achieves 100% coverage of this file.
94 */
95
96 #include <ctype.h>
97 #include <string.h>
98 #include <stdint.h>
99 #include "uwildmat/uwildmat.h"
100
101 #define ABORT -1
102
103 /* Whether or not an octet looks like the start of a UTF-8 character. */
104 #define ISUTF8(c) (((c) & 0xc0) == 0xc0)
105
106
107 /*
108 ** Determine the length of a non-ASCII character in octets (for advancing
109 ** pointers when skipping over characters). Takes a pointer to the start of
110 ** the character and to the last octet of the string. If end is NULL, expect
111 ** the string pointed to by start to be nul-terminated. If the character is
112 ** malformed UTF-8, return 1 to treat it like an eight-bit local character.
113 */
114 static int
utf8_length(const unsigned char * start,const unsigned char * end)115 utf8_length(const unsigned char *start, const unsigned char *end)
116 {
117 unsigned char mask = 0x80;
118 const unsigned char *p;
119 int length = 0;
120 int left;
121
122 for (; mask > 0 && (*start & mask) == mask; mask >>= 1)
123 length++;
124 if (length < 2 || length > 6)
125 return 1;
126 if (end != NULL && (end - start + 1) < length)
127 return 1;
128 left = length - 1;
129 for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)
130 left--;
131 return (left == 0) ? length : 1;
132 }
133
134
135 /*
136 ** Check whether a string contains only valid UTF-8 characters.
137 */
138 bool
is_valid_utf8(const char * text)139 is_valid_utf8(const char *text)
140 {
141 unsigned char mask;
142 const unsigned char *p;
143 int length;
144 int left;
145
146 for (p = (const unsigned char *)text; *p != '\0';) {
147 mask = 0x80;
148 length = 0;
149
150 /* Find out the expected length of the character. */
151 for (; mask > 0 && (*p & mask) == mask; mask >>= 1)
152 length++;
153
154 p++;
155
156 /* Valid ASCII. */
157 if (length == 0)
158 continue;
159
160 /* Invalid length. */
161 if (length < 2 || length > 6)
162 return false;
163
164 /* Check that each byte looks like 10xxxxxx, except for the first. */
165 left = length - 1;
166 for (; left > 0 && (*p & 0xc0) == 0x80; p++)
167 left--;
168
169 if (left > 0)
170 return false;
171 }
172
173 return true;
174 }
175
176
177 /*
178 ** Convert a UTF-8 character to UCS-4. Takes a pointer to the start of the
179 ** character and to the last octet of the string, and to a uint32_t into
180 ** which to put the decoded UCS-4 value. If end is NULL, expect the string
181 ** pointed to by start to be nul-terminated. Returns the number of octets in
182 ** the UTF-8 encoding. If the UTF-8 character is malformed, set result to
183 ** the decimal value of the first octet; this is wrong, but it will generally
184 ** cause the rest of the wildmat matching to do the right thing for non-UTF-8
185 ** input.
186 */
187 static int
utf8_decode(const unsigned char * start,const unsigned char * end,uint32_t * result)188 utf8_decode(const unsigned char *start, const unsigned char *end,
189 uint32_t *result)
190 {
191 uint32_t value = 0;
192 int length, i;
193 const unsigned char *p = start;
194 unsigned char mask;
195
196 length = utf8_length(start, end);
197 if (length < 2) {
198 *result = *start;
199 return 1;
200 }
201 mask = (1 << (7 - length)) - 1;
202 value = *p & mask;
203 p++;
204 for (i = length - 1; i > 0; i--) {
205 value = (value << 6) | (*p & 0x3f);
206 p++;
207 }
208 *result = value;
209 return length;
210 }
211
212
213 /*
214 ** Match a character class against text, a UCS-4 character. start is a
215 ** pointer to the first character of the character class, end a pointer to
216 ** the last. Returns whether the class matches that character.
217 */
218 static bool
match_class(uint32_t text,const unsigned char * start,const unsigned char * end)219 match_class(uint32_t text, const unsigned char *start,
220 const unsigned char *end)
221 {
222 bool reversed, allowrange;
223 const unsigned char *p = start;
224 uint32_t first = 0;
225 uint32_t last;
226 uint32_t lc = tolower(text);
227
228 /* Check for an inverted character class (starting with ^). If the
229 character matches the character class, we return !reversed; that way,
230 we return true if it's a regular character class and false if it's a
231 reversed one. If the character doesn't match, we return reversed. */
232 reversed = (*p == '^');
233 if (reversed)
234 p++;
235
236 /* Walk through the character class until we reach the end or find a
237 match, handling character ranges as we go. Only permit a range to
238 start when allowrange is true; this allows - to be treated like a
239 normal character as the first character of the class and catches
240 malformed ranges like a-e-n. We treat the character at the beginning
241 of a range as both a regular member of the class and the beginning of
242 the range; this is harmless (although it means that malformed ranges
243 like m-a will match m and nothing else). */
244 allowrange = false;
245 while (p <= end) {
246 if (allowrange && *p == '-' && p < end) {
247 p++;
248 p += utf8_decode(p, end, &last);
249 if ((text >= first && text <= last) ||
250 (lc >= first && lc <= last))
251 return !reversed;
252 allowrange = false;
253 } else {
254 p += utf8_decode(p, end, &first);
255 if (text == first || lc == first)
256 return !reversed;
257 allowrange = true;
258 }
259 }
260 return reversed;
261 }
262
263
264 /*
265 ** Match the text against the pattern between start and end. This is a
266 ** single pattern; a leading ! or @ must already be taken care of, and
267 ** commas must be dealt with outside of this routine.
268 */
269 static int
match_pattern(const unsigned char * text,const unsigned char * start,const unsigned char * end)270 match_pattern(const unsigned char *text, const unsigned char *start,
271 const unsigned char *end)
272 {
273 const unsigned char *q, *endclass;
274 const unsigned char *p = start;
275 bool ismeta;
276 int matched, width;
277 uint32_t c;
278 unsigned char lc;
279
280 for (; p <= end; p++) {
281 if (!*text && *p != '*')
282 return ABORT;
283
284 switch (*p) {
285 case '\\':
286 if (!*++p)
287 return ABORT;
288 /* Fall through. */
289
290 default:
291 lc = tolower(*text);
292 if (*text++ != *p && lc != *p)
293 return false;
294 break;
295
296 case '?':
297 text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;
298 break;
299
300 case '*':
301 /* Consecutive stars are equivalent to one. Advance pattern to
302 the character after the star. */
303 for (++p; *p == '*'; p++)
304 ;
305
306 /* A trailing star will match anything. */
307 if (p > end)
308 return true;
309
310 /* Basic algorithm: Recurse at each point where the * could
311 possibly match. If the match succeeds or aborts, return
312 immediately; otherwise, try the next position.
313
314 Optimization: If the character after the * in the pattern
315 isn't a metacharacter (the common case), then the * has to
316 consume characters at least up to the next occurrence of that
317 character in the text. Scan forward for those points rather
318 than recursing at every possible point to save the extra
319 function call overhead. */
320 ismeta = (*p == '[' || *p == '?' || *p == '\\');
321 while (*text) {
322 width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
323 if (ismeta) {
324 matched = match_pattern(text, p, end);
325 text += width;
326 } else {
327 while (*text && *text != *p) {
328 text += width;
329 width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
330 }
331 if (!*text)
332 return ABORT;
333 matched = match_pattern(++text, p + 1, end);
334 }
335 if (matched != false)
336 return matched;
337 }
338 return ABORT;
339
340 case '[':
341 /* Find the end of the character class, making sure not to pick
342 up a close bracket at the beginning of the class. */
343 p++;
344 q = p + (*p == '^') + 1;
345 if (q > end)
346 return ABORT;
347 endclass = memchr(q, ']', (size_t) (end - q + 1));
348 if (!endclass)
349 return ABORT;
350
351 /* Do the heavy lifting in another function for clarity, since
352 character classes are an uncommon case. */
353 text += utf8_decode(text, NULL, &c);
354 if (!match_class(c, p, endclass - 1))
355 return false;
356 p = endclass;
357 break;
358 }
359 }
360
361 return (*text == '\0');
362 }
363
364
365 /*
366 ** Takes text and a wildmat expression; a wildmat expression is a
367 ** comma-separated list of wildmat patterns, optionally preceded by ! to
368 ** invert the sense of the expression. Returns UWILDMAT_MATCH if that
369 ** expression matches the text, UWILDMAT_FAIL otherwise. If allowpoison is
370 ** set, allow @ to introduce a poison expression (the same as !, but if it
371 ** triggers the failed match the routine returns UWILDMAT_POISON instead).
372 */
373 static enum uwildmat
match_expression(const unsigned char * text,const unsigned char * start,bool allowpoison)374 match_expression(const unsigned char *text, const unsigned char *start,
375 bool allowpoison)
376 {
377 const unsigned char *end, *split;
378 const unsigned char *p = start;
379 bool reverse, escaped;
380 bool match = false;
381 bool poison = false;
382 bool poisoned = false;
383
384 /* Handle the empty expression separately, since otherwise end will be
385 set to an invalid pointer. */
386 if (!*p)
387 return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;
388 end = start + strlen((const char *) start) - 1;
389
390 /* Main match loop. Find each comma that separates patterns, and attempt
391 to match the text with each pattern in order. The last matching
392 pattern determines whether the whole expression matches. */
393 for (; p <= end + 1; p = split + 1) {
394 if (allowpoison)
395 poison = (*p == '@');
396 reverse = (*p == '!') || poison;
397 if (reverse)
398 p++;
399
400 /* Find the first unescaped comma, if any. If there is none, split
401 will be one greater than end and point at the nul at the end of
402 the string. */
403 for (escaped = false, split = p; split <= end; split++) {
404 if (*split == '[') {
405 split++;
406 if (*split == ']')
407 split++;
408 while (split <= end && *split != ']')
409 split++;
410 }
411 if (*split == ',' && !escaped)
412 break;
413 escaped = (*split == '\\') ? !escaped : false;
414 }
415
416 /* Optimization: If match == !reverse and poison == poisoned, this
417 pattern can't change the result, so don't do any work. */
418 if (match == !reverse && poison == poisoned)
419 continue;
420 if (match_pattern(text, p, split - 1) == true) {
421 poisoned = poison;
422 match = !reverse;
423 }
424 }
425 if (poisoned)
426 return UWILDMAT_POISON;
427 return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;
428 }
429
430
431 /*
432 ** User-level routine used for wildmats where @ should be treated as a
433 ** regular character.
434 */
435 bool
uwildmat(const char * text,const char * pat)436 uwildmat(const char *text, const char *pat)
437 {
438 const unsigned char *utext = (const unsigned char *) text;
439 const unsigned char *upat = (const unsigned char *) pat;
440
441 if (upat[0] == '*' && upat[1] == '\0')
442 return true;
443 else
444 return (match_expression(utext, upat, false) == UWILDMAT_MATCH);
445 }
446
447
448 /*
449 ** User-level routine used for wildmats that support poison matches.
450 */
451 enum uwildmat
uwildmat_poison(const char * text,const char * pat)452 uwildmat_poison(const char *text, const char *pat)
453 {
454 const unsigned char *utext = (const unsigned char *) text;
455 const unsigned char *upat = (const unsigned char *) pat;
456
457 if (upat[0] == '*' && upat[1] == '\0')
458 return UWILDMAT_MATCH;
459 else
460 return match_expression(utext, upat, true);
461 }
462
463
464 /*
465 ** User-level routine for simple expressions (neither , nor ! are special).
466 */
467 bool
uwildmat_simple(const char * text,const char * pat)468 uwildmat_simple(const char *text, const char *pat)
469 {
470 const unsigned char *utext = (const unsigned char *) text;
471 const unsigned char *upat = (const unsigned char *) pat;
472 size_t length;
473
474 if (upat[0] == '*' && upat[1] == '\0')
475 return true;
476 else {
477 length = strlen(pat);
478 return (match_pattern(utext, upat, upat + length - 1) == true);
479 }
480 }
481