1 /* uwildmat.c is reused from libinn - https://launchpad.net/ubuntu/+source/inn2/2.5.4-1
2 
3    This provides wild card matching originally used in InterNetNews and is
4    described in https://tools.ietf.org/html/rfc3977#section-4
5 
6    INN licence:
7    INN as a whole and all code contained in it not otherwise marked with
8    different licenses and/or copyrights is covered by the following copyright
9    and license:
10 
11    Copyright (c) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
12    2013, 2014 by Internet Systems Consortium, Inc. ("ISC")
13    Copyright (c) 1991, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
14    2002, 2003 by The Internet Software Consortium and Rich Salz
15 
16    This code is derived from software contributed to the Internet Software
17    Consortium by Rich Salz.
18 
19    Permission to use, copy, modify, and distribute this software for any
20    purpose with or without fee is hereby granted, provided that the above
21    copyright notice and this permission notice appear in all copies.
22 
23    THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
24    REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
25    MERCHANTABILITY AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY
26    SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
27    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
28    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
29    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
30 
31 */
32 
33 /*  $Id: uwildmat.c 8918 2010-01-22 23:28:28Z iulius $
34  **
35  **  wildmat pattern matching with Unicode UTF-8 extensions.
36  **
37  **  Do shell-style pattern matching for ?, \, [], and * characters.  Might not
38  **  be robust in face of malformed patterns; e.g., "foo[a-" could cause a
39  **  segmentation violation.  It is 8-bit clean.  (Robustness hopefully fixed
40  **  July 2000; all malformed patterns should now just fail to match anything.)
41  **
42  **  Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
43  **  Rich $alz is now <rsalz@osf.org>.
44  **
45  **  April, 1991:  Replaced mutually-recursive calls with in-line code for the
46  **  star character.
47  **
48  **  Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.
49  **  This can greatly speed up failing wildcard patterns.  For example:
50  **
51  **	pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*
52  **	text 1:	 -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1
53  **	text 2:	 -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1
54  **
55  **  Text 1 matches with 51 calls, while text 2 fails with 54 calls.  Without
56  **  the ABORT code, it takes 22310 calls to fail.  Ugh.  The following
57  **  explanation is from Lars:
58  **
59  **  The precondition that must be fulfilled is that DoMatch will consume at
60  **  least one character in text.  This is true if *p is neither '*' nor '\0'.)
61  **  The last return has ABORT instead of false to avoid quadratic behaviour in
62  **  cases like pattern "*a*b*c*d" with text "abcxxxxx".  With false, each
63  **  star-loop has to run to the end of the text; with ABORT only the last one
64  **  does.
65  **
66  **  Once the control of one instance of DoMatch enters the star-loop, that
67  **  instance will return either true or ABORT, and any calling instance will
68  **  therefore return immediately after (without calling recursively again).
69  **  In effect, only one star-loop is ever active.  It would be possible to
70  **  modify the code to maintain this context explicitly, eliminating all
71  **  recursive calls at the cost of some complication and loss of clarity (and
72  **  the ABORT stuff seems to be unclear enough by itself).  I think it would
73  **  be unwise to try to get this into a released version unless you have a
74  **  good test data base to try it out on.
75  **
76  **  June, 1991:  Robert Elz <kre@munnari.oz.au> added minus and close bracket
77  **  handling for character sets.
78  **
79  **  July, 2000:  Largely rewritten by Russ Allbery <rra@stanford.edu> to add
80  **  support for ',', '!', and optionally '@' to the core wildmat routine.
81  **  Broke the character class matching into a separate function for clarity
82  **  since it's infrequently used in practice, and added some simple lookahead
83  **  to significantly decrease the recursive calls in the '*' matching code.
84  **  Added support for UTF-8 as the default character set for any high-bit
85  **  characters.
86  **
87  **  For more information on UTF-8, see RFC 3629.
88  **
89  **  Please note that this file is intentionally written so that conditionally
90  **  executed expressions are on separate lines from the condition to
91  **  facilitate analysis of the coverage of the test suite using purecov.
92  **  Please preserve this.  As of March 11, 2001, purecov reports that the
93  **  accompanying test suite achieves 100% coverage of this file.
94  */
95 
96 #include <ctype.h>
97 #include <string.h>
98 #include <stdint.h>
99 #include "uwildmat/uwildmat.h"
100 
101 #define ABORT -1
102 
103 /* Whether or not an octet looks like the start of a UTF-8 character. */
104 #define ISUTF8(c)       (((c) & 0xc0) == 0xc0)
105 
106 
107 /*
108  **  Determine the length of a non-ASCII character in octets (for advancing
109  **  pointers when skipping over characters).  Takes a pointer to the start of
110  **  the character and to the last octet of the string.  If end is NULL, expect
111  **  the string pointed to by start to be nul-terminated.  If the character is
112  **  malformed UTF-8, return 1 to treat it like an eight-bit local character.
113  */
114 static int
utf8_length(const unsigned char * start,const unsigned char * end)115 utf8_length(const unsigned char *start, const unsigned char *end)
116 {
117 	unsigned char mask = 0x80;
118 	const unsigned char *p;
119 	int length = 0;
120 	int left;
121 
122 	for (; mask > 0 && (*start & mask) == mask; mask >>= 1)
123 		length++;
124 	if (length < 2 || length > 6)
125 		return 1;
126 	if (end != NULL && (end - start + 1) < length)
127 		return 1;
128 	left = length - 1;
129 	for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)
130 		left--;
131 	return (left == 0) ? length : 1;
132 }
133 
134 
135 /*
136  **  Check whether a string contains only valid UTF-8 characters.
137  */
138 bool
is_valid_utf8(const char * text)139 is_valid_utf8(const char *text)
140 {
141 	unsigned char mask;
142 	const unsigned char *p;
143 	int length;
144 	int left;
145 
146 	for (p = (const unsigned char *)text; *p != '\0';) {
147 		mask = 0x80;
148 		length = 0;
149 
150 		/* Find out the expected length of the character. */
151 		for (; mask > 0 && (*p & mask) == mask; mask >>= 1)
152 			length++;
153 
154 		p++;
155 
156 		/* Valid ASCII. */
157 		if (length == 0)
158 			continue;
159 
160 		/* Invalid length. */
161 		if (length < 2 || length > 6)
162 			return false;
163 
164 		/* Check that each byte looks like 10xxxxxx, except for the first. */
165 		left = length - 1;
166 		for (; left > 0 && (*p & 0xc0) == 0x80; p++)
167 			left--;
168 
169 		if (left > 0)
170 			return false;
171 	}
172 
173 	return true;
174 }
175 
176 
177 /*
178  **  Convert a UTF-8 character to UCS-4.  Takes a pointer to the start of the
179  **  character and to the last octet of the string, and to a uint32_t into
180  **  which to put the decoded UCS-4 value.  If end is NULL, expect the string
181  **  pointed to by start to be nul-terminated.  Returns the number of octets in
182  **  the UTF-8 encoding.  If the UTF-8 character is malformed, set result to
183  **  the decimal value of the first octet; this is wrong, but it will generally
184  **  cause the rest of the wildmat matching to do the right thing for non-UTF-8
185  **  input.
186  */
187 static int
utf8_decode(const unsigned char * start,const unsigned char * end,uint32_t * result)188 utf8_decode(const unsigned char *start, const unsigned char *end,
189 	    uint32_t *result)
190 {
191 	uint32_t value = 0;
192 	int length, i;
193 	const unsigned char *p = start;
194 	unsigned char mask;
195 
196 	length = utf8_length(start, end);
197 	if (length < 2) {
198 		*result = *start;
199 		return 1;
200 	}
201 	mask = (1 << (7 - length)) - 1;
202 	value = *p & mask;
203 	p++;
204 	for (i = length - 1; i > 0; i--) {
205 		value = (value << 6) | (*p & 0x3f);
206 		p++;
207 	}
208 	*result = value;
209 	return length;
210 }
211 
212 
213 /*
214  **  Match a character class against text, a UCS-4 character.  start is a
215  **  pointer to the first character of the character class, end a pointer to
216  **  the last.  Returns whether the class matches that character.
217  */
218 static bool
match_class(uint32_t text,const unsigned char * start,const unsigned char * end)219 match_class(uint32_t text, const unsigned char *start,
220 	    const unsigned char *end)
221 {
222 	bool reversed, allowrange;
223 	const unsigned char *p = start;
224 	uint32_t first = 0;
225 	uint32_t last;
226 	uint32_t lc = tolower(text);
227 
228 	/* Check for an inverted character class (starting with ^).  If the
229 	   character matches the character class, we return !reversed; that way,
230 	   we return true if it's a regular character class and false if it's a
231 	   reversed one.  If the character doesn't match, we return reversed. */
232 	reversed = (*p == '^');
233 	if (reversed)
234 		p++;
235 
236 	/* Walk through the character class until we reach the end or find a
237 	   match, handling character ranges as we go.  Only permit a range to
238 	   start when allowrange is true; this allows - to be treated like a
239 	   normal character as the first character of the class and catches
240 	   malformed ranges like a-e-n.  We treat the character at the beginning
241 	   of a range as both a regular member of the class and the beginning of
242 	   the range; this is harmless (although it means that malformed ranges
243 	   like m-a will match m and nothing else). */
244 	allowrange = false;
245 	while (p <= end) {
246 		if (allowrange && *p == '-' && p < end) {
247 			p++;
248 			p += utf8_decode(p, end, &last);
249 			if ((text >= first && text <= last) ||
250 			    (lc >= first && lc <= last))
251 				return !reversed;
252 			allowrange = false;
253 		} else {
254 			p += utf8_decode(p, end, &first);
255 			if (text == first || lc == first)
256 				return !reversed;
257 			allowrange = true;
258 		}
259 	}
260 	return reversed;
261 }
262 
263 
264 /*
265  **  Match the text against the pattern between start and end.  This is a
266  **  single pattern; a leading ! or @ must already be taken care of, and
267  **  commas must be dealt with outside of this routine.
268  */
269 static int
match_pattern(const unsigned char * text,const unsigned char * start,const unsigned char * end)270 match_pattern(const unsigned char *text, const unsigned char *start,
271 	      const unsigned char *end)
272 {
273 	const unsigned char *q, *endclass;
274 	const unsigned char *p = start;
275 	bool ismeta;
276 	int matched, width;
277 	uint32_t c;
278 	unsigned char lc;
279 
280 	for (; p <= end; p++) {
281 		if (!*text && *p != '*')
282 			return ABORT;
283 
284 		switch (*p) {
285 		case '\\':
286 			if (!*++p)
287 				return ABORT;
288 			/* Fall through. */
289 
290 		default:
291 			lc = tolower(*text);
292 			if (*text++ != *p && lc != *p)
293 				return false;
294 			break;
295 
296 		case '?':
297 			text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;
298 			break;
299 
300 		case '*':
301 			/* Consecutive stars are equivalent to one.  Advance pattern to
302 			   the character after the star. */
303 			for (++p; *p == '*'; p++)
304 				;
305 
306 			/* A trailing star will match anything. */
307 			if (p > end)
308 				return true;
309 
310 			/* Basic algorithm: Recurse at each point where the * could
311 			   possibly match.  If the match succeeds or aborts, return
312 			   immediately; otherwise, try the next position.
313 
314 Optimization: If the character after the * in the pattern
315 isn't a metacharacter (the common case), then the * has to
316 consume characters at least up to the next occurrence of that
317 character in the text.  Scan forward for those points rather
318 than recursing at every possible point to save the extra
319 function call overhead. */
320 			ismeta = (*p == '[' || *p == '?' || *p == '\\');
321 			while (*text) {
322 				width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
323 				if (ismeta) {
324 					matched = match_pattern(text, p, end);
325 					text += width;
326 				} else {
327 					while (*text && *text != *p) {
328 						text += width;
329 						width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
330 					}
331 					if (!*text)
332 						return ABORT;
333 					matched = match_pattern(++text, p + 1, end);
334 				}
335 				if (matched != false)
336 					return matched;
337 			}
338 			return ABORT;
339 
340 		case '[':
341 			/* Find the end of the character class, making sure not to pick
342 			   up a close bracket at the beginning of the class. */
343 			p++;
344 			q = p + (*p == '^') + 1;
345 			if (q > end)
346 				return ABORT;
347 			endclass = memchr(q, ']', (size_t) (end - q + 1));
348 			if (!endclass)
349 				return ABORT;
350 
351 			/* Do the heavy lifting in another function for clarity, since
352 			   character classes are an uncommon case. */
353 			text += utf8_decode(text, NULL, &c);
354 			if (!match_class(c, p, endclass - 1))
355 				return false;
356 			p = endclass;
357 			break;
358 		}
359 	}
360 
361 	return (*text == '\0');
362 }
363 
364 
365 /*
366  **  Takes text and a wildmat expression; a wildmat expression is a
367  **  comma-separated list of wildmat patterns, optionally preceded by ! to
368  **  invert the sense of the expression.  Returns UWILDMAT_MATCH if that
369  **  expression matches the text, UWILDMAT_FAIL otherwise.  If allowpoison is
370  **  set, allow @ to introduce a poison expression (the same as !, but if it
371  **  triggers the failed match the routine returns UWILDMAT_POISON instead).
372  */
373 static enum uwildmat
match_expression(const unsigned char * text,const unsigned char * start,bool allowpoison)374 match_expression(const unsigned char *text, const unsigned char *start,
375 		 bool allowpoison)
376 {
377 	const unsigned char *end, *split;
378 	const unsigned char *p = start;
379 	bool reverse, escaped;
380 	bool match = false;
381 	bool poison = false;
382 	bool poisoned = false;
383 
384 	/* Handle the empty expression separately, since otherwise end will be
385 	   set to an invalid pointer. */
386 	if (!*p)
387 		return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;
388 	end = start + strlen((const char *) start) - 1;
389 
390 	/* Main match loop.  Find each comma that separates patterns, and attempt
391 	   to match the text with each pattern in order.  The last matching
392 	   pattern determines whether the whole expression matches. */
393 	for (; p <= end + 1; p = split + 1) {
394 		if (allowpoison)
395 			poison = (*p == '@');
396 		reverse = (*p == '!') || poison;
397 		if (reverse)
398 			p++;
399 
400 		/* Find the first unescaped comma, if any.  If there is none, split
401 		   will be one greater than end and point at the nul at the end of
402 		   the string. */
403 		for (escaped = false, split = p; split <= end; split++) {
404 			if (*split == '[') {
405 				split++;
406 				if (*split == ']')
407 					split++;
408 				while (split <= end && *split != ']')
409 					split++;
410 			}
411 			if (*split == ',' && !escaped)
412 				break;
413 			escaped = (*split == '\\') ? !escaped : false;
414 		}
415 
416 		/* Optimization: If match == !reverse and poison == poisoned, this
417 		   pattern can't change the result, so don't do any work. */
418 		if (match == !reverse && poison == poisoned)
419 			continue;
420 		if (match_pattern(text, p, split - 1) == true) {
421 			poisoned = poison;
422 			match = !reverse;
423 		}
424 	}
425 	if (poisoned)
426 		return UWILDMAT_POISON;
427 	return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;
428 }
429 
430 
431 /*
432  **  User-level routine used for wildmats where @ should be treated as a
433  **  regular character.
434  */
435 bool
uwildmat(const char * text,const char * pat)436 uwildmat(const char *text, const char *pat)
437 {
438 	const unsigned char *utext = (const unsigned char *) text;
439 	const unsigned char *upat = (const unsigned char *) pat;
440 
441 	if (upat[0] == '*' && upat[1] == '\0')
442 		return true;
443 	else
444 		return (match_expression(utext, upat, false) == UWILDMAT_MATCH);
445 }
446 
447 
448 /*
449  **  User-level routine used for wildmats that support poison matches.
450  */
451 enum uwildmat
uwildmat_poison(const char * text,const char * pat)452 uwildmat_poison(const char *text, const char *pat)
453 {
454 	const unsigned char *utext = (const unsigned char *) text;
455 	const unsigned char *upat = (const unsigned char *) pat;
456 
457 	if (upat[0] == '*' && upat[1] == '\0')
458 		return UWILDMAT_MATCH;
459 	else
460 		return match_expression(utext, upat, true);
461 }
462 
463 
464 /*
465  **  User-level routine for simple expressions (neither , nor ! are special).
466  */
467 bool
uwildmat_simple(const char * text,const char * pat)468 uwildmat_simple(const char *text, const char *pat)
469 {
470 	const unsigned char *utext = (const unsigned char *) text;
471 	const unsigned char *upat = (const unsigned char *) pat;
472 	size_t length;
473 
474 	if (upat[0] == '*' && upat[1] == '\0')
475 		return true;
476 	else {
477 		length = strlen(pat);
478 		return (match_pattern(utext, upat, upat + length - 1) == true);
479 	}
480 }
481