1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_STRING_SEARCH_H_
6 #define V8_STRING_SEARCH_H_
7
8 namespace v8 {
9 namespace internal {
10
11
12 //---------------------------------------------------------------------
13 // String Search object.
14 //---------------------------------------------------------------------
15
16 // Class holding constants and methods that apply to all string search variants,
17 // independently of subject and pattern char size.
18 class StringSearchBase {
19 protected:
20 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
21 // limit, we can fix the size of tables. For a needle longer than this limit,
22 // search will not be optimal, since we only build tables for a suffix
23 // of the string, but it is a safe approximation.
24 static const int kBMMaxShift = Isolate::kBMMaxShift;
25
26 // Reduce alphabet to this size.
27 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
28 // proportional to the input alphabet. We reduce the alphabet size by
29 // equating input characters modulo a smaller alphabet size. This gives
30 // a potentially less efficient searching, but is a safe approximation.
31 // For needles using only characters in the same Unicode 256-code point page,
32 // there is no search speed degradation.
33 static const int kLatin1AlphabetSize = 256;
34 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
35
36 // Bad-char shift table stored in the state. It's length is the alphabet size.
37 // For patterns below this length, the skip length of Boyer-Moore is too short
38 // to compensate for the algorithmic overhead compared to simple brute force.
39 static const int kBMMinPatternLength = 7;
40
IsOneByteString(Vector<const uint8_t> string)41 static inline bool IsOneByteString(Vector<const uint8_t> string) {
42 return true;
43 }
44
IsOneByteString(Vector<const uc16> string)45 static inline bool IsOneByteString(Vector<const uc16> string) {
46 return String::IsOneByte(string.start(), string.length());
47 }
48
49 friend class Isolate;
50 };
51
52
53 template <typename PatternChar, typename SubjectChar>
54 class StringSearch : private StringSearchBase {
55 public:
StringSearch(Isolate * isolate,Vector<const PatternChar> pattern)56 StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
57 : isolate_(isolate),
58 pattern_(pattern),
59 start_(Max(0, pattern.length() - kBMMaxShift)) {
60 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
61 if (!IsOneByteString(pattern_)) {
62 strategy_ = &FailSearch;
63 return;
64 }
65 }
66 int pattern_length = pattern_.length();
67 if (pattern_length < kBMMinPatternLength) {
68 if (pattern_length == 1) {
69 strategy_ = &SingleCharSearch;
70 return;
71 }
72 strategy_ = &LinearSearch;
73 return;
74 }
75 strategy_ = &InitialSearch;
76 }
77
Search(Vector<const SubjectChar> subject,int index)78 int Search(Vector<const SubjectChar> subject, int index) {
79 return strategy_(this, subject, index);
80 }
81
AlphabetSize()82 static inline int AlphabetSize() {
83 if (sizeof(PatternChar) == 1) {
84 // Latin1 needle.
85 return kLatin1AlphabetSize;
86 } else {
87 DCHECK(sizeof(PatternChar) == 2);
88 // UC16 needle.
89 return kUC16AlphabetSize;
90 }
91 }
92
93 private:
94 typedef int (*SearchFunction)( // NOLINT - it's not a cast!
95 StringSearch<PatternChar, SubjectChar>*,
96 Vector<const SubjectChar>,
97 int);
98
FailSearch(StringSearch<PatternChar,SubjectChar> *,Vector<const SubjectChar>,int)99 static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
100 Vector<const SubjectChar>,
101 int) {
102 return -1;
103 }
104
105 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
106 Vector<const SubjectChar> subject,
107 int start_index);
108
109 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
110 Vector<const SubjectChar> subject,
111 int start_index);
112
113 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
114 Vector<const SubjectChar> subject,
115 int start_index);
116
117 static int BoyerMooreHorspoolSearch(
118 StringSearch<PatternChar, SubjectChar>* search,
119 Vector<const SubjectChar> subject,
120 int start_index);
121
122 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
123 Vector<const SubjectChar> subject,
124 int start_index);
125
126 void PopulateBoyerMooreHorspoolTable();
127
128 void PopulateBoyerMooreTable();
129
exceedsOneByte(uint8_t c)130 static inline bool exceedsOneByte(uint8_t c) {
131 return false;
132 }
133
exceedsOneByte(uint16_t c)134 static inline bool exceedsOneByte(uint16_t c) {
135 return c > String::kMaxOneByteCharCodeU;
136 }
137
CharOccurrence(int * bad_char_occurrence,SubjectChar char_code)138 static inline int CharOccurrence(int* bad_char_occurrence,
139 SubjectChar char_code) {
140 if (sizeof(SubjectChar) == 1) {
141 return bad_char_occurrence[static_cast<int>(char_code)];
142 }
143 if (sizeof(PatternChar) == 1) {
144 if (exceedsOneByte(char_code)) {
145 return -1;
146 }
147 return bad_char_occurrence[static_cast<unsigned int>(char_code)];
148 }
149 // Both pattern and subject are UC16. Reduce character to equivalence class.
150 int equiv_class = char_code % kUC16AlphabetSize;
151 return bad_char_occurrence[equiv_class];
152 }
153
154 // The following tables are shared by all searches.
155 // TODO(lrn): Introduce a way for a pattern to keep its tables
156 // between searches (e.g., for an Atom RegExp).
157
158 // Store for the BoyerMoore(Horspool) bad char shift table.
159 // Return a table covering the last kBMMaxShift+1 positions of
160 // pattern.
bad_char_table()161 int* bad_char_table() {
162 return isolate_->bad_char_shift_table();
163 }
164
165 // Store for the BoyerMoore good suffix shift table.
good_suffix_shift_table()166 int* good_suffix_shift_table() {
167 // Return biased pointer that maps the range [start_..pattern_.length()
168 // to the kGoodSuffixShiftTable array.
169 return isolate_->good_suffix_shift_table() - start_;
170 }
171
172 // Table used temporarily while building the BoyerMoore good suffix
173 // shift table.
suffix_table()174 int* suffix_table() {
175 // Return biased pointer that maps the range [start_..pattern_.length()
176 // to the kSuffixTable array.
177 return isolate_->suffix_table() - start_;
178 }
179
180 Isolate* isolate_;
181 // The pattern to search for.
182 Vector<const PatternChar> pattern_;
183 // Pointer to implementation of the search.
184 SearchFunction strategy_;
185 // Cache value of Max(0, pattern_length() - kBMMaxShift)
186 int start_;
187 };
188
189
190 //---------------------------------------------------------------------
191 // Single Character Pattern Search Strategy
192 //---------------------------------------------------------------------
193
194 template <typename PatternChar, typename SubjectChar>
SingleCharSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)195 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
196 StringSearch<PatternChar, SubjectChar>* search,
197 Vector<const SubjectChar> subject,
198 int index) {
199 DCHECK_EQ(1, search->pattern_.length());
200 PatternChar pattern_first_char = search->pattern_[0];
201 int i = index;
202 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
203 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
204 memchr(subject.start() + i,
205 pattern_first_char,
206 subject.length() - i));
207 if (pos == NULL) return -1;
208 return static_cast<int>(pos - subject.start());
209 } else {
210 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
211 if (exceedsOneByte(pattern_first_char)) {
212 return -1;
213 }
214 }
215 SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
216 int n = subject.length();
217 while (i < n) {
218 if (subject[i++] == search_char) return i - 1;
219 }
220 return -1;
221 }
222 }
223
224 //---------------------------------------------------------------------
225 // Linear Search Strategy
226 //---------------------------------------------------------------------
227
228
229 template <typename PatternChar, typename SubjectChar>
CharCompare(const PatternChar * pattern,const SubjectChar * subject,int length)230 inline bool CharCompare(const PatternChar* pattern,
231 const SubjectChar* subject,
232 int length) {
233 DCHECK(length > 0);
234 int pos = 0;
235 do {
236 if (pattern[pos] != subject[pos]) {
237 return false;
238 }
239 pos++;
240 } while (pos < length);
241 return true;
242 }
243
244
245 // Simple linear search for short patterns. Never bails out.
246 template <typename PatternChar, typename SubjectChar>
LinearSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)247 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
248 StringSearch<PatternChar, SubjectChar>* search,
249 Vector<const SubjectChar> subject,
250 int index) {
251 Vector<const PatternChar> pattern = search->pattern_;
252 DCHECK(pattern.length() > 1);
253 int pattern_length = pattern.length();
254 PatternChar pattern_first_char = pattern[0];
255 int i = index;
256 int n = subject.length() - pattern_length;
257 while (i <= n) {
258 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
259 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
260 memchr(subject.start() + i,
261 pattern_first_char,
262 n - i + 1));
263 if (pos == NULL) return -1;
264 i = static_cast<int>(pos - subject.start()) + 1;
265 } else {
266 if (subject[i++] != pattern_first_char) continue;
267 }
268 // Loop extracted to separate function to allow using return to do
269 // a deeper break.
270 if (CharCompare(pattern.start() + 1,
271 subject.start() + i,
272 pattern_length - 1)) {
273 return i - 1;
274 }
275 }
276 return -1;
277 }
278
279 //---------------------------------------------------------------------
280 // Boyer-Moore string search
281 //---------------------------------------------------------------------
282
283 template <typename PatternChar, typename SubjectChar>
BoyerMooreSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)284 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
285 StringSearch<PatternChar, SubjectChar>* search,
286 Vector<const SubjectChar> subject,
287 int start_index) {
288 Vector<const PatternChar> pattern = search->pattern_;
289 int subject_length = subject.length();
290 int pattern_length = pattern.length();
291 // Only preprocess at most kBMMaxShift last characters of pattern.
292 int start = search->start_;
293
294 int* bad_char_occurence = search->bad_char_table();
295 int* good_suffix_shift = search->good_suffix_shift_table();
296
297 PatternChar last_char = pattern[pattern_length - 1];
298 int index = start_index;
299 // Continue search from i.
300 while (index <= subject_length - pattern_length) {
301 int j = pattern_length - 1;
302 int c;
303 while (last_char != (c = subject[index + j])) {
304 int shift =
305 j - CharOccurrence(bad_char_occurence, c);
306 index += shift;
307 if (index > subject_length - pattern_length) {
308 return -1;
309 }
310 }
311 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
312 if (j < 0) {
313 return index;
314 } else if (j < start) {
315 // we have matched more than our tables allow us to be smart about.
316 // Fall back on BMH shift.
317 index += pattern_length - 1
318 - CharOccurrence(bad_char_occurence,
319 static_cast<SubjectChar>(last_char));
320 } else {
321 int gs_shift = good_suffix_shift[j + 1];
322 int bc_occ =
323 CharOccurrence(bad_char_occurence, c);
324 int shift = j - bc_occ;
325 if (gs_shift > shift) {
326 shift = gs_shift;
327 }
328 index += shift;
329 }
330 }
331
332 return -1;
333 }
334
335
336 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreTable()337 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
338 int pattern_length = pattern_.length();
339 const PatternChar* pattern = pattern_.start();
340 // Only look at the last kBMMaxShift characters of pattern (from start_
341 // to pattern_length).
342 int start = start_;
343 int length = pattern_length - start;
344
345 // Biased tables so that we can use pattern indices as table indices,
346 // even if we only cover the part of the pattern from offset start.
347 int* shift_table = good_suffix_shift_table();
348 int* suffix_table = this->suffix_table();
349
350 // Initialize table.
351 for (int i = start; i < pattern_length; i++) {
352 shift_table[i] = length;
353 }
354 shift_table[pattern_length] = 1;
355 suffix_table[pattern_length] = pattern_length + 1;
356
357 if (pattern_length <= start) {
358 return;
359 }
360
361 // Find suffixes.
362 PatternChar last_char = pattern[pattern_length - 1];
363 int suffix = pattern_length + 1;
364 {
365 int i = pattern_length;
366 while (i > start) {
367 PatternChar c = pattern[i - 1];
368 while (suffix <= pattern_length && c != pattern[suffix - 1]) {
369 if (shift_table[suffix] == length) {
370 shift_table[suffix] = suffix - i;
371 }
372 suffix = suffix_table[suffix];
373 }
374 suffix_table[--i] = --suffix;
375 if (suffix == pattern_length) {
376 // No suffix to extend, so we check against last_char only.
377 while ((i > start) && (pattern[i - 1] != last_char)) {
378 if (shift_table[pattern_length] == length) {
379 shift_table[pattern_length] = pattern_length - i;
380 }
381 suffix_table[--i] = pattern_length;
382 }
383 if (i > start) {
384 suffix_table[--i] = --suffix;
385 }
386 }
387 }
388 }
389 // Build shift table using suffixes.
390 if (suffix < pattern_length) {
391 for (int i = start; i <= pattern_length; i++) {
392 if (shift_table[i] == length) {
393 shift_table[i] = suffix - start;
394 }
395 if (i == suffix) {
396 suffix = suffix_table[suffix];
397 }
398 }
399 }
400 }
401
402 //---------------------------------------------------------------------
403 // Boyer-Moore-Horspool string search.
404 //---------------------------------------------------------------------
405
406 template <typename PatternChar, typename SubjectChar>
BoyerMooreHorspoolSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)407 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
408 StringSearch<PatternChar, SubjectChar>* search,
409 Vector<const SubjectChar> subject,
410 int start_index) {
411 Vector<const PatternChar> pattern = search->pattern_;
412 int subject_length = subject.length();
413 int pattern_length = pattern.length();
414 int* char_occurrences = search->bad_char_table();
415 int badness = -pattern_length;
416
417 // How bad we are doing without a good-suffix table.
418 PatternChar last_char = pattern[pattern_length - 1];
419 int last_char_shift = pattern_length - 1 -
420 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
421 // Perform search
422 int index = start_index; // No matches found prior to this index.
423 while (index <= subject_length - pattern_length) {
424 int j = pattern_length - 1;
425 int subject_char;
426 while (last_char != (subject_char = subject[index + j])) {
427 int bc_occ = CharOccurrence(char_occurrences, subject_char);
428 int shift = j - bc_occ;
429 index += shift;
430 badness += 1 - shift; // at most zero, so badness cannot increase.
431 if (index > subject_length - pattern_length) {
432 return -1;
433 }
434 }
435 j--;
436 while (j >= 0 && pattern[j] == (subject[index + j])) j--;
437 if (j < 0) {
438 return index;
439 } else {
440 index += last_char_shift;
441 // Badness increases by the number of characters we have
442 // checked, and decreases by the number of characters we
443 // can skip by shifting. It's a measure of how we are doing
444 // compared to reading each character exactly once.
445 badness += (pattern_length - j) - last_char_shift;
446 if (badness > 0) {
447 search->PopulateBoyerMooreTable();
448 search->strategy_ = &BoyerMooreSearch;
449 return BoyerMooreSearch(search, subject, index);
450 }
451 }
452 }
453 return -1;
454 }
455
456
457 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreHorspoolTable()458 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
459 int pattern_length = pattern_.length();
460
461 int* bad_char_occurrence = bad_char_table();
462
463 // Only preprocess at most kBMMaxShift last characters of pattern.
464 int start = start_;
465 // Run forwards to populate bad_char_table, so that *last* instance
466 // of character equivalence class is the one registered.
467 // Notice: Doesn't include the last character.
468 int table_size = AlphabetSize();
469 if (start == 0) { // All patterns less than kBMMaxShift in length.
470 memset(bad_char_occurrence,
471 -1,
472 table_size * sizeof(*bad_char_occurrence));
473 } else {
474 for (int i = 0; i < table_size; i++) {
475 bad_char_occurrence[i] = start - 1;
476 }
477 }
478 for (int i = start; i < pattern_length - 1; i++) {
479 PatternChar c = pattern_[i];
480 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
481 bad_char_occurrence[bucket] = i;
482 }
483 }
484
485 //---------------------------------------------------------------------
486 // Linear string search with bailout to BMH.
487 //---------------------------------------------------------------------
488
489 // Simple linear search for short patterns, which bails out if the string
490 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
491 template <typename PatternChar, typename SubjectChar>
InitialSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)492 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
493 StringSearch<PatternChar, SubjectChar>* search,
494 Vector<const SubjectChar> subject,
495 int index) {
496 Vector<const PatternChar> pattern = search->pattern_;
497 int pattern_length = pattern.length();
498 // Badness is a count of how much work we have done. When we have
499 // done enough work we decide it's probably worth switching to a better
500 // algorithm.
501 int badness = -10 - (pattern_length << 2);
502
503 // We know our pattern is at least 2 characters, we cache the first so
504 // the common case of the first character not matching is faster.
505 PatternChar pattern_first_char = pattern[0];
506 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
507 badness++;
508 if (badness <= 0) {
509 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
510 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
511 memchr(subject.start() + i,
512 pattern_first_char,
513 n - i + 1));
514 if (pos == NULL) {
515 return -1;
516 }
517 i = static_cast<int>(pos - subject.start());
518 } else {
519 if (subject[i] != pattern_first_char) continue;
520 }
521 int j = 1;
522 do {
523 if (pattern[j] != subject[i + j]) {
524 break;
525 }
526 j++;
527 } while (j < pattern_length);
528 if (j == pattern_length) {
529 return i;
530 }
531 badness += j;
532 } else {
533 search->PopulateBoyerMooreHorspoolTable();
534 search->strategy_ = &BoyerMooreHorspoolSearch;
535 return BoyerMooreHorspoolSearch(search, subject, i);
536 }
537 }
538 return -1;
539 }
540
541
542 // Perform a a single stand-alone search.
543 // If searching multiple times for the same pattern, a search
544 // object should be constructed once and the Search function then called
545 // for each search.
546 template <typename SubjectChar, typename PatternChar>
SearchString(Isolate * isolate,Vector<const SubjectChar> subject,Vector<const PatternChar> pattern,int start_index)547 int SearchString(Isolate* isolate,
548 Vector<const SubjectChar> subject,
549 Vector<const PatternChar> pattern,
550 int start_index) {
551 StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
552 return search.Search(subject, start_index);
553 }
554
555 }} // namespace v8::internal
556
557 #endif // V8_STRING_SEARCH_H_
558