1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_STRING_SEARCH_H_
6 #define V8_STRING_SEARCH_H_
7 
8 namespace v8 {
9 namespace internal {
10 
11 
12 //---------------------------------------------------------------------
13 // String Search object.
14 //---------------------------------------------------------------------
15 
16 // Class holding constants and methods that apply to all string search variants,
17 // independently of subject and pattern char size.
18 class StringSearchBase {
19  protected:
20   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
21   // limit, we can fix the size of tables. For a needle longer than this limit,
22   // search will not be optimal, since we only build tables for a suffix
23   // of the string, but it is a safe approximation.
24   static const int kBMMaxShift = Isolate::kBMMaxShift;
25 
26   // Reduce alphabet to this size.
27   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
28   // proportional to the input alphabet. We reduce the alphabet size by
29   // equating input characters modulo a smaller alphabet size. This gives
30   // a potentially less efficient searching, but is a safe approximation.
31   // For needles using only characters in the same Unicode 256-code point page,
32   // there is no search speed degradation.
33   static const int kLatin1AlphabetSize = 256;
34   static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
35 
36   // Bad-char shift table stored in the state. It's length is the alphabet size.
37   // For patterns below this length, the skip length of Boyer-Moore is too short
38   // to compensate for the algorithmic overhead compared to simple brute force.
39   static const int kBMMinPatternLength = 7;
40 
IsOneByteString(Vector<const uint8_t> string)41   static inline bool IsOneByteString(Vector<const uint8_t> string) {
42     return true;
43   }
44 
IsOneByteString(Vector<const uc16> string)45   static inline bool IsOneByteString(Vector<const uc16> string) {
46     return String::IsOneByte(string.start(), string.length());
47   }
48 
49   friend class Isolate;
50 };
51 
52 
53 template <typename PatternChar, typename SubjectChar>
54 class StringSearch : private StringSearchBase {
55  public:
StringSearch(Isolate * isolate,Vector<const PatternChar> pattern)56   StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
57       : isolate_(isolate),
58         pattern_(pattern),
59         start_(Max(0, pattern.length() - kBMMaxShift)) {
60     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
61       if (!IsOneByteString(pattern_)) {
62         strategy_ = &FailSearch;
63         return;
64       }
65     }
66     int pattern_length = pattern_.length();
67     if (pattern_length < kBMMinPatternLength) {
68       if (pattern_length == 1) {
69         strategy_ = &SingleCharSearch;
70         return;
71       }
72       strategy_ = &LinearSearch;
73       return;
74     }
75     strategy_ = &InitialSearch;
76   }
77 
Search(Vector<const SubjectChar> subject,int index)78   int Search(Vector<const SubjectChar> subject, int index) {
79     return strategy_(this, subject, index);
80   }
81 
AlphabetSize()82   static inline int AlphabetSize() {
83     if (sizeof(PatternChar) == 1) {
84       // Latin1 needle.
85       return kLatin1AlphabetSize;
86     } else {
87       DCHECK(sizeof(PatternChar) == 2);
88       // UC16 needle.
89       return kUC16AlphabetSize;
90     }
91   }
92 
93  private:
94   typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
95       StringSearch<PatternChar, SubjectChar>*,
96       Vector<const SubjectChar>,
97       int);
98 
FailSearch(StringSearch<PatternChar,SubjectChar> *,Vector<const SubjectChar>,int)99   static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
100                         Vector<const SubjectChar>,
101                         int) {
102     return -1;
103   }
104 
105   static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
106                               Vector<const SubjectChar> subject,
107                               int start_index);
108 
109   static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
110                           Vector<const SubjectChar> subject,
111                           int start_index);
112 
113   static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
114                            Vector<const SubjectChar> subject,
115                            int start_index);
116 
117   static int BoyerMooreHorspoolSearch(
118       StringSearch<PatternChar, SubjectChar>* search,
119       Vector<const SubjectChar> subject,
120       int start_index);
121 
122   static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
123                               Vector<const SubjectChar> subject,
124                               int start_index);
125 
126   void PopulateBoyerMooreHorspoolTable();
127 
128   void PopulateBoyerMooreTable();
129 
exceedsOneByte(uint8_t c)130   static inline bool exceedsOneByte(uint8_t c) {
131     return false;
132   }
133 
exceedsOneByte(uint16_t c)134   static inline bool exceedsOneByte(uint16_t c) {
135     return c > String::kMaxOneByteCharCodeU;
136   }
137 
CharOccurrence(int * bad_char_occurrence,SubjectChar char_code)138   static inline int CharOccurrence(int* bad_char_occurrence,
139                                    SubjectChar char_code) {
140     if (sizeof(SubjectChar) == 1) {
141       return bad_char_occurrence[static_cast<int>(char_code)];
142     }
143     if (sizeof(PatternChar) == 1) {
144       if (exceedsOneByte(char_code)) {
145         return -1;
146       }
147       return bad_char_occurrence[static_cast<unsigned int>(char_code)];
148     }
149     // Both pattern and subject are UC16. Reduce character to equivalence class.
150     int equiv_class = char_code % kUC16AlphabetSize;
151     return bad_char_occurrence[equiv_class];
152   }
153 
154   // The following tables are shared by all searches.
155   // TODO(lrn): Introduce a way for a pattern to keep its tables
156   // between searches (e.g., for an Atom RegExp).
157 
158   // Store for the BoyerMoore(Horspool) bad char shift table.
159   // Return a table covering the last kBMMaxShift+1 positions of
160   // pattern.
bad_char_table()161   int* bad_char_table() {
162     return isolate_->bad_char_shift_table();
163   }
164 
165   // Store for the BoyerMoore good suffix shift table.
good_suffix_shift_table()166   int* good_suffix_shift_table() {
167     // Return biased pointer that maps the range  [start_..pattern_.length()
168     // to the kGoodSuffixShiftTable array.
169     return isolate_->good_suffix_shift_table() - start_;
170   }
171 
172   // Table used temporarily while building the BoyerMoore good suffix
173   // shift table.
suffix_table()174   int* suffix_table() {
175     // Return biased pointer that maps the range  [start_..pattern_.length()
176     // to the kSuffixTable array.
177     return isolate_->suffix_table() - start_;
178   }
179 
180   Isolate* isolate_;
181   // The pattern to search for.
182   Vector<const PatternChar> pattern_;
183   // Pointer to implementation of the search.
184   SearchFunction strategy_;
185   // Cache value of Max(0, pattern_length() - kBMMaxShift)
186   int start_;
187 };
188 
189 
190 //---------------------------------------------------------------------
191 // Single Character Pattern Search Strategy
192 //---------------------------------------------------------------------
193 
194 template <typename PatternChar, typename SubjectChar>
SingleCharSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)195 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
196     StringSearch<PatternChar, SubjectChar>* search,
197     Vector<const SubjectChar> subject,
198     int index) {
199   DCHECK_EQ(1, search->pattern_.length());
200   PatternChar pattern_first_char = search->pattern_[0];
201   int i = index;
202   if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
203     const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
204         memchr(subject.start() + i,
205                pattern_first_char,
206                subject.length() - i));
207     if (pos == NULL) return -1;
208     return static_cast<int>(pos - subject.start());
209   } else {
210     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
211       if (exceedsOneByte(pattern_first_char)) {
212         return -1;
213       }
214     }
215     SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
216     int n = subject.length();
217     while (i < n) {
218       if (subject[i++] == search_char) return i - 1;
219     }
220     return -1;
221   }
222 }
223 
224 //---------------------------------------------------------------------
225 // Linear Search Strategy
226 //---------------------------------------------------------------------
227 
228 
229 template <typename PatternChar, typename SubjectChar>
CharCompare(const PatternChar * pattern,const SubjectChar * subject,int length)230 inline bool CharCompare(const PatternChar* pattern,
231                         const SubjectChar* subject,
232                         int length) {
233   DCHECK(length > 0);
234   int pos = 0;
235   do {
236     if (pattern[pos] != subject[pos]) {
237       return false;
238     }
239     pos++;
240   } while (pos < length);
241   return true;
242 }
243 
244 
245 // Simple linear search for short patterns. Never bails out.
246 template <typename PatternChar, typename SubjectChar>
LinearSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)247 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
248     StringSearch<PatternChar, SubjectChar>* search,
249     Vector<const SubjectChar> subject,
250     int index) {
251   Vector<const PatternChar> pattern = search->pattern_;
252   DCHECK(pattern.length() > 1);
253   int pattern_length = pattern.length();
254   PatternChar pattern_first_char = pattern[0];
255   int i = index;
256   int n = subject.length() - pattern_length;
257   while (i <= n) {
258     if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
259       const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
260           memchr(subject.start() + i,
261                  pattern_first_char,
262                  n - i + 1));
263       if (pos == NULL) return -1;
264       i = static_cast<int>(pos - subject.start()) + 1;
265     } else {
266       if (subject[i++] != pattern_first_char) continue;
267     }
268     // Loop extracted to separate function to allow using return to do
269     // a deeper break.
270     if (CharCompare(pattern.start() + 1,
271                     subject.start() + i,
272                     pattern_length - 1)) {
273       return i - 1;
274     }
275   }
276   return -1;
277 }
278 
279 //---------------------------------------------------------------------
280 // Boyer-Moore string search
281 //---------------------------------------------------------------------
282 
283 template <typename PatternChar, typename SubjectChar>
BoyerMooreSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)284 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
285     StringSearch<PatternChar, SubjectChar>* search,
286     Vector<const SubjectChar> subject,
287     int start_index) {
288   Vector<const PatternChar> pattern = search->pattern_;
289   int subject_length = subject.length();
290   int pattern_length = pattern.length();
291   // Only preprocess at most kBMMaxShift last characters of pattern.
292   int start = search->start_;
293 
294   int* bad_char_occurence = search->bad_char_table();
295   int* good_suffix_shift = search->good_suffix_shift_table();
296 
297   PatternChar last_char = pattern[pattern_length - 1];
298   int index = start_index;
299   // Continue search from i.
300   while (index <= subject_length - pattern_length) {
301     int j = pattern_length - 1;
302     int c;
303     while (last_char != (c = subject[index + j])) {
304       int shift =
305           j - CharOccurrence(bad_char_occurence, c);
306       index += shift;
307       if (index > subject_length - pattern_length) {
308         return -1;
309       }
310     }
311     while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
312     if (j < 0) {
313       return index;
314     } else if (j < start) {
315       // we have matched more than our tables allow us to be smart about.
316       // Fall back on BMH shift.
317       index += pattern_length - 1
318           - CharOccurrence(bad_char_occurence,
319                            static_cast<SubjectChar>(last_char));
320     } else {
321       int gs_shift = good_suffix_shift[j + 1];
322       int bc_occ =
323           CharOccurrence(bad_char_occurence, c);
324       int shift = j - bc_occ;
325       if (gs_shift > shift) {
326         shift = gs_shift;
327       }
328       index += shift;
329     }
330   }
331 
332   return -1;
333 }
334 
335 
336 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreTable()337 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
338   int pattern_length = pattern_.length();
339   const PatternChar* pattern = pattern_.start();
340   // Only look at the last kBMMaxShift characters of pattern (from start_
341   // to pattern_length).
342   int start = start_;
343   int length = pattern_length - start;
344 
345   // Biased tables so that we can use pattern indices as table indices,
346   // even if we only cover the part of the pattern from offset start.
347   int* shift_table = good_suffix_shift_table();
348   int* suffix_table = this->suffix_table();
349 
350   // Initialize table.
351   for (int i = start; i < pattern_length; i++) {
352     shift_table[i] = length;
353   }
354   shift_table[pattern_length] = 1;
355   suffix_table[pattern_length] = pattern_length + 1;
356 
357   if (pattern_length <= start) {
358     return;
359   }
360 
361   // Find suffixes.
362   PatternChar last_char = pattern[pattern_length - 1];
363   int suffix = pattern_length + 1;
364   {
365     int i = pattern_length;
366     while (i > start) {
367       PatternChar c = pattern[i - 1];
368       while (suffix <= pattern_length && c != pattern[suffix - 1]) {
369         if (shift_table[suffix] == length) {
370           shift_table[suffix] = suffix - i;
371         }
372         suffix = suffix_table[suffix];
373       }
374       suffix_table[--i] = --suffix;
375       if (suffix == pattern_length) {
376         // No suffix to extend, so we check against last_char only.
377         while ((i > start) && (pattern[i - 1] != last_char)) {
378           if (shift_table[pattern_length] == length) {
379             shift_table[pattern_length] = pattern_length - i;
380           }
381           suffix_table[--i] = pattern_length;
382         }
383         if (i > start) {
384           suffix_table[--i] = --suffix;
385         }
386       }
387     }
388   }
389   // Build shift table using suffixes.
390   if (suffix < pattern_length) {
391     for (int i = start; i <= pattern_length; i++) {
392       if (shift_table[i] == length) {
393         shift_table[i] = suffix - start;
394       }
395       if (i == suffix) {
396         suffix = suffix_table[suffix];
397       }
398     }
399   }
400 }
401 
402 //---------------------------------------------------------------------
403 // Boyer-Moore-Horspool string search.
404 //---------------------------------------------------------------------
405 
406 template <typename PatternChar, typename SubjectChar>
BoyerMooreHorspoolSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)407 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
408     StringSearch<PatternChar, SubjectChar>* search,
409     Vector<const SubjectChar> subject,
410     int start_index) {
411   Vector<const PatternChar> pattern = search->pattern_;
412   int subject_length = subject.length();
413   int pattern_length = pattern.length();
414   int* char_occurrences = search->bad_char_table();
415   int badness = -pattern_length;
416 
417   // How bad we are doing without a good-suffix table.
418   PatternChar last_char = pattern[pattern_length - 1];
419   int last_char_shift = pattern_length - 1 -
420       CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
421   // Perform search
422   int index = start_index;  // No matches found prior to this index.
423   while (index <= subject_length - pattern_length) {
424     int j = pattern_length - 1;
425     int subject_char;
426     while (last_char != (subject_char = subject[index + j])) {
427       int bc_occ = CharOccurrence(char_occurrences, subject_char);
428       int shift = j - bc_occ;
429       index += shift;
430       badness += 1 - shift;  // at most zero, so badness cannot increase.
431       if (index > subject_length - pattern_length) {
432         return -1;
433       }
434     }
435     j--;
436     while (j >= 0 && pattern[j] == (subject[index + j])) j--;
437     if (j < 0) {
438       return index;
439     } else {
440       index += last_char_shift;
441       // Badness increases by the number of characters we have
442       // checked, and decreases by the number of characters we
443       // can skip by shifting. It's a measure of how we are doing
444       // compared to reading each character exactly once.
445       badness += (pattern_length - j) - last_char_shift;
446       if (badness > 0) {
447         search->PopulateBoyerMooreTable();
448         search->strategy_ = &BoyerMooreSearch;
449         return BoyerMooreSearch(search, subject, index);
450       }
451     }
452   }
453   return -1;
454 }
455 
456 
457 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreHorspoolTable()458 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
459   int pattern_length = pattern_.length();
460 
461   int* bad_char_occurrence = bad_char_table();
462 
463   // Only preprocess at most kBMMaxShift last characters of pattern.
464   int start = start_;
465   // Run forwards to populate bad_char_table, so that *last* instance
466   // of character equivalence class is the one registered.
467   // Notice: Doesn't include the last character.
468   int table_size = AlphabetSize();
469   if (start == 0) {  // All patterns less than kBMMaxShift in length.
470     memset(bad_char_occurrence,
471            -1,
472            table_size * sizeof(*bad_char_occurrence));
473   } else {
474     for (int i = 0; i < table_size; i++) {
475       bad_char_occurrence[i] = start - 1;
476     }
477   }
478   for (int i = start; i < pattern_length - 1; i++) {
479     PatternChar c = pattern_[i];
480     int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
481     bad_char_occurrence[bucket] = i;
482   }
483 }
484 
485 //---------------------------------------------------------------------
486 // Linear string search with bailout to BMH.
487 //---------------------------------------------------------------------
488 
489 // Simple linear search for short patterns, which bails out if the string
490 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
491 template <typename PatternChar, typename SubjectChar>
InitialSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)492 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
493     StringSearch<PatternChar, SubjectChar>* search,
494     Vector<const SubjectChar> subject,
495     int index) {
496   Vector<const PatternChar> pattern = search->pattern_;
497   int pattern_length = pattern.length();
498   // Badness is a count of how much work we have done.  When we have
499   // done enough work we decide it's probably worth switching to a better
500   // algorithm.
501   int badness = -10 - (pattern_length << 2);
502 
503   // We know our pattern is at least 2 characters, we cache the first so
504   // the common case of the first character not matching is faster.
505   PatternChar pattern_first_char = pattern[0];
506   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
507     badness++;
508     if (badness <= 0) {
509       if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
510         const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
511             memchr(subject.start() + i,
512                    pattern_first_char,
513                    n - i + 1));
514         if (pos == NULL) {
515           return -1;
516         }
517         i = static_cast<int>(pos - subject.start());
518       } else {
519         if (subject[i] != pattern_first_char) continue;
520       }
521       int j = 1;
522       do {
523         if (pattern[j] != subject[i + j]) {
524           break;
525         }
526         j++;
527       } while (j < pattern_length);
528       if (j == pattern_length) {
529         return i;
530       }
531       badness += j;
532     } else {
533       search->PopulateBoyerMooreHorspoolTable();
534       search->strategy_ = &BoyerMooreHorspoolSearch;
535       return BoyerMooreHorspoolSearch(search, subject, i);
536     }
537   }
538   return -1;
539 }
540 
541 
542 // Perform a a single stand-alone search.
543 // If searching multiple times for the same pattern, a search
544 // object should be constructed once and the Search function then called
545 // for each search.
546 template <typename SubjectChar, typename PatternChar>
SearchString(Isolate * isolate,Vector<const SubjectChar> subject,Vector<const PatternChar> pattern,int start_index)547 int SearchString(Isolate* isolate,
548                  Vector<const SubjectChar> subject,
549                  Vector<const PatternChar> pattern,
550                  int start_index) {
551   StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
552   return search.Search(subject, start_index);
553 }
554 
555 }}  // namespace v8::internal
556 
557 #endif  // V8_STRING_SEARCH_H_
558