V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
string-search.h
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_STRING_SEARCH_H_
6 #define V8_STRING_SEARCH_H_
7 
8 #include "src/isolate.h"
9 #include "src/vector.h"
10 
11 namespace v8 {
12 namespace internal {
13 
14 
15 //---------------------------------------------------------------------
16 // String Search object.
17 //---------------------------------------------------------------------
18 
19 // Class holding constants and methods that apply to all string search variants,
20 // independently of subject and pattern char size.
22  protected:
23  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
24  // limit, we can fix the size of tables. For a needle longer than this limit,
25  // search will not be optimal, since we only build tables for a suffix
26  // of the string, but it is a safe approximation.
27  static const int kBMMaxShift = Isolate::kBMMaxShift;
28 
29  // Reduce alphabet to this size.
30  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
31  // proportional to the input alphabet. We reduce the alphabet size by
32  // equating input characters modulo a smaller alphabet size. This gives
33  // a potentially less efficient searching, but is a safe approximation.
34  // For needles using only characters in the same Unicode 256-code point page,
35  // there is no search speed degradation.
36  static const int kLatin1AlphabetSize = 256;
37  static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
38 
39  // Bad-char shift table stored in the state. It's length is the alphabet size.
40  // For patterns below this length, the skip length of Boyer-Moore is too short
41  // to compensate for the algorithmic overhead compared to simple brute force.
42  static const int kBMMinPatternLength = 7;
43 
44  static inline bool IsOneByteString(Vector<const uint8_t> string) {
45  return true;
46  }
47 
48  static inline bool IsOneByteString(Vector<const uc16> string) {
49  return String::IsOneByte(string.start(), string.length());
50  }
51 
52  friend class Isolate;
53 };
54 
55 
56 template <typename PatternChar, typename SubjectChar>
57 class StringSearch : private StringSearchBase {
58  public:
60  : isolate_(isolate),
61  pattern_(pattern),
62  start_(Max(0, pattern.length() - kBMMaxShift)) {
63  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
64  if (!IsOneByteString(pattern_)) {
65  strategy_ = &FailSearch;
66  return;
67  }
68  }
69  int pattern_length = pattern_.length();
70  if (pattern_length < kBMMinPatternLength) {
71  if (pattern_length == 1) {
72  strategy_ = &SingleCharSearch;
73  return;
74  }
75  strategy_ = &LinearSearch;
76  return;
77  }
78  strategy_ = &InitialSearch;
79  }
80 
81  int Search(Vector<const SubjectChar> subject, int index) {
82  return strategy_(this, subject, index);
83  }
84 
85  static inline int AlphabetSize() {
86  if (sizeof(PatternChar) == 1) {
87  // Latin1 needle.
88  return kLatin1AlphabetSize;
89  } else {
90  DCHECK_EQ(sizeof(PatternChar), 2);
91  // UC16 needle.
92  return kUC16AlphabetSize;
93  }
94  }
95 
96  private:
97  typedef int (*SearchFunction)( // NOLINT - it's not a cast!
100  int);
101 
102  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
104  int) {
105  return -1;
106  }
107 
108  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
110  int start_index);
111 
112  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
114  int start_index);
115 
116  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
118  int start_index);
119 
120  static int BoyerMooreHorspoolSearch(
123  int start_index);
124 
125  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
127  int start_index);
128 
129  void PopulateBoyerMooreHorspoolTable();
130 
131  void PopulateBoyerMooreTable();
132 
133  static inline bool exceedsOneByte(uint8_t c) {
134  return false;
135  }
136 
137  static inline bool exceedsOneByte(uint16_t c) {
138  return c > String::kMaxOneByteCharCodeU;
139  }
140 
141  static inline int CharOccurrence(int* bad_char_occurrence,
142  SubjectChar char_code) {
143  if (sizeof(SubjectChar) == 1) {
144  return bad_char_occurrence[static_cast<int>(char_code)];
145  }
146  if (sizeof(PatternChar) == 1) {
147  if (exceedsOneByte(char_code)) {
148  return -1;
149  }
150  return bad_char_occurrence[static_cast<unsigned int>(char_code)];
151  }
152  // Both pattern and subject are UC16. Reduce character to equivalence class.
153  int equiv_class = char_code % kUC16AlphabetSize;
154  return bad_char_occurrence[equiv_class];
155  }
156 
157  // The following tables are shared by all searches.
158  // TODO(lrn): Introduce a way for a pattern to keep its tables
159  // between searches (e.g., for an Atom RegExp).
160 
161  // Store for the BoyerMoore(Horspool) bad char shift table.
162  // Return a table covering the last kBMMaxShift+1 positions of
163  // pattern.
164  int* bad_char_table() {
165  return isolate_->bad_char_shift_table();
166  }
167 
168  // Store for the BoyerMoore good suffix shift table.
169  int* good_suffix_shift_table() {
170  // Return biased pointer that maps the range [start_..pattern_.length()
171  // to the kGoodSuffixShiftTable array.
172  return isolate_->good_suffix_shift_table() - start_;
173  }
174 
175  // Table used temporarily while building the BoyerMoore good suffix
176  // shift table.
177  int* suffix_table() {
178  // Return biased pointer that maps the range [start_..pattern_.length()
179  // to the kSuffixTable array.
180  return isolate_->suffix_table() - start_;
181  }
182 
183  Isolate* isolate_;
184  // The pattern to search for.
185  Vector<const PatternChar> pattern_;
186  // Pointer to implementation of the search.
187  SearchFunction strategy_;
188  // Cache value of Max(0, pattern_length() - kBMMaxShift)
189  int start_;
190 };
191 
192 
193 template <typename T, typename U>
194 inline T AlignDown(T value, U alignment) {
195  return reinterpret_cast<T>(
196  (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
197 }
198 
199 
200 inline uint8_t GetHighestValueByte(uc16 character) {
201  return Max(static_cast<uint8_t>(character & 0xFF),
202  static_cast<uint8_t>(character >> 8));
203 }
204 
205 
206 inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
207 
208 
209 template <typename PatternChar, typename SubjectChar>
210 inline int FindFirstCharacter(Vector<const PatternChar> pattern,
211  Vector<const SubjectChar> subject, int index) {
212  const PatternChar pattern_first_char = pattern[0];
213  const int max_n = (subject.length() - pattern.length() + 1);
214 
215  const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
216  const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
217  int pos = index;
218  do {
219  DCHECK_GE(max_n - pos, 0);
220  const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
221  memchr(subject.start() + pos, search_byte,
222  (max_n - pos) * sizeof(SubjectChar)));
223  if (char_pos == nullptr) return -1;
224  char_pos = AlignDown(char_pos, sizeof(SubjectChar));
225  pos = static_cast<int>(char_pos - subject.start());
226  if (subject[pos] == search_char) return pos;
227  } while (++pos < max_n);
228 
229  return -1;
230 }
231 
232 
233 //---------------------------------------------------------------------
234 // Single Character Pattern Search Strategy
235 //---------------------------------------------------------------------
236 
237 template <typename PatternChar, typename SubjectChar>
238 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
239  StringSearch<PatternChar, SubjectChar>* search,
240  Vector<const SubjectChar> subject,
241  int index) {
242  DCHECK_EQ(1, search->pattern_.length());
243  PatternChar pattern_first_char = search->pattern_[0];
244  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
245  if (exceedsOneByte(pattern_first_char)) {
246  return -1;
247  }
248  }
249  return FindFirstCharacter(search->pattern_, subject, index);
250 }
251 
252 //---------------------------------------------------------------------
253 // Linear Search Strategy
254 //---------------------------------------------------------------------
255 
256 
257 template <typename PatternChar, typename SubjectChar>
258 inline bool CharCompare(const PatternChar* pattern,
259  const SubjectChar* subject,
260  int length) {
261  DCHECK_GT(length, 0);
262  int pos = 0;
263  do {
264  if (pattern[pos] != subject[pos]) {
265  return false;
266  }
267  pos++;
268  } while (pos < length);
269  return true;
270 }
271 
272 
273 // Simple linear search for short patterns. Never bails out.
274 template <typename PatternChar, typename SubjectChar>
275 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
276  StringSearch<PatternChar, SubjectChar>* search,
277  Vector<const SubjectChar> subject,
278  int index) {
279  Vector<const PatternChar> pattern = search->pattern_;
280  DCHECK_GT(pattern.length(), 1);
281  int pattern_length = pattern.length();
282  int i = index;
283  int n = subject.length() - pattern_length;
284  while (i <= n) {
285  i = FindFirstCharacter(pattern, subject, i);
286  if (i == -1) return -1;
287  DCHECK_LE(i, n);
288  i++;
289  // Loop extracted to separate function to allow using return to do
290  // a deeper break.
291  if (CharCompare(pattern.start() + 1,
292  subject.start() + i,
293  pattern_length - 1)) {
294  return i - 1;
295  }
296  }
297  return -1;
298 }
299 
300 //---------------------------------------------------------------------
301 // Boyer-Moore string search
302 //---------------------------------------------------------------------
303 
304 template <typename PatternChar, typename SubjectChar>
305 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
306  StringSearch<PatternChar, SubjectChar>* search,
307  Vector<const SubjectChar> subject,
308  int start_index) {
309  Vector<const PatternChar> pattern = search->pattern_;
310  int subject_length = subject.length();
311  int pattern_length = pattern.length();
312  // Only preprocess at most kBMMaxShift last characters of pattern.
313  int start = search->start_;
314 
315  int* bad_char_occurence = search->bad_char_table();
316  int* good_suffix_shift = search->good_suffix_shift_table();
317 
318  PatternChar last_char = pattern[pattern_length - 1];
319  int index = start_index;
320  // Continue search from i.
321  while (index <= subject_length - pattern_length) {
322  int j = pattern_length - 1;
323  int c;
324  while (last_char != (c = subject[index + j])) {
325  int shift =
326  j - CharOccurrence(bad_char_occurence, c);
327  index += shift;
328  if (index > subject_length - pattern_length) {
329  return -1;
330  }
331  }
332  while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
333  if (j < 0) {
334  return index;
335  } else if (j < start) {
336  // we have matched more than our tables allow us to be smart about.
337  // Fall back on BMH shift.
338  index += pattern_length - 1
339  - CharOccurrence(bad_char_occurence,
340  static_cast<SubjectChar>(last_char));
341  } else {
342  int gs_shift = good_suffix_shift[j + 1];
343  int bc_occ =
344  CharOccurrence(bad_char_occurence, c);
345  int shift = j - bc_occ;
346  if (gs_shift > shift) {
347  shift = gs_shift;
348  }
349  index += shift;
350  }
351  }
352 
353  return -1;
354 }
355 
356 
357 template <typename PatternChar, typename SubjectChar>
358 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
359  int pattern_length = pattern_.length();
360  const PatternChar* pattern = pattern_.start();
361  // Only look at the last kBMMaxShift characters of pattern (from start_
362  // to pattern_length).
363  int start = start_;
364  int length = pattern_length - start;
365 
366  // Biased tables so that we can use pattern indices as table indices,
367  // even if we only cover the part of the pattern from offset start.
368  int* shift_table = good_suffix_shift_table();
369  int* suffix_table = this->suffix_table();
370 
371  // Initialize table.
372  for (int i = start; i < pattern_length; i++) {
373  shift_table[i] = length;
374  }
375  shift_table[pattern_length] = 1;
376  suffix_table[pattern_length] = pattern_length + 1;
377 
378  if (pattern_length <= start) {
379  return;
380  }
381 
382  // Find suffixes.
383  PatternChar last_char = pattern[pattern_length - 1];
384  int suffix = pattern_length + 1;
385  {
386  int i = pattern_length;
387  while (i > start) {
388  PatternChar c = pattern[i - 1];
389  while (suffix <= pattern_length && c != pattern[suffix - 1]) {
390  if (shift_table[suffix] == length) {
391  shift_table[suffix] = suffix - i;
392  }
393  suffix = suffix_table[suffix];
394  }
395  suffix_table[--i] = --suffix;
396  if (suffix == pattern_length) {
397  // No suffix to extend, so we check against last_char only.
398  while ((i > start) && (pattern[i - 1] != last_char)) {
399  if (shift_table[pattern_length] == length) {
400  shift_table[pattern_length] = pattern_length - i;
401  }
402  suffix_table[--i] = pattern_length;
403  }
404  if (i > start) {
405  suffix_table[--i] = --suffix;
406  }
407  }
408  }
409  }
410  // Build shift table using suffixes.
411  if (suffix < pattern_length) {
412  for (int i = start; i <= pattern_length; i++) {
413  if (shift_table[i] == length) {
414  shift_table[i] = suffix - start;
415  }
416  if (i == suffix) {
417  suffix = suffix_table[suffix];
418  }
419  }
420  }
421 }
422 
423 //---------------------------------------------------------------------
424 // Boyer-Moore-Horspool string search.
425 //---------------------------------------------------------------------
426 
427 template <typename PatternChar, typename SubjectChar>
428 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
429  StringSearch<PatternChar, SubjectChar>* search,
430  Vector<const SubjectChar> subject,
431  int start_index) {
432  Vector<const PatternChar> pattern = search->pattern_;
433  int subject_length = subject.length();
434  int pattern_length = pattern.length();
435  int* char_occurrences = search->bad_char_table();
436  int badness = -pattern_length;
437 
438  // How bad we are doing without a good-suffix table.
439  PatternChar last_char = pattern[pattern_length - 1];
440  int last_char_shift = pattern_length - 1 -
441  CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
442  // Perform search
443  int index = start_index; // No matches found prior to this index.
444  while (index <= subject_length - pattern_length) {
445  int j = pattern_length - 1;
446  int subject_char;
447  while (last_char != (subject_char = subject[index + j])) {
448  int bc_occ = CharOccurrence(char_occurrences, subject_char);
449  int shift = j - bc_occ;
450  index += shift;
451  badness += 1 - shift; // at most zero, so badness cannot increase.
452  if (index > subject_length - pattern_length) {
453  return -1;
454  }
455  }
456  j--;
457  while (j >= 0 && pattern[j] == (subject[index + j])) j--;
458  if (j < 0) {
459  return index;
460  } else {
461  index += last_char_shift;
462  // Badness increases by the number of characters we have
463  // checked, and decreases by the number of characters we
464  // can skip by shifting. It's a measure of how we are doing
465  // compared to reading each character exactly once.
466  badness += (pattern_length - j) - last_char_shift;
467  if (badness > 0) {
468  search->PopulateBoyerMooreTable();
469  search->strategy_ = &BoyerMooreSearch;
470  return BoyerMooreSearch(search, subject, index);
471  }
472  }
473  }
474  return -1;
475 }
476 
477 
478 template <typename PatternChar, typename SubjectChar>
479 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
480  int pattern_length = pattern_.length();
481 
482  int* bad_char_occurrence = bad_char_table();
483 
484  // Only preprocess at most kBMMaxShift last characters of pattern.
485  int start = start_;
486  // Run forwards to populate bad_char_table, so that *last* instance
487  // of character equivalence class is the one registered.
488  // Notice: Doesn't include the last character.
489  int table_size = AlphabetSize();
490  if (start == 0) { // All patterns less than kBMMaxShift in length.
491  memset(bad_char_occurrence,
492  -1,
493  table_size * sizeof(*bad_char_occurrence));
494  } else {
495  for (int i = 0; i < table_size; i++) {
496  bad_char_occurrence[i] = start - 1;
497  }
498  }
499  for (int i = start; i < pattern_length - 1; i++) {
500  PatternChar c = pattern_[i];
501  int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
502  bad_char_occurrence[bucket] = i;
503  }
504 }
505 
506 //---------------------------------------------------------------------
507 // Linear string search with bailout to BMH.
508 //---------------------------------------------------------------------
509 
510 // Simple linear search for short patterns, which bails out if the string
511 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
512 template <typename PatternChar, typename SubjectChar>
513 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
514  StringSearch<PatternChar, SubjectChar>* search,
515  Vector<const SubjectChar> subject,
516  int index) {
517  Vector<const PatternChar> pattern = search->pattern_;
518  int pattern_length = pattern.length();
519  // Badness is a count of how much work we have done. When we have
520  // done enough work we decide it's probably worth switching to a better
521  // algorithm.
522  int badness = -10 - (pattern_length << 2);
523 
524  // We know our pattern is at least 2 characters, we cache the first so
525  // the common case of the first character not matching is faster.
526  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
527  badness++;
528  if (badness <= 0) {
529  i = FindFirstCharacter(pattern, subject, i);
530  if (i == -1) return -1;
531  DCHECK_LE(i, n);
532  int j = 1;
533  do {
534  if (pattern[j] != subject[i + j]) {
535  break;
536  }
537  j++;
538  } while (j < pattern_length);
539  if (j == pattern_length) {
540  return i;
541  }
542  badness += j;
543  } else {
544  search->PopulateBoyerMooreHorspoolTable();
545  search->strategy_ = &BoyerMooreHorspoolSearch;
546  return BoyerMooreHorspoolSearch(search, subject, i);
547  }
548  }
549  return -1;
550 }
551 
552 
553 // Perform a a single stand-alone search.
554 // If searching multiple times for the same pattern, a search
555 // object should be constructed once and the Search function then called
556 // for each search.
557 template <typename SubjectChar, typename PatternChar>
558 int SearchString(Isolate* isolate,
559  Vector<const SubjectChar> subject,
560  Vector<const PatternChar> pattern,
561  int start_index) {
562  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
563  return search.Search(subject, start_index);
564 }
565 
566 // A wrapper function around SearchString that wraps raw pointers to the subject
567 // and pattern as vectors before calling SearchString. Used from the
568 // StringIndexOf builtin.
569 template <typename SubjectChar, typename PatternChar>
570 intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
571  int subject_length, const PatternChar* pattern_ptr,
572  int pattern_length, int start_index) {
573  DisallowHeapAllocation no_gc;
574  Vector<const SubjectChar> subject(subject_ptr, subject_length);
575  Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
576  return SearchString(isolate, subject, pattern, start_index);
577 }
578 
579 } // namespace internal
580 } // namespace v8
581 
582 #endif // V8_STRING_SEARCH_H_
Definition: libplatform.h:13