V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
jsregexp.cc
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/jsregexp.h"
6 
7 #include <memory>
8 #include <vector>
9 
10 #include "src/base/platform/platform.h"
11 #include "src/code-tracer.h"
12 #include "src/compilation-cache.h"
13 #include "src/elements.h"
14 #include "src/execution.h"
15 #include "src/heap/factory.h"
16 #include "src/isolate-inl.h"
17 #include "src/message-template.h"
18 #include "src/ostreams.h"
19 #include "src/regexp/interpreter-irregexp.h"
20 #include "src/regexp/jsregexp-inl.h"
21 #include "src/regexp/regexp-macro-assembler-irregexp.h"
22 #include "src/regexp/regexp-macro-assembler-tracer.h"
23 #include "src/regexp/regexp-macro-assembler.h"
24 #include "src/regexp/regexp-parser.h"
25 #include "src/regexp/regexp-stack.h"
26 #include "src/runtime/runtime.h"
27 #include "src/splay-tree-inl.h"
28 #include "src/string-search.h"
29 #include "src/unicode-decoder.h"
30 #include "src/unicode-inl.h"
31 
32 #ifdef V8_INTL_SUPPORT
33 #include "unicode/uniset.h"
34 #include "unicode/utypes.h"
35 #endif // V8_INTL_SUPPORT
36 
37 #ifndef V8_INTERPRETED_REGEXP
38 #if V8_TARGET_ARCH_IA32
39 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
40 #elif V8_TARGET_ARCH_X64
41 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
42 #elif V8_TARGET_ARCH_ARM64
43 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
44 #elif V8_TARGET_ARCH_ARM
45 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
46 #elif V8_TARGET_ARCH_PPC
47 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
48 #elif V8_TARGET_ARCH_S390
49 #include "src/regexp/s390/regexp-macro-assembler-s390.h"
50 #elif V8_TARGET_ARCH_MIPS
51 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
52 #elif V8_TARGET_ARCH_MIPS64
53 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
54 #else
55 #error Unsupported target architecture.
56 #endif
57 #endif
58 
59 
60 namespace v8 {
61 namespace internal {
62 
63 V8_WARN_UNUSED_RESULT
64 static inline MaybeHandle<Object> ThrowRegExpException(
65  Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
66  Handle<String> error_text) {
67  THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
68  pattern, error_text),
69  Object);
70 }
71 
72 inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
73  Handle<String> error_text) {
74  USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
75  error_text));
76 }
77 
78 
79 ContainedInLattice AddRange(ContainedInLattice containment,
80  const int* ranges,
81  int ranges_length,
82  Interval new_range) {
83  DCHECK_EQ(1, ranges_length & 1);
84  DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
85  if (containment == kLatticeUnknown) return containment;
86  bool inside = false;
87  int last = 0;
88  for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
89  // Consider the range from last to ranges[i].
90  // We haven't got to the new range yet.
91  if (ranges[i] <= new_range.from()) continue;
92  // New range is wholly inside last-ranges[i]. Note that new_range.to() is
93  // inclusive, but the values in ranges are not.
94  if (last <= new_range.from() && new_range.to() < ranges[i]) {
95  return Combine(containment, inside ? kLatticeIn : kLatticeOut);
96  }
97  return kLatticeUnknown;
98  }
99  return containment;
100 }
101 
102 // More makes code generation slower, less makes V8 benchmark score lower.
103 const int kMaxLookaheadForBoyerMoore = 8;
104 // In a 3-character pattern you can maximally step forwards 3 characters
105 // at a time, which is not always enough to pay for the extra logic.
106 const int kPatternTooShortForBoyerMoore = 2;
107 
108 // Identifies the sort of regexps where the regexp engine is faster
109 // than the code used for atom matches.
110 static bool HasFewDifferentCharacters(Handle<String> pattern) {
111  int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
112  if (length <= kPatternTooShortForBoyerMoore) return false;
113  const int kMod = 128;
114  bool character_found[kMod];
115  int different = 0;
116  memset(&character_found[0], 0, sizeof(character_found));
117  for (int i = 0; i < length; i++) {
118  int ch = (pattern->Get(i) & (kMod - 1));
119  if (!character_found[ch]) {
120  character_found[ch] = true;
121  different++;
122  // We declare a regexp low-alphabet if it has at least 3 times as many
123  // characters as it has different characters.
124  if (different * 3 > length) return false;
125  }
126  }
127  return true;
128 }
129 
130 // Generic RegExp methods. Dispatches to implementation specific methods.
131 
132 MaybeHandle<Object> RegExpImpl::Compile(Isolate* isolate, Handle<JSRegExp> re,
133  Handle<String> pattern,
134  JSRegExp::Flags flags) {
135  DCHECK(pattern->IsFlat());
136 
137  Zone zone(isolate->allocator(), ZONE_NAME);
138  CompilationCache* compilation_cache = isolate->compilation_cache();
139  MaybeHandle<FixedArray> maybe_cached =
140  compilation_cache->LookupRegExp(pattern, flags);
141  Handle<FixedArray> cached;
142  if (maybe_cached.ToHandle(&cached)) {
143  re->set_data(*cached);
144  return re;
145  }
146 
147  PostponeInterruptsScope postpone(isolate);
148  RegExpCompileData parse_result;
149  FlatStringReader reader(isolate, pattern);
150  DCHECK(!isolate->has_pending_exception());
151  if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
152  &parse_result)) {
153  // Throw an exception if we fail to parse the pattern.
154  return ThrowRegExpException(isolate, re, pattern, parse_result.error);
155  }
156 
157  bool has_been_compiled = false;
158 
159  if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
160  !HasFewDifferentCharacters(pattern)) {
161  // Parse-tree is a single atom that is equal to the pattern.
162  AtomCompile(isolate, re, pattern, flags, pattern);
163  has_been_compiled = true;
164  } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
165  parse_result.capture_count == 0) {
166  RegExpAtom* atom = parse_result.tree->AsAtom();
167  Vector<const uc16> atom_pattern = atom->data();
168  Handle<String> atom_string;
169  ASSIGN_RETURN_ON_EXCEPTION(
170  isolate, atom_string,
171  isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
172  if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
173  AtomCompile(isolate, re, pattern, flags, atom_string);
174  has_been_compiled = true;
175  }
176  }
177  if (!has_been_compiled) {
178  IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count);
179  }
180  DCHECK(re->data()->IsFixedArray());
181  // Compilation succeeded so the data is set on the regexp
182  // and we can store it in the cache.
183  Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
184  compilation_cache->PutRegExp(pattern, flags, data);
185 
186  return re;
187 }
188 
189 MaybeHandle<Object> RegExpImpl::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
190  Handle<String> subject, int index,
191  Handle<RegExpMatchInfo> last_match_info) {
192  switch (regexp->TypeTag()) {
193  case JSRegExp::ATOM:
194  return AtomExec(isolate, regexp, subject, index, last_match_info);
195  case JSRegExp::IRREGEXP: {
196  return IrregexpExec(isolate, regexp, subject, index, last_match_info);
197  }
198  default:
199  UNREACHABLE();
200  }
201 }
202 
203 
204 // RegExp Atom implementation: Simple string search using indexOf.
205 
206 void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
207  Handle<String> pattern, JSRegExp::Flags flags,
208  Handle<String> match_pattern) {
209  isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags,
210  match_pattern);
211 }
212 
213 static void SetAtomLastCapture(Isolate* isolate,
214  Handle<RegExpMatchInfo> last_match_info,
215  String subject, int from, int to) {
216  SealHandleScope shs(isolate);
217  last_match_info->SetNumberOfCaptureRegisters(2);
218  last_match_info->SetLastSubject(subject);
219  last_match_info->SetLastInput(subject);
220  last_match_info->SetCapture(0, from);
221  last_match_info->SetCapture(1, to);
222 }
223 
224 int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
225  Handle<String> subject, int index, int32_t* output,
226  int output_size) {
227  DCHECK_LE(0, index);
228  DCHECK_LE(index, subject->length());
229 
230  subject = String::Flatten(isolate, subject);
231  DisallowHeapAllocation no_gc; // ensure vectors stay valid
232 
233  String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
234  int needle_len = needle->length();
235  DCHECK(needle->IsFlat());
236  DCHECK_LT(0, needle_len);
237 
238  if (index + needle_len > subject->length()) {
239  return RegExpImpl::RE_FAILURE;
240  }
241 
242  for (int i = 0; i < output_size; i += 2) {
243  String::FlatContent needle_content = needle->GetFlatContent();
244  String::FlatContent subject_content = subject->GetFlatContent();
245  DCHECK(needle_content.IsFlat());
246  DCHECK(subject_content.IsFlat());
247  // dispatch on type of strings
248  index =
249  (needle_content.IsOneByte()
250  ? (subject_content.IsOneByte()
251  ? SearchString(isolate, subject_content.ToOneByteVector(),
252  needle_content.ToOneByteVector(), index)
253  : SearchString(isolate, subject_content.ToUC16Vector(),
254  needle_content.ToOneByteVector(), index))
255  : (subject_content.IsOneByte()
256  ? SearchString(isolate, subject_content.ToOneByteVector(),
257  needle_content.ToUC16Vector(), index)
258  : SearchString(isolate, subject_content.ToUC16Vector(),
259  needle_content.ToUC16Vector(), index)));
260  if (index == -1) {
261  return i / 2; // Return number of matches.
262  } else {
263  output[i] = index;
264  output[i+1] = index + needle_len;
265  index += needle_len;
266  }
267  }
268  return output_size / 2;
269 }
270 
271 Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
272  Handle<String> subject, int index,
273  Handle<RegExpMatchInfo> last_match_info) {
274  static const int kNumRegisters = 2;
275  STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
276  int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
277 
278  int res =
279  AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
280 
281  if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
282 
283  DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
284  SealHandleScope shs(isolate);
285  SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
286  output_registers[1]);
287  return last_match_info;
288 }
289 
290 
291 // Irregexp implementation.
292 
293 // Ensures that the regexp object contains a compiled version of the
294 // source for either one-byte or two-byte subject strings.
295 // If the compiled version doesn't already exist, it is compiled
296 // from the source pattern.
297 // If compilation fails, an exception is thrown and this function
298 // returns false.
299 bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
300  Handle<String> sample_subject,
301  bool is_one_byte) {
302  Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
303 #ifdef V8_INTERPRETED_REGEXP
304  if (compiled_code->IsByteArray()) return true;
305 #else // V8_INTERPRETED_REGEXP (RegExp native code)
306  if (compiled_code->IsCode()) return true;
307 #endif
308  return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
309 }
310 
311 bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
312  Handle<String> sample_subject,
313  bool is_one_byte) {
314  // Compile the RegExp.
315  Zone zone(isolate->allocator(), ZONE_NAME);
316  PostponeInterruptsScope postpone(isolate);
317 #ifdef DEBUG
318  Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
319  // When arriving here entry can only be a smi representing an uncompiled
320  // regexp.
321  DCHECK(entry->IsSmi());
322  int entry_value = Smi::ToInt(entry);
323  DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
324 #endif
325 
326  JSRegExp::Flags flags = re->GetFlags();
327 
328  Handle<String> pattern(re->Pattern(), isolate);
329  pattern = String::Flatten(isolate, pattern);
330  RegExpCompileData compile_data;
331  FlatStringReader reader(isolate, pattern);
332  if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
333  &compile_data)) {
334  // Throw an exception if we fail to parse the pattern.
335  // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
336  USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
337  return false;
338  }
339  RegExpEngine::CompilationResult result =
340  RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
341  sample_subject, is_one_byte);
342  if (result.error_message != nullptr) {
343  // Unable to compile regexp.
344  if (FLAG_abort_on_stack_or_string_length_overflow &&
345  strncmp(result.error_message, "Stack overflow", 15) == 0) {
346  FATAL("Aborting on stack overflow");
347  }
348  Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
349  CStrVector(result.error_message)).ToHandleChecked();
350  ThrowRegExpException(isolate, re, error_message);
351  return false;
352  }
353 
354  Handle<FixedArray> data =
355  Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
356  data->set(JSRegExp::code_index(is_one_byte), result.code);
357  SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
358  int register_max = IrregexpMaxRegisterCount(*data);
359  if (result.num_registers > register_max) {
360  SetIrregexpMaxRegisterCount(*data, result.num_registers);
361  }
362 
363  return true;
364 }
365 
366 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
367  return Smi::cast(
368  re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
369 }
370 
371 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
372  re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
373 }
374 
375 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray re,
376  Handle<FixedArray> value) {
377  if (value.is_null()) {
378  re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
379  } else {
380  re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
381  }
382 }
383 
384 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
385  return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex));
386 }
387 
388 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray re) {
389  return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex));
390 }
391 
392 ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
393  return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
394 }
395 
396 Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
397  return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
398 }
399 
400 void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
401  Handle<String> pattern,
402  JSRegExp::Flags flags, int capture_count) {
403  // Initialize compiled code entries to null.
404  isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
405  flags, capture_count);
406 }
407 
408 int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
409  Handle<String> subject) {
410  DCHECK(subject->IsFlat());
411 
412  // Check representation of the underlying storage.
413  bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
414  if (!EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) return -1;
415 
416 #ifdef V8_INTERPRETED_REGEXP
417  // Byte-code regexp needs space allocated for all its registers.
418  // The result captures are copied to the start of the registers array
419  // if the match succeeds. This way those registers are not clobbered
420  // when we set the last match info from last successful match.
421  return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
422  (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
423 #else // V8_INTERPRETED_REGEXP
424  // Native regexp only needs room to output captures. Registers are handled
425  // internally.
426  return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
427 #endif // V8_INTERPRETED_REGEXP
428 }
429 
430 int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
431  Handle<String> subject, int index,
432  int32_t* output, int output_size) {
433  Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
434 
435  DCHECK_LE(0, index);
436  DCHECK_LE(index, subject->length());
437  DCHECK(subject->IsFlat());
438 
439  bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
440 
441 #ifndef V8_INTERPRETED_REGEXP
442  DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
443  do {
444  EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
445  Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
446  // The stack is used to allocate registers for the compiled regexp code.
447  // This means that in case of failure, the output registers array is left
448  // untouched and contains the capture results from the previous successful
449  // match. We can use that to set the last match info lazily.
450  NativeRegExpMacroAssembler::Result res =
451  NativeRegExpMacroAssembler::Match(code,
452  subject,
453  output,
454  output_size,
455  index,
456  isolate);
457  if (res != NativeRegExpMacroAssembler::RETRY) {
458  DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
459  isolate->has_pending_exception());
460  STATIC_ASSERT(
461  static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
462  STATIC_ASSERT(
463  static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
464  STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
465  == RE_EXCEPTION);
466  return static_cast<IrregexpResult>(res);
467  }
468  // If result is RETRY, the string has changed representation, and we
469  // must restart from scratch.
470  // In this case, it means we must make sure we are prepared to handle
471  // the, potentially, different subject (the string can switch between
472  // being internal and external, and even between being Latin1 and UC16,
473  // but the characters are always the same).
474  IrregexpPrepare(isolate, regexp, subject);
475  is_one_byte = subject->IsOneByteRepresentationUnderneath();
476  } while (true);
477  UNREACHABLE();
478 #else // V8_INTERPRETED_REGEXP
479 
480  DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
481  // We must have done EnsureCompiledIrregexp, so we can get the number of
482  // registers.
483  int number_of_capture_registers =
484  (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
485  int32_t* raw_output = &output[number_of_capture_registers];
486  // We do not touch the actual capture result registers until we know there
487  // has been a match so that we can use those capture results to set the
488  // last match info.
489  for (int i = number_of_capture_registers - 1; i >= 0; i--) {
490  raw_output[i] = -1;
491  }
492  Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
493  isolate);
494 
495  IrregexpResult result = IrregexpInterpreter::Match(isolate,
496  byte_codes,
497  subject,
498  raw_output,
499  index);
500  if (result == RE_SUCCESS) {
501  // Copy capture results to the start of the registers array.
502  MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
503  }
504  if (result == RE_EXCEPTION) {
505  DCHECK(!isolate->has_pending_exception());
506  isolate->StackOverflow();
507  }
508  return result;
509 #endif // V8_INTERPRETED_REGEXP
510 }
511 
512 MaybeHandle<Object> RegExpImpl::IrregexpExec(
513  Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
514  int previous_index, Handle<RegExpMatchInfo> last_match_info) {
515  DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
516 
517  subject = String::Flatten(isolate, subject);
518 
519  // Prepare space for the return values.
520 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
521  if (FLAG_trace_regexp_bytecodes) {
522  String pattern = regexp->Pattern();
523  PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
524  PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
525  }
526 #endif
527  int required_registers =
528  RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
529  if (required_registers < 0) {
530  // Compiling failed with an exception.
531  DCHECK(isolate->has_pending_exception());
532  return MaybeHandle<Object>();
533  }
534 
535  int32_t* output_registers = nullptr;
536  if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
537  output_registers = NewArray<int32_t>(required_registers);
538  }
539  std::unique_ptr<int32_t[]> auto_release(output_registers);
540  if (output_registers == nullptr) {
541  output_registers = isolate->jsregexp_static_offsets_vector();
542  }
543 
544  int res =
545  RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
546  output_registers, required_registers);
547  if (res == RE_SUCCESS) {
548  int capture_count =
549  IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
550  return SetLastMatchInfo(isolate, last_match_info, subject, capture_count,
551  output_registers);
552  }
553  if (res == RE_EXCEPTION) {
554  DCHECK(isolate->has_pending_exception());
555  return MaybeHandle<Object>();
556  }
557  DCHECK(res == RE_FAILURE);
558  return isolate->factory()->null_value();
559 }
560 
561 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
562  Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
563  Handle<String> subject, int capture_count, int32_t* match) {
564  // This is the only place where match infos can grow. If, after executing the
565  // regexp, RegExpExecStub finds that the match info is too small, it restarts
566  // execution in RegExpImpl::Exec, which finally grows the match info right
567  // here.
568 
569  int capture_register_count = (capture_count + 1) * 2;
570  Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures(
571  isolate, last_match_info, capture_register_count);
572  result->SetNumberOfCaptureRegisters(capture_register_count);
573 
574  if (*result != *last_match_info) {
575  // The match info has been reallocated, update the corresponding reference
576  // on the native context.
577  if (*last_match_info == *isolate->regexp_last_match_info()) {
578  isolate->native_context()->set_regexp_last_match_info(*result);
579  } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
580  isolate->native_context()->set_regexp_internal_match_info(*result);
581  }
582  }
583 
584  DisallowHeapAllocation no_allocation;
585  if (match != nullptr) {
586  for (int i = 0; i < capture_register_count; i += 2) {
587  result->SetCapture(i, match[i]);
588  result->SetCapture(i + 1, match[i + 1]);
589  }
590  }
591  result->SetLastSubject(*subject);
592  result->SetLastInput(*subject);
593  return result;
594 }
595 
596 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
597  Handle<String> subject, Isolate* isolate)
598  : register_array_(nullptr),
599  register_array_size_(0),
600  regexp_(regexp),
601  subject_(subject),
602  isolate_(isolate) {
603 #ifdef V8_INTERPRETED_REGEXP
604  bool interpreted = true;
605 #else
606  bool interpreted = false;
607 #endif // V8_INTERPRETED_REGEXP
608 
609  if (regexp_->TypeTag() == JSRegExp::ATOM) {
610  static const int kAtomRegistersPerMatch = 2;
611  registers_per_match_ = kAtomRegistersPerMatch;
612  // There is no distinction between interpreted and native for atom regexps.
613  interpreted = false;
614  } else {
615  registers_per_match_ =
616  RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
617  if (registers_per_match_ < 0) {
618  num_matches_ = -1; // Signal exception.
619  return;
620  }
621  }
622 
623  DCHECK(IsGlobal(regexp->GetFlags()));
624  if (!interpreted) {
625  register_array_size_ =
626  Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
627  max_matches_ = register_array_size_ / registers_per_match_;
628  } else {
629  // Global loop in interpreted regexp is not implemented. We choose
630  // the size of the offsets vector so that it can only store one match.
631  register_array_size_ = registers_per_match_;
632  max_matches_ = 1;
633  }
634 
635  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
636  register_array_ = NewArray<int32_t>(register_array_size_);
637  } else {
638  register_array_ = isolate->jsregexp_static_offsets_vector();
639  }
640 
641  // Set state so that fetching the results the first time triggers a call
642  // to the compiled regexp.
643  current_match_index_ = max_matches_ - 1;
644  num_matches_ = max_matches_;
645  DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
646  DCHECK_GE(register_array_size_, registers_per_match_);
647  int32_t* last_match =
648  &register_array_[current_match_index_ * registers_per_match_];
649  last_match[0] = -1;
650  last_match[1] = 0;
651 }
652 
653 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
654  if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
655  unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
656  unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
657  // Advance over the surrogate pair.
658  return last_index + 2;
659  }
660  return last_index + 1;
661 }
662 
663 // -------------------------------------------------------------------
664 // Implementation of the Irregexp regular expression engine.
665 //
666 // The Irregexp regular expression engine is intended to be a complete
667 // implementation of ECMAScript regular expressions. It generates either
668 // bytecodes or native code.
669 
670 // The Irregexp regexp engine is structured in three steps.
671 // 1) The parser generates an abstract syntax tree. See ast.cc.
672 // 2) From the AST a node network is created. The nodes are all
673 // subclasses of RegExpNode. The nodes represent states when
674 // executing a regular expression. Several optimizations are
675 // performed on the node network.
676 // 3) From the nodes we generate either byte codes or native code
677 // that can actually execute the regular expression (perform
678 // the search). The code generation step is described in more
679 // detail below.
680 
681 // Code generation.
682 //
683 // The nodes are divided into four main categories.
684 // * Choice nodes
685 // These represent places where the regular expression can
686 // match in more than one way. For example on entry to an
687 // alternation (foo|bar) or a repetition (*, +, ? or {}).
688 // * Action nodes
689 // These represent places where some action should be
690 // performed. Examples include recording the current position
691 // in the input string to a register (in order to implement
692 // captures) or other actions on register for example in order
693 // to implement the counters needed for {} repetitions.
694 // * Matching nodes
695 // These attempt to match some element part of the input string.
696 // Examples of elements include character classes, plain strings
697 // or back references.
698 // * End nodes
699 // These are used to implement the actions required on finding
700 // a successful match or failing to find a match.
701 //
702 // The code generated (whether as byte codes or native code) maintains
703 // some state as it runs. This consists of the following elements:
704 //
705 // * The capture registers. Used for string captures.
706 // * Other registers. Used for counters etc.
707 // * The current position.
708 // * The stack of backtracking information. Used when a matching node
709 // fails to find a match and needs to try an alternative.
710 //
711 // Conceptual regular expression execution model:
712 //
713 // There is a simple conceptual model of regular expression execution
714 // which will be presented first. The actual code generated is a more
715 // efficient simulation of the simple conceptual model:
716 //
717 // * Choice nodes are implemented as follows:
718 // For each choice except the last {
719 // push current position
720 // push backtrack code location
721 // <generate code to test for choice>
722 // backtrack code location:
723 // pop current position
724 // }
725 // <generate code to test for last choice>
726 //
727 // * Actions nodes are generated as follows
728 // <push affected registers on backtrack stack>
729 // <generate code to perform action>
730 // push backtrack code location
731 // <generate code to test for following nodes>
732 // backtrack code location:
733 // <pop affected registers to restore their state>
734 // <pop backtrack location from stack and go to it>
735 //
736 // * Matching nodes are generated as follows:
737 // if input string matches at current position
738 // update current position
739 // <generate code to test for following nodes>
740 // else
741 // <pop backtrack location from stack and go to it>
742 //
743 // Thus it can be seen that the current position is saved and restored
744 // by the choice nodes, whereas the registers are saved and restored by
745 // by the action nodes that manipulate them.
746 //
747 // The other interesting aspect of this model is that nodes are generated
748 // at the point where they are needed by a recursive call to Emit(). If
749 // the node has already been code generated then the Emit() call will
750 // generate a jump to the previously generated code instead. In order to
751 // limit recursion it is possible for the Emit() function to put the node
752 // on a work list for later generation and instead generate a jump. The
753 // destination of the jump is resolved later when the code is generated.
754 //
755 // Actual regular expression code generation.
756 //
757 // Code generation is actually more complicated than the above. In order
758 // to improve the efficiency of the generated code some optimizations are
759 // performed
760 //
761 // * Choice nodes have 1-character lookahead.
762 // A choice node looks at the following character and eliminates some of
763 // the choices immediately based on that character. This is not yet
764 // implemented.
765 // * Simple greedy loops store reduced backtracking information.
766 // A quantifier like /.*foo/m will greedily match the whole input. It will
767 // then need to backtrack to a point where it can match "foo". The naive
768 // implementation of this would push each character position onto the
769 // backtracking stack, then pop them off one by one. This would use space
770 // proportional to the length of the input string. However since the "."
771 // can only match in one way and always has a constant length (in this case
772 // of 1) it suffices to store the current position on the top of the stack
773 // once. Matching now becomes merely incrementing the current position and
774 // backtracking becomes decrementing the current position and checking the
775 // result against the stored current position. This is faster and saves
776 // space.
777 // * The current state is virtualized.
778 // This is used to defer expensive operations until it is clear that they
779 // are needed and to generate code for a node more than once, allowing
780 // specialized an efficient versions of the code to be created. This is
781 // explained in the section below.
782 //
783 // Execution state virtualization.
784 //
785 // Instead of emitting code, nodes that manipulate the state can record their
786 // manipulation in an object called the Trace. The Trace object can record a
787 // current position offset, an optional backtrack code location on the top of
788 // the virtualized backtrack stack and some register changes. When a node is
789 // to be emitted it can flush the Trace or update it. Flushing the Trace
790 // will emit code to bring the actual state into line with the virtual state.
791 // Avoiding flushing the state can postpone some work (e.g. updates of capture
792 // registers). Postponing work can save time when executing the regular
793 // expression since it may be found that the work never has to be done as a
794 // failure to match can occur. In addition it is much faster to jump to a
795 // known backtrack code location than it is to pop an unknown backtrack
796 // location from the stack and jump there.
797 //
798 // The virtual state found in the Trace affects code generation. For example
799 // the virtual state contains the difference between the actual current
800 // position and the virtual current position, and matching code needs to use
801 // this offset to attempt a match in the correct location of the input
802 // string. Therefore code generated for a non-trivial trace is specialized
803 // to that trace. The code generator therefore has the ability to generate
804 // code for each node several times. In order to limit the size of the
805 // generated code there is an arbitrary limit on how many specialized sets of
806 // code may be generated for a given node. If the limit is reached, the
807 // trace is flushed and a generic version of the code for a node is emitted.
808 // This is subsequently used for that node. The code emitted for non-generic
809 // trace is not recorded in the node and so it cannot currently be reused in
810 // the event that code generation is requested for an identical trace.
811 
812 
813 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
814  UNREACHABLE();
815 }
816 
817 
818 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
819  text->AddElement(TextElement::Atom(this), zone);
820 }
821 
822 
823 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
824  text->AddElement(TextElement::CharClass(this), zone);
825 }
826 
827 
828 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
829  for (int i = 0; i < elements()->length(); i++)
830  text->AddElement(elements()->at(i), zone);
831 }
832 
833 
834 TextElement TextElement::Atom(RegExpAtom* atom) {
835  return TextElement(ATOM, atom);
836 }
837 
838 
839 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
840  return TextElement(CHAR_CLASS, char_class);
841 }
842 
843 
844 int TextElement::length() const {
845  switch (text_type()) {
846  case ATOM:
847  return atom()->length();
848 
849  case CHAR_CLASS:
850  return 1;
851  }
852  UNREACHABLE();
853 }
854 
855 
856 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
857  if (table_ == nullptr) {
858  table_ = new(zone()) DispatchTable(zone());
859  DispatchTableConstructor cons(table_, ignore_case, zone());
860  cons.BuildTable(this);
861  }
862  return table_;
863 }
864 
865 
867  public:
868  FrequencyCollator() : total_samples_(0) {
869  for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
870  frequencies_[i] = CharacterFrequency(i);
871  }
872  }
873 
874  void CountCharacter(int character) {
875  int index = (character & RegExpMacroAssembler::kTableMask);
876  frequencies_[index].Increment();
877  total_samples_++;
878  }
879 
880  // Does not measure in percent, but rather per-128 (the table size from the
881  // regexp macro assembler).
882  int Frequency(int in_character) {
883  DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
884  if (total_samples_ < 1) return 1; // Division by zero.
885  int freq_in_per128 =
886  (frequencies_[in_character].counter() * 128) / total_samples_;
887  return freq_in_per128;
888  }
889 
890  private:
891  class CharacterFrequency {
892  public:
893  CharacterFrequency() : counter_(0), character_(-1) { }
894  explicit CharacterFrequency(int character)
895  : counter_(0), character_(character) { }
896 
897  void Increment() { counter_++; }
898  int counter() { return counter_; }
899  int character() { return character_; }
900 
901  private:
902  int counter_;
903  int character_;
904  };
905 
906 
907  private:
908  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
909  int total_samples_;
910 };
911 
912 
914  public:
915  RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
916  bool is_one_byte);
917 
918  int AllocateRegister() {
919  if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
920  reg_exp_too_big_ = true;
921  return next_register_;
922  }
923  return next_register_++;
924  }
925 
926  // Lookarounds to match lone surrogates for unicode character class matches
927  // are never nested. We can therefore reuse registers.
928  int UnicodeLookaroundStackRegister() {
929  if (unicode_lookaround_stack_register_ == kNoRegister) {
930  unicode_lookaround_stack_register_ = AllocateRegister();
931  }
932  return unicode_lookaround_stack_register_;
933  }
934 
935  int UnicodeLookaroundPositionRegister() {
936  if (unicode_lookaround_position_register_ == kNoRegister) {
937  unicode_lookaround_position_register_ = AllocateRegister();
938  }
939  return unicode_lookaround_position_register_;
940  }
941 
942  RegExpEngine::CompilationResult Assemble(Isolate* isolate,
943  RegExpMacroAssembler* assembler,
944  RegExpNode* start, int capture_count,
945  Handle<String> pattern);
946 
947  inline void AddWork(RegExpNode* node) {
948  if (!node->on_work_list() && !node->label()->is_bound()) {
949  node->set_on_work_list(true);
950  work_list_->push_back(node);
951  }
952  }
953 
954  static const int kImplementationOffset = 0;
955  static const int kNumberOfRegistersOffset = 0;
956  static const int kCodeOffset = 1;
957 
958  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
959  EndNode* accept() { return accept_; }
960 
961  static const int kMaxRecursion = 100;
962  inline int recursion_depth() { return recursion_depth_; }
963  inline void IncrementRecursionDepth() { recursion_depth_++; }
964  inline void DecrementRecursionDepth() { recursion_depth_--; }
965 
966  void SetRegExpTooBig() { reg_exp_too_big_ = true; }
967 
968  inline bool one_byte() { return one_byte_; }
969  inline bool optimize() { return optimize_; }
970  inline void set_optimize(bool value) { optimize_ = value; }
971  inline bool limiting_recursion() { return limiting_recursion_; }
972  inline void set_limiting_recursion(bool value) {
973  limiting_recursion_ = value;
974  }
975  bool read_backward() { return read_backward_; }
976  void set_read_backward(bool value) { read_backward_ = value; }
977  FrequencyCollator* frequency_collator() { return &frequency_collator_; }
978 
979  int current_expansion_factor() { return current_expansion_factor_; }
980  void set_current_expansion_factor(int value) {
981  current_expansion_factor_ = value;
982  }
983 
984  Isolate* isolate() const { return isolate_; }
985  Zone* zone() const { return zone_; }
986 
987  static const int kNoRegister = -1;
988 
989  private:
990  EndNode* accept_;
991  int next_register_;
992  int unicode_lookaround_stack_register_;
993  int unicode_lookaround_position_register_;
994  std::vector<RegExpNode*>* work_list_;
995  int recursion_depth_;
996  RegExpMacroAssembler* macro_assembler_;
997  bool one_byte_;
998  bool reg_exp_too_big_;
999  bool limiting_recursion_;
1000  bool optimize_;
1001  bool read_backward_;
1002  int current_expansion_factor_;
1003  FrequencyCollator frequency_collator_;
1004  Isolate* isolate_;
1005  Zone* zone_;
1006 };
1007 
1008 
1010  public:
1011  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1012  compiler->IncrementRecursionDepth();
1013  }
1014  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1015  private:
1016  RegExpCompiler* compiler_;
1017 };
1018 
1019 
1020 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1021  return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1022 }
1023 
1024 
1025 // Attempts to compile the regexp using an Irregexp code generator. Returns
1026 // a fixed array or a null handle depending on whether it succeeded.
1027 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1028  bool one_byte)
1029  : next_register_(2 * (capture_count + 1)),
1030  unicode_lookaround_stack_register_(kNoRegister),
1031  unicode_lookaround_position_register_(kNoRegister),
1032  work_list_(nullptr),
1033  recursion_depth_(0),
1034  one_byte_(one_byte),
1035  reg_exp_too_big_(false),
1036  limiting_recursion_(false),
1037  optimize_(FLAG_regexp_optimization),
1038  read_backward_(false),
1039  current_expansion_factor_(1),
1040  frequency_collator_(),
1041  isolate_(isolate),
1042  zone_(zone) {
1043  accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1044  DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
1045 }
1046 
1047 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1048  Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
1049  int capture_count, Handle<String> pattern) {
1050 #ifdef DEBUG
1051  if (FLAG_trace_regexp_assembler)
1052  macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1053  else
1054 #endif
1055  macro_assembler_ = macro_assembler;
1056 
1057  std::vector<RegExpNode*> work_list;
1058  work_list_ = &work_list;
1059  Label fail;
1060  macro_assembler_->PushBacktrack(&fail);
1061  Trace new_trace;
1062  start->Emit(this, &new_trace);
1063  macro_assembler_->Bind(&fail);
1064  macro_assembler_->Fail();
1065  while (!work_list.empty()) {
1066  RegExpNode* node = work_list.back();
1067  work_list.pop_back();
1068  node->set_on_work_list(false);
1069  if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1070  }
1071  if (reg_exp_too_big_) {
1072  macro_assembler_->AbortedCodeGeneration();
1073  return IrregexpRegExpTooBig(isolate_);
1074  }
1075 
1076  Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1077  isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1078  work_list_ = nullptr;
1079 #if defined(ENABLE_DISASSEMBLER) && !defined(V8_INTERPRETED_REGEXP)
1080  if (FLAG_print_code) {
1081  CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1082  OFStream os(trace_scope.file());
1083  Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1084  }
1085 #endif
1086 #ifdef DEBUG
1087  if (FLAG_trace_regexp_assembler) {
1088  delete macro_assembler_;
1089  }
1090 #endif
1091  return RegExpEngine::CompilationResult(*code, next_register_);
1092 }
1093 
1094 
1095 bool Trace::DeferredAction::Mentions(int that) {
1096  if (action_type() == ActionNode::CLEAR_CAPTURES) {
1097  Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1098  return range.Contains(that);
1099  } else {
1100  return reg() == that;
1101  }
1102 }
1103 
1104 
1105 bool Trace::mentions_reg(int reg) {
1106  for (DeferredAction* action = actions_; action != nullptr;
1107  action = action->next()) {
1108  if (action->Mentions(reg))
1109  return true;
1110  }
1111  return false;
1112 }
1113 
1114 
1115 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1116  DCHECK_EQ(0, *cp_offset);
1117  for (DeferredAction* action = actions_; action != nullptr;
1118  action = action->next()) {
1119  if (action->Mentions(reg)) {
1120  if (action->action_type() == ActionNode::STORE_POSITION) {
1121  *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1122  return true;
1123  } else {
1124  return false;
1125  }
1126  }
1127  }
1128  return false;
1129 }
1130 
1131 
1132 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1133  Zone* zone) {
1134  int max_register = RegExpCompiler::kNoRegister;
1135  for (DeferredAction* action = actions_; action != nullptr;
1136  action = action->next()) {
1137  if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1138  Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1139  for (int i = range.from(); i <= range.to(); i++)
1140  affected_registers->Set(i, zone);
1141  if (range.to() > max_register) max_register = range.to();
1142  } else {
1143  affected_registers->Set(action->reg(), zone);
1144  if (action->reg() > max_register) max_register = action->reg();
1145  }
1146  }
1147  return max_register;
1148 }
1149 
1150 
1151 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1152  int max_register,
1153  const OutSet& registers_to_pop,
1154  const OutSet& registers_to_clear) {
1155  for (int reg = max_register; reg >= 0; reg--) {
1156  if (registers_to_pop.Get(reg)) {
1157  assembler->PopRegister(reg);
1158  } else if (registers_to_clear.Get(reg)) {
1159  int clear_to = reg;
1160  while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1161  reg--;
1162  }
1163  assembler->ClearRegisters(reg, clear_to);
1164  }
1165  }
1166 }
1167 
1168 
1169 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1170  int max_register,
1171  const OutSet& affected_registers,
1172  OutSet* registers_to_pop,
1173  OutSet* registers_to_clear,
1174  Zone* zone) {
1175  // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1176  const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1177 
1178  // Count pushes performed to force a stack limit check occasionally.
1179  int pushes = 0;
1180 
1181  for (int reg = 0; reg <= max_register; reg++) {
1182  if (!affected_registers.Get(reg)) {
1183  continue;
1184  }
1185 
1186  // The chronologically first deferred action in the trace
1187  // is used to infer the action needed to restore a register
1188  // to its previous state (or not, if it's safe to ignore it).
1189  enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1190  DeferredActionUndoType undo_action = IGNORE;
1191 
1192  int value = 0;
1193  bool absolute = false;
1194  bool clear = false;
1195  static const int kNoStore = kMinInt;
1196  int store_position = kNoStore;
1197  // This is a little tricky because we are scanning the actions in reverse
1198  // historical order (newest first).
1199  for (DeferredAction* action = actions_; action != nullptr;
1200  action = action->next()) {
1201  if (action->Mentions(reg)) {
1202  switch (action->action_type()) {
1203  case ActionNode::SET_REGISTER: {
1204  Trace::DeferredSetRegister* psr =
1205  static_cast<Trace::DeferredSetRegister*>(action);
1206  if (!absolute) {
1207  value += psr->value();
1208  absolute = true;
1209  }
1210  // SET_REGISTER is currently only used for newly introduced loop
1211  // counters. They can have a significant previous value if they
1212  // occur in a loop. TODO(lrn): Propagate this information, so
1213  // we can set undo_action to IGNORE if we know there is no value to
1214  // restore.
1215  undo_action = RESTORE;
1216  DCHECK_EQ(store_position, kNoStore);
1217  DCHECK(!clear);
1218  break;
1219  }
1220  case ActionNode::INCREMENT_REGISTER:
1221  if (!absolute) {
1222  value++;
1223  }
1224  DCHECK_EQ(store_position, kNoStore);
1225  DCHECK(!clear);
1226  undo_action = RESTORE;
1227  break;
1228  case ActionNode::STORE_POSITION: {
1229  Trace::DeferredCapture* pc =
1230  static_cast<Trace::DeferredCapture*>(action);
1231  if (!clear && store_position == kNoStore) {
1232  store_position = pc->cp_offset();
1233  }
1234 
1235  // For captures we know that stores and clears alternate.
1236  // Other register, are never cleared, and if the occur
1237  // inside a loop, they might be assigned more than once.
1238  if (reg <= 1) {
1239  // Registers zero and one, aka "capture zero", is
1240  // always set correctly if we succeed. There is no
1241  // need to undo a setting on backtrack, because we
1242  // will set it again or fail.
1243  undo_action = IGNORE;
1244  } else {
1245  undo_action = pc->is_capture() ? CLEAR : RESTORE;
1246  }
1247  DCHECK(!absolute);
1248  DCHECK_EQ(value, 0);
1249  break;
1250  }
1251  case ActionNode::CLEAR_CAPTURES: {
1252  // Since we're scanning in reverse order, if we've already
1253  // set the position we have to ignore historically earlier
1254  // clearing operations.
1255  if (store_position == kNoStore) {
1256  clear = true;
1257  }
1258  undo_action = RESTORE;
1259  DCHECK(!absolute);
1260  DCHECK_EQ(value, 0);
1261  break;
1262  }
1263  default:
1264  UNREACHABLE();
1265  break;
1266  }
1267  }
1268  }
1269  // Prepare for the undo-action (e.g., push if it's going to be popped).
1270  if (undo_action == RESTORE) {
1271  pushes++;
1272  RegExpMacroAssembler::StackCheckFlag stack_check =
1273  RegExpMacroAssembler::kNoStackLimitCheck;
1274  if (pushes == push_limit) {
1275  stack_check = RegExpMacroAssembler::kCheckStackLimit;
1276  pushes = 0;
1277  }
1278 
1279  assembler->PushRegister(reg, stack_check);
1280  registers_to_pop->Set(reg, zone);
1281  } else if (undo_action == CLEAR) {
1282  registers_to_clear->Set(reg, zone);
1283  }
1284  // Perform the chronologically last action (or accumulated increment)
1285  // for the register.
1286  if (store_position != kNoStore) {
1287  assembler->WriteCurrentPositionToRegister(reg, store_position);
1288  } else if (clear) {
1289  assembler->ClearRegisters(reg, reg);
1290  } else if (absolute) {
1291  assembler->SetRegister(reg, value);
1292  } else if (value != 0) {
1293  assembler->AdvanceRegister(reg, value);
1294  }
1295  }
1296 }
1297 
1298 
1299 // This is called as we come into a loop choice node and some other tricky
1300 // nodes. It normalizes the state of the code generator to ensure we can
1301 // generate generic code.
1302 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1303  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1304 
1305  DCHECK(!is_trivial());
1306 
1307  if (actions_ == nullptr && backtrack() == nullptr) {
1308  // Here we just have some deferred cp advances to fix and we are back to
1309  // a normal situation. We may also have to forget some information gained
1310  // through a quick check that was already performed.
1311  if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1312  // Create a new trivial state and generate the node with that.
1313  Trace new_state;
1314  successor->Emit(compiler, &new_state);
1315  return;
1316  }
1317 
1318  // Generate deferred actions here along with code to undo them again.
1319  OutSet affected_registers;
1320 
1321  if (backtrack() != nullptr) {
1322  // Here we have a concrete backtrack location. These are set up by choice
1323  // nodes and so they indicate that we have a deferred save of the current
1324  // position which we may need to emit here.
1325  assembler->PushCurrentPosition();
1326  }
1327 
1328  int max_register = FindAffectedRegisters(&affected_registers,
1329  compiler->zone());
1330  OutSet registers_to_pop;
1331  OutSet registers_to_clear;
1332  PerformDeferredActions(assembler,
1333  max_register,
1334  affected_registers,
1335  &registers_to_pop,
1336  &registers_to_clear,
1337  compiler->zone());
1338  if (cp_offset_ != 0) {
1339  assembler->AdvanceCurrentPosition(cp_offset_);
1340  }
1341 
1342  // Create a new trivial state and generate the node with that.
1343  Label undo;
1344  assembler->PushBacktrack(&undo);
1345  if (successor->KeepRecursing(compiler)) {
1346  Trace new_state;
1347  successor->Emit(compiler, &new_state);
1348  } else {
1349  compiler->AddWork(successor);
1350  assembler->GoTo(successor->label());
1351  }
1352 
1353  // On backtrack we need to restore state.
1354  assembler->Bind(&undo);
1355  RestoreAffectedRegisters(assembler,
1356  max_register,
1357  registers_to_pop,
1358  registers_to_clear);
1359  if (backtrack() == nullptr) {
1360  assembler->Backtrack();
1361  } else {
1362  assembler->PopCurrentPosition();
1363  assembler->GoTo(backtrack());
1364  }
1365 }
1366 
1367 
1368 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1369  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1370 
1371  // Omit flushing the trace. We discard the entire stack frame anyway.
1372 
1373  if (!label()->is_bound()) {
1374  // We are completely independent of the trace, since we ignore it,
1375  // so this code can be used as the generic version.
1376  assembler->Bind(label());
1377  }
1378 
1379  // Throw away everything on the backtrack stack since the start
1380  // of the negative submatch and restore the character position.
1381  assembler->ReadCurrentPositionFromRegister(current_position_register_);
1382  assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1383  if (clear_capture_count_ > 0) {
1384  // Clear any captures that might have been performed during the success
1385  // of the body of the negative look-ahead.
1386  int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1387  assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1388  }
1389  // Now that we have unwound the stack we find at the top of the stack the
1390  // backtrack that the BeginSubmatch node got.
1391  assembler->Backtrack();
1392 }
1393 
1394 
1395 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1396  if (!trace->is_trivial()) {
1397  trace->Flush(compiler, this);
1398  return;
1399  }
1400  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1401  if (!label()->is_bound()) {
1402  assembler->Bind(label());
1403  }
1404  switch (action_) {
1405  case ACCEPT:
1406  assembler->Succeed();
1407  return;
1408  case BACKTRACK:
1409  assembler->GoTo(trace->backtrack());
1410  return;
1411  case NEGATIVE_SUBMATCH_SUCCESS:
1412  // This case is handled in a different virtual method.
1413  UNREACHABLE();
1414  }
1415  UNIMPLEMENTED();
1416 }
1417 
1418 
1419 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1420  if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone);
1421  guards_->Add(guard, zone);
1422 }
1423 
1424 
1425 ActionNode* ActionNode::SetRegister(int reg,
1426  int val,
1427  RegExpNode* on_success) {
1428  ActionNode* result =
1429  new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1430  result->data_.u_store_register.reg = reg;
1431  result->data_.u_store_register.value = val;
1432  return result;
1433 }
1434 
1435 
1436 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1437  ActionNode* result =
1438  new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1439  result->data_.u_increment_register.reg = reg;
1440  return result;
1441 }
1442 
1443 
1444 ActionNode* ActionNode::StorePosition(int reg,
1445  bool is_capture,
1446  RegExpNode* on_success) {
1447  ActionNode* result =
1448  new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1449  result->data_.u_position_register.reg = reg;
1450  result->data_.u_position_register.is_capture = is_capture;
1451  return result;
1452 }
1453 
1454 
1455 ActionNode* ActionNode::ClearCaptures(Interval range,
1456  RegExpNode* on_success) {
1457  ActionNode* result =
1458  new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1459  result->data_.u_clear_captures.range_from = range.from();
1460  result->data_.u_clear_captures.range_to = range.to();
1461  return result;
1462 }
1463 
1464 
1465 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1466  int position_reg,
1467  RegExpNode* on_success) {
1468  ActionNode* result =
1469  new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1470  result->data_.u_submatch.stack_pointer_register = stack_reg;
1471  result->data_.u_submatch.current_position_register = position_reg;
1472  return result;
1473 }
1474 
1475 
1476 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1477  int position_reg,
1478  int clear_register_count,
1479  int clear_register_from,
1480  RegExpNode* on_success) {
1481  ActionNode* result =
1482  new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1483  result->data_.u_submatch.stack_pointer_register = stack_reg;
1484  result->data_.u_submatch.current_position_register = position_reg;
1485  result->data_.u_submatch.clear_register_count = clear_register_count;
1486  result->data_.u_submatch.clear_register_from = clear_register_from;
1487  return result;
1488 }
1489 
1490 
1491 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1492  int repetition_register,
1493  int repetition_limit,
1494  RegExpNode* on_success) {
1495  ActionNode* result =
1496  new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1497  result->data_.u_empty_match_check.start_register = start_register;
1498  result->data_.u_empty_match_check.repetition_register = repetition_register;
1499  result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1500  return result;
1501 }
1502 
1503 
1504 #define DEFINE_ACCEPT(Type) \
1505  void Type##Node::Accept(NodeVisitor* visitor) { \
1506  visitor->Visit##Type(this); \
1507  }
1508 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1509 #undef DEFINE_ACCEPT
1510 
1511 
1512 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1513  visitor->VisitLoopChoice(this);
1514 }
1515 
1516 
1517 // -------------------------------------------------------------------
1518 // Emit code.
1519 
1520 
1521 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1522  Guard* guard,
1523  Trace* trace) {
1524  switch (guard->op()) {
1525  case Guard::LT:
1526  DCHECK(!trace->mentions_reg(guard->reg()));
1527  macro_assembler->IfRegisterGE(guard->reg(),
1528  guard->value(),
1529  trace->backtrack());
1530  break;
1531  case Guard::GEQ:
1532  DCHECK(!trace->mentions_reg(guard->reg()));
1533  macro_assembler->IfRegisterLT(guard->reg(),
1534  guard->value(),
1535  trace->backtrack());
1536  break;
1537  }
1538 }
1539 
1540 
1541 // Returns the number of characters in the equivalence class, omitting those
1542 // that cannot occur in the source string because it is Latin1.
1543 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1544  bool one_byte_subject,
1545  unibrow::uchar* letters) {
1546  int length =
1547  isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1548  // Unibrow returns 0 or 1 for characters where case independence is
1549  // trivial.
1550  if (length == 0) {
1551  letters[0] = character;
1552  length = 1;
1553  }
1554 
1555  if (one_byte_subject) {
1556  int new_length = 0;
1557  for (int i = 0; i < length; i++) {
1558  if (letters[i] <= String::kMaxOneByteCharCode) {
1559  letters[new_length++] = letters[i];
1560  }
1561  }
1562  length = new_length;
1563  }
1564 
1565  return length;
1566 }
1567 
1568 
1569 static inline bool EmitSimpleCharacter(Isolate* isolate,
1570  RegExpCompiler* compiler,
1571  uc16 c,
1572  Label* on_failure,
1573  int cp_offset,
1574  bool check,
1575  bool preloaded) {
1576  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1577  bool bound_checked = false;
1578  if (!preloaded) {
1579  assembler->LoadCurrentCharacter(
1580  cp_offset,
1581  on_failure,
1582  check);
1583  bound_checked = true;
1584  }
1585  assembler->CheckNotCharacter(c, on_failure);
1586  return bound_checked;
1587 }
1588 
1589 
1590 // Only emits non-letters (things that don't have case). Only used for case
1591 // independent matches.
1592 static inline bool EmitAtomNonLetter(Isolate* isolate,
1593  RegExpCompiler* compiler,
1594  uc16 c,
1595  Label* on_failure,
1596  int cp_offset,
1597  bool check,
1598  bool preloaded) {
1599  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1600  bool one_byte = compiler->one_byte();
1601  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1602  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1603  if (length < 1) {
1604  // This can't match. Must be an one-byte subject and a non-one-byte
1605  // character. We do not need to do anything since the one-byte pass
1606  // already handled this.
1607  return false; // Bounds not checked.
1608  }
1609  bool checked = false;
1610  // We handle the length > 1 case in a later pass.
1611  if (length == 1) {
1612  if (one_byte && c > String::kMaxOneByteCharCodeU) {
1613  // Can't match - see above.
1614  return false; // Bounds not checked.
1615  }
1616  if (!preloaded) {
1617  macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1618  checked = check;
1619  }
1620  macro_assembler->CheckNotCharacter(c, on_failure);
1621  }
1622  return checked;
1623 }
1624 
1625 
1626 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1627  bool one_byte, uc16 c1, uc16 c2,
1628  Label* on_failure) {
1629  uc16 char_mask;
1630  if (one_byte) {
1631  char_mask = String::kMaxOneByteCharCode;
1632  } else {
1633  char_mask = String::kMaxUtf16CodeUnit;
1634  }
1635  uc16 exor = c1 ^ c2;
1636  // Check whether exor has only one bit set.
1637  if (((exor - 1) & exor) == 0) {
1638  // If c1 and c2 differ only by one bit.
1639  // Ecma262UnCanonicalize always gives the highest number last.
1640  DCHECK(c2 > c1);
1641  uc16 mask = char_mask ^ exor;
1642  macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1643  return true;
1644  }
1645  DCHECK(c2 > c1);
1646  uc16 diff = c2 - c1;
1647  if (((diff - 1) & diff) == 0 && c1 >= diff) {
1648  // If the characters differ by 2^n but don't differ by one bit then
1649  // subtract the difference from the found character, then do the or
1650  // trick. We avoid the theoretical case where negative numbers are
1651  // involved in order to simplify code generation.
1652  uc16 mask = char_mask ^ diff;
1653  macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1654  diff,
1655  mask,
1656  on_failure);
1657  return true;
1658  }
1659  return false;
1660 }
1661 
1662 
1663 typedef bool EmitCharacterFunction(Isolate* isolate,
1664  RegExpCompiler* compiler,
1665  uc16 c,
1666  Label* on_failure,
1667  int cp_offset,
1668  bool check,
1669  bool preloaded);
1670 
1671 // Only emits letters (things that have case). Only used for case independent
1672 // matches.
1673 static inline bool EmitAtomLetter(Isolate* isolate,
1674  RegExpCompiler* compiler,
1675  uc16 c,
1676  Label* on_failure,
1677  int cp_offset,
1678  bool check,
1679  bool preloaded) {
1680  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1681  bool one_byte = compiler->one_byte();
1682  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1683  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1684  if (length <= 1) return false;
1685  // We may not need to check against the end of the input string
1686  // if this character lies before a character that matched.
1687  if (!preloaded) {
1688  macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1689  }
1690  Label ok;
1691  DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth);
1692  switch (length) {
1693  case 2: {
1694  if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1695  chars[1], on_failure)) {
1696  } else {
1697  macro_assembler->CheckCharacter(chars[0], &ok);
1698  macro_assembler->CheckNotCharacter(chars[1], on_failure);
1699  macro_assembler->Bind(&ok);
1700  }
1701  break;
1702  }
1703  case 4:
1704  macro_assembler->CheckCharacter(chars[3], &ok);
1705  V8_FALLTHROUGH;
1706  case 3:
1707  macro_assembler->CheckCharacter(chars[0], &ok);
1708  macro_assembler->CheckCharacter(chars[1], &ok);
1709  macro_assembler->CheckNotCharacter(chars[2], on_failure);
1710  macro_assembler->Bind(&ok);
1711  break;
1712  default:
1713  UNREACHABLE();
1714  break;
1715  }
1716  return true;
1717 }
1718 
1719 
1720 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1721  int border,
1722  Label* fall_through,
1723  Label* above_or_equal,
1724  Label* below) {
1725  if (below != fall_through) {
1726  masm->CheckCharacterLT(border, below);
1727  if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1728  } else {
1729  masm->CheckCharacterGT(border - 1, above_or_equal);
1730  }
1731 }
1732 
1733 
1734 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1735  int first,
1736  int last,
1737  Label* fall_through,
1738  Label* in_range,
1739  Label* out_of_range) {
1740  if (in_range == fall_through) {
1741  if (first == last) {
1742  masm->CheckNotCharacter(first, out_of_range);
1743  } else {
1744  masm->CheckCharacterNotInRange(first, last, out_of_range);
1745  }
1746  } else {
1747  if (first == last) {
1748  masm->CheckCharacter(first, in_range);
1749  } else {
1750  masm->CheckCharacterInRange(first, last, in_range);
1751  }
1752  if (out_of_range != fall_through) masm->GoTo(out_of_range);
1753  }
1754 }
1755 
1756 
1757 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1758 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1759 static void EmitUseLookupTable(
1760  RegExpMacroAssembler* masm,
1761  ZoneList<int>* ranges,
1762  int start_index,
1763  int end_index,
1764  int min_char,
1765  Label* fall_through,
1766  Label* even_label,
1767  Label* odd_label) {
1768  static const int kSize = RegExpMacroAssembler::kTableSize;
1769  static const int kMask = RegExpMacroAssembler::kTableMask;
1770 
1771  int base = (min_char & ~kMask);
1772  USE(base);
1773 
1774  // Assert that everything is on one kTableSize page.
1775  for (int i = start_index; i <= end_index; i++) {
1776  DCHECK_EQ(ranges->at(i) & ~kMask, base);
1777  }
1778  DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1779 
1780  char templ[kSize];
1781  Label* on_bit_set;
1782  Label* on_bit_clear;
1783  int bit;
1784  if (even_label == fall_through) {
1785  on_bit_set = odd_label;
1786  on_bit_clear = even_label;
1787  bit = 1;
1788  } else {
1789  on_bit_set = even_label;
1790  on_bit_clear = odd_label;
1791  bit = 0;
1792  }
1793  for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1794  templ[i] = bit;
1795  }
1796  int j = 0;
1797  bit ^= 1;
1798  for (int i = start_index; i < end_index; i++) {
1799  for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1800  templ[j] = bit;
1801  }
1802  bit ^= 1;
1803  }
1804  for (int i = j; i < kSize; i++) {
1805  templ[i] = bit;
1806  }
1807  Factory* factory = masm->isolate()->factory();
1808  // TODO(erikcorry): Cache these.
1809  Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1810  for (int i = 0; i < kSize; i++) {
1811  ba->set(i, templ[i]);
1812  }
1813  masm->CheckBitInTable(ba, on_bit_set);
1814  if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1815 }
1816 
1817 
1818 static void CutOutRange(RegExpMacroAssembler* masm,
1819  ZoneList<int>* ranges,
1820  int start_index,
1821  int end_index,
1822  int cut_index,
1823  Label* even_label,
1824  Label* odd_label) {
1825  bool odd = (((cut_index - start_index) & 1) == 1);
1826  Label* in_range_label = odd ? odd_label : even_label;
1827  Label dummy;
1828  EmitDoubleBoundaryTest(masm,
1829  ranges->at(cut_index),
1830  ranges->at(cut_index + 1) - 1,
1831  &dummy,
1832  in_range_label,
1833  &dummy);
1834  DCHECK(!dummy.is_linked());
1835  // Cut out the single range by rewriting the array. This creates a new
1836  // range that is a merger of the two ranges on either side of the one we
1837  // are cutting out. The oddity of the labels is preserved.
1838  for (int j = cut_index; j > start_index; j--) {
1839  ranges->at(j) = ranges->at(j - 1);
1840  }
1841  for (int j = cut_index + 1; j < end_index; j++) {
1842  ranges->at(j) = ranges->at(j + 1);
1843  }
1844 }
1845 
1846 
1847 // Unicode case. Split the search space into kSize spaces that are handled
1848 // with recursion.
1849 static void SplitSearchSpace(ZoneList<int>* ranges,
1850  int start_index,
1851  int end_index,
1852  int* new_start_index,
1853  int* new_end_index,
1854  int* border) {
1855  static const int kSize = RegExpMacroAssembler::kTableSize;
1856  static const int kMask = RegExpMacroAssembler::kTableMask;
1857 
1858  int first = ranges->at(start_index);
1859  int last = ranges->at(end_index) - 1;
1860 
1861  *new_start_index = start_index;
1862  *border = (ranges->at(start_index) & ~kMask) + kSize;
1863  while (*new_start_index < end_index) {
1864  if (ranges->at(*new_start_index) > *border) break;
1865  (*new_start_index)++;
1866  }
1867  // new_start_index is the index of the first edge that is beyond the
1868  // current kSize space.
1869 
1870  // For very large search spaces we do a binary chop search of the non-Latin1
1871  // space instead of just going to the end of the current kSize space. The
1872  // heuristics are complicated a little by the fact that any 128-character
1873  // encoding space can be quickly tested with a table lookup, so we don't
1874  // wish to do binary chop search at a smaller granularity than that. A
1875  // 128-character space can take up a lot of space in the ranges array if,
1876  // for example, we only want to match every second character (eg. the lower
1877  // case characters on some Unicode pages).
1878  int binary_chop_index = (end_index + start_index) / 2;
1879  // The first test ensures that we get to the code that handles the Latin1
1880  // range with a single not-taken branch, speeding up this important
1881  // character range (even non-Latin1 charset-based text has spaces and
1882  // punctuation).
1883  if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1884  end_index - start_index > (*new_start_index - start_index) * 2 &&
1885  last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1886  ranges->at(binary_chop_index) >= first + 2 * kSize) {
1887  int scan_forward_for_section_border = binary_chop_index;;
1888  int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1889 
1890  while (scan_forward_for_section_border < end_index) {
1891  if (ranges->at(scan_forward_for_section_border) > new_border) {
1892  *new_start_index = scan_forward_for_section_border;
1893  *border = new_border;
1894  break;
1895  }
1896  scan_forward_for_section_border++;
1897  }
1898  }
1899 
1900  DCHECK(*new_start_index > start_index);
1901  *new_end_index = *new_start_index - 1;
1902  if (ranges->at(*new_end_index) == *border) {
1903  (*new_end_index)--;
1904  }
1905  if (*border >= ranges->at(end_index)) {
1906  *border = ranges->at(end_index);
1907  *new_start_index = end_index; // Won't be used.
1908  *new_end_index = end_index - 1;
1909  }
1910 }
1911 
1912 // Gets a series of segment boundaries representing a character class. If the
1913 // character is in the range between an even and an odd boundary (counting from
1914 // start_index) then go to even_label, otherwise go to odd_label. We already
1915 // know that the character is in the range of min_char to max_char inclusive.
1916 // Either label can be nullptr indicating backtracking. Either label can also
1917 // be equal to the fall_through label.
1918 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1919  int start_index, int end_index, uc32 min_char,
1920  uc32 max_char, Label* fall_through,
1921  Label* even_label, Label* odd_label) {
1922  DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1923  DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1924 
1925  int first = ranges->at(start_index);
1926  int last = ranges->at(end_index) - 1;
1927 
1928  DCHECK_LT(min_char, first);
1929 
1930  // Just need to test if the character is before or on-or-after
1931  // a particular character.
1932  if (start_index == end_index) {
1933  EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1934  return;
1935  }
1936 
1937  // Another almost trivial case: There is one interval in the middle that is
1938  // different from the end intervals.
1939  if (start_index + 1 == end_index) {
1940  EmitDoubleBoundaryTest(
1941  masm, first, last, fall_through, even_label, odd_label);
1942  return;
1943  }
1944 
1945  // It's not worth using table lookup if there are very few intervals in the
1946  // character class.
1947  if (end_index - start_index <= 6) {
1948  // It is faster to test for individual characters, so we look for those
1949  // first, then try arbitrary ranges in the second round.
1950  static int kNoCutIndex = -1;
1951  int cut = kNoCutIndex;
1952  for (int i = start_index; i < end_index; i++) {
1953  if (ranges->at(i) == ranges->at(i + 1) - 1) {
1954  cut = i;
1955  break;
1956  }
1957  }
1958  if (cut == kNoCutIndex) cut = start_index;
1959  CutOutRange(
1960  masm, ranges, start_index, end_index, cut, even_label, odd_label);
1961  DCHECK_GE(end_index - start_index, 2);
1962  GenerateBranches(masm,
1963  ranges,
1964  start_index + 1,
1965  end_index - 1,
1966  min_char,
1967  max_char,
1968  fall_through,
1969  even_label,
1970  odd_label);
1971  return;
1972  }
1973 
1974  // If there are a lot of intervals in the regexp, then we will use tables to
1975  // determine whether the character is inside or outside the character class.
1976  static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1977 
1978  if ((max_char >> kBits) == (min_char >> kBits)) {
1979  EmitUseLookupTable(masm,
1980  ranges,
1981  start_index,
1982  end_index,
1983  min_char,
1984  fall_through,
1985  even_label,
1986  odd_label);
1987  return;
1988  }
1989 
1990  if ((min_char >> kBits) != (first >> kBits)) {
1991  masm->CheckCharacterLT(first, odd_label);
1992  GenerateBranches(masm,
1993  ranges,
1994  start_index + 1,
1995  end_index,
1996  first,
1997  max_char,
1998  fall_through,
1999  odd_label,
2000  even_label);
2001  return;
2002  }
2003 
2004  int new_start_index = 0;
2005  int new_end_index = 0;
2006  int border = 0;
2007 
2008  SplitSearchSpace(ranges,
2009  start_index,
2010  end_index,
2011  &new_start_index,
2012  &new_end_index,
2013  &border);
2014 
2015  Label handle_rest;
2016  Label* above = &handle_rest;
2017  if (border == last + 1) {
2018  // We didn't find any section that started after the limit, so everything
2019  // above the border is one of the terminal labels.
2020  above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2021  DCHECK(new_end_index == end_index - 1);
2022  }
2023 
2024  DCHECK_LE(start_index, new_end_index);
2025  DCHECK_LE(new_start_index, end_index);
2026  DCHECK_LT(start_index, new_start_index);
2027  DCHECK_LT(new_end_index, end_index);
2028  DCHECK(new_end_index + 1 == new_start_index ||
2029  (new_end_index + 2 == new_start_index &&
2030  border == ranges->at(new_end_index + 1)));
2031  DCHECK_LT(min_char, border - 1);
2032  DCHECK_LT(border, max_char);
2033  DCHECK_LT(ranges->at(new_end_index), border);
2034  DCHECK(border < ranges->at(new_start_index) ||
2035  (border == ranges->at(new_start_index) &&
2036  new_start_index == end_index &&
2037  new_end_index == end_index - 1 &&
2038  border == last + 1));
2039  DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2040 
2041  masm->CheckCharacterGT(border - 1, above);
2042  Label dummy;
2043  GenerateBranches(masm,
2044  ranges,
2045  start_index,
2046  new_end_index,
2047  min_char,
2048  border - 1,
2049  &dummy,
2050  even_label,
2051  odd_label);
2052  if (handle_rest.is_linked()) {
2053  masm->Bind(&handle_rest);
2054  bool flip = (new_start_index & 1) != (start_index & 1);
2055  GenerateBranches(masm,
2056  ranges,
2057  new_start_index,
2058  end_index,
2059  border,
2060  max_char,
2061  &dummy,
2062  flip ? odd_label : even_label,
2063  flip ? even_label : odd_label);
2064  }
2065 }
2066 
2067 
2068 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2069  RegExpCharacterClass* cc, bool one_byte,
2070  Label* on_failure, int cp_offset, bool check_offset,
2071  bool preloaded, Zone* zone) {
2072  ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2073  CharacterRange::Canonicalize(ranges);
2074 
2075  int max_char;
2076  if (one_byte) {
2077  max_char = String::kMaxOneByteCharCode;
2078  } else {
2079  max_char = String::kMaxUtf16CodeUnit;
2080  }
2081 
2082  int range_count = ranges->length();
2083 
2084  int last_valid_range = range_count - 1;
2085  while (last_valid_range >= 0) {
2086  CharacterRange& range = ranges->at(last_valid_range);
2087  if (range.from() <= max_char) {
2088  break;
2089  }
2090  last_valid_range--;
2091  }
2092 
2093  if (last_valid_range < 0) {
2094  if (!cc->is_negated()) {
2095  macro_assembler->GoTo(on_failure);
2096  }
2097  if (check_offset) {
2098  macro_assembler->CheckPosition(cp_offset, on_failure);
2099  }
2100  return;
2101  }
2102 
2103  if (last_valid_range == 0 &&
2104  ranges->at(0).IsEverything(max_char)) {
2105  if (cc->is_negated()) {
2106  macro_assembler->GoTo(on_failure);
2107  } else {
2108  // This is a common case hit by non-anchored expressions.
2109  if (check_offset) {
2110  macro_assembler->CheckPosition(cp_offset, on_failure);
2111  }
2112  }
2113  return;
2114  }
2115 
2116  if (!preloaded) {
2117  macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2118  }
2119 
2120  if (cc->is_standard(zone) &&
2121  macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2122  on_failure)) {
2123  return;
2124  }
2125 
2126 
2127  // A new list with ascending entries. Each entry is a code unit
2128  // where there is a boundary between code units that are part of
2129  // the class and code units that are not. Normally we insert an
2130  // entry at zero which goes to the failure label, but if there
2131  // was already one there we fall through for success on that entry.
2132  // Subsequent entries have alternating meaning (success/failure).
2133  ZoneList<int>* range_boundaries =
2134  new(zone) ZoneList<int>(last_valid_range, zone);
2135 
2136  bool zeroth_entry_is_failure = !cc->is_negated();
2137 
2138  for (int i = 0; i <= last_valid_range; i++) {
2139  CharacterRange& range = ranges->at(i);
2140  if (range.from() == 0) {
2141  DCHECK_EQ(i, 0);
2142  zeroth_entry_is_failure = !zeroth_entry_is_failure;
2143  } else {
2144  range_boundaries->Add(range.from(), zone);
2145  }
2146  range_boundaries->Add(range.to() + 1, zone);
2147  }
2148  int end_index = range_boundaries->length() - 1;
2149  if (range_boundaries->at(end_index) > max_char) {
2150  end_index--;
2151  }
2152 
2153  Label fall_through;
2154  GenerateBranches(macro_assembler,
2155  range_boundaries,
2156  0, // start_index.
2157  end_index,
2158  0, // min_char.
2159  max_char,
2160  &fall_through,
2161  zeroth_entry_is_failure ? &fall_through : on_failure,
2162  zeroth_entry_is_failure ? on_failure : &fall_through);
2163  macro_assembler->Bind(&fall_through);
2164 }
2165 
2166 RegExpNode::~RegExpNode() = default;
2167 
2168 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2169  Trace* trace) {
2170  // If we are generating a greedy loop then don't stop and don't reuse code.
2171  if (trace->stop_node() != nullptr) {
2172  return CONTINUE;
2173  }
2174 
2175  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2176  if (trace->is_trivial()) {
2177  if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2178  // If a generic version is already scheduled to be generated or we have
2179  // recursed too deeply then just generate a jump to that code.
2180  macro_assembler->GoTo(&label_);
2181  // This will queue it up for generation of a generic version if it hasn't
2182  // already been queued.
2183  compiler->AddWork(this);
2184  return DONE;
2185  }
2186  // Generate generic version of the node and bind the label for later use.
2187  macro_assembler->Bind(&label_);
2188  return CONTINUE;
2189  }
2190 
2191  // We are being asked to make a non-generic version. Keep track of how many
2192  // non-generic versions we generate so as not to overdo it.
2193  trace_count_++;
2194  if (KeepRecursing(compiler) && compiler->optimize() &&
2195  trace_count_ < kMaxCopiesCodeGenerated) {
2196  return CONTINUE;
2197  }
2198 
2199  // If we get here code has been generated for this node too many times or
2200  // recursion is too deep. Time to switch to a generic version. The code for
2201  // generic versions above can handle deep recursion properly.
2202  bool was_limiting = compiler->limiting_recursion();
2203  compiler->set_limiting_recursion(true);
2204  trace->Flush(compiler, this);
2205  compiler->set_limiting_recursion(was_limiting);
2206  return DONE;
2207 }
2208 
2209 
2210 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2211  return !compiler->limiting_recursion() &&
2212  compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2213 }
2214 
2215 
2216 int ActionNode::EatsAtLeast(int still_to_find,
2217  int budget,
2218  bool not_at_start) {
2219  if (budget <= 0) return 0;
2220  if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2221  return on_success()->EatsAtLeast(still_to_find,
2222  budget - 1,
2223  not_at_start);
2224 }
2225 
2226 
2227 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2228  BoyerMooreLookahead* bm, bool not_at_start) {
2229  if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2230  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2231  }
2232  SaveBMInfo(bm, not_at_start, offset);
2233 }
2234 
2235 
2236 int AssertionNode::EatsAtLeast(int still_to_find,
2237  int budget,
2238  bool not_at_start) {
2239  if (budget <= 0) return 0;
2240  // If we know we are not at the start and we are asked "how many characters
2241  // will you match if you succeed?" then we can answer anything since false
2242  // implies false. So lets just return the max answer (still_to_find) since
2243  // that won't prevent us from preloading a lot of characters for the other
2244  // branches in the node graph.
2245  if (assertion_type() == AT_START && not_at_start) return still_to_find;
2246  return on_success()->EatsAtLeast(still_to_find,
2247  budget - 1,
2248  not_at_start);
2249 }
2250 
2251 
2252 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2253  BoyerMooreLookahead* bm, bool not_at_start) {
2254  // Match the behaviour of EatsAtLeast on this node.
2255  if (assertion_type() == AT_START && not_at_start) return;
2256  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2257  SaveBMInfo(bm, not_at_start, offset);
2258 }
2259 
2260 
2261 int BackReferenceNode::EatsAtLeast(int still_to_find,
2262  int budget,
2263  bool not_at_start) {
2264  if (read_backward()) return 0;
2265  if (budget <= 0) return 0;
2266  return on_success()->EatsAtLeast(still_to_find,
2267  budget - 1,
2268  not_at_start);
2269 }
2270 
2271 
2272 int TextNode::EatsAtLeast(int still_to_find,
2273  int budget,
2274  bool not_at_start) {
2275  if (read_backward()) return 0;
2276  int answer = Length();
2277  if (answer >= still_to_find) return answer;
2278  if (budget <= 0) return answer;
2279  // We are not at start after this node so we set the last argument to 'true'.
2280  return answer + on_success()->EatsAtLeast(still_to_find - answer,
2281  budget - 1,
2282  true);
2283 }
2284 
2285 
2286 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2287  bool not_at_start) {
2288  if (budget <= 0) return 0;
2289  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2290  // afterwards.
2291  RegExpNode* node = alternatives_->at(1).node();
2292  return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2293 }
2294 
2295 
2296 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2297  QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2298  bool not_at_start) {
2299  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2300  // afterwards.
2301  RegExpNode* node = alternatives_->at(1).node();
2302  return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2303 }
2304 
2305 
2306 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2307  int budget,
2308  RegExpNode* ignore_this_node,
2309  bool not_at_start) {
2310  if (budget <= 0) return 0;
2311  int min = 100;
2312  int choice_count = alternatives_->length();
2313  budget = (budget - 1) / choice_count;
2314  for (int i = 0; i < choice_count; i++) {
2315  RegExpNode* node = alternatives_->at(i).node();
2316  if (node == ignore_this_node) continue;
2317  int node_eats_at_least =
2318  node->EatsAtLeast(still_to_find, budget, not_at_start);
2319  if (node_eats_at_least < min) min = node_eats_at_least;
2320  if (min == 0) return 0;
2321  }
2322  return min;
2323 }
2324 
2325 
2326 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2327  int budget,
2328  bool not_at_start) {
2329  return EatsAtLeastHelper(still_to_find,
2330  budget - 1,
2331  loop_node_,
2332  not_at_start);
2333 }
2334 
2335 
2336 int ChoiceNode::EatsAtLeast(int still_to_find,
2337  int budget,
2338  bool not_at_start) {
2339  return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
2340 }
2341 
2342 
2343 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
2344 static inline uint32_t SmearBitsRight(uint32_t v) {
2345  v |= v >> 1;
2346  v |= v >> 2;
2347  v |= v >> 4;
2348  v |= v >> 8;
2349  v |= v >> 16;
2350  return v;
2351 }
2352 
2353 
2354 bool QuickCheckDetails::Rationalize(bool asc) {
2355  bool found_useful_op = false;
2356  uint32_t char_mask;
2357  if (asc) {
2358  char_mask = String::kMaxOneByteCharCode;
2359  } else {
2360  char_mask = String::kMaxUtf16CodeUnit;
2361  }
2362  mask_ = 0;
2363  value_ = 0;
2364  int char_shift = 0;
2365  for (int i = 0; i < characters_; i++) {
2366  Position* pos = &positions_[i];
2367  if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2368  found_useful_op = true;
2369  }
2370  mask_ |= (pos->mask & char_mask) << char_shift;
2371  value_ |= (pos->value & char_mask) << char_shift;
2372  char_shift += asc ? 8 : 16;
2373  }
2374  return found_useful_op;
2375 }
2376 
2377 
2378 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2379  Trace* bounds_check_trace,
2380  Trace* trace,
2381  bool preload_has_checked_bounds,
2382  Label* on_possible_success,
2383  QuickCheckDetails* details,
2384  bool fall_through_on_failure) {
2385  if (details->characters() == 0) return false;
2386  GetQuickCheckDetails(
2387  details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2388  if (details->cannot_match()) return false;
2389  if (!details->Rationalize(compiler->one_byte())) return false;
2390  DCHECK(details->characters() == 1 ||
2391  compiler->macro_assembler()->CanReadUnaligned());
2392  uint32_t mask = details->mask();
2393  uint32_t value = details->value();
2394 
2395  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2396 
2397  if (trace->characters_preloaded() != details->characters()) {
2398  DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2399  // We are attempting to preload the minimum number of characters
2400  // any choice would eat, so if the bounds check fails, then none of the
2401  // choices can succeed, so we can just immediately backtrack, rather
2402  // than go to the next choice.
2403  assembler->LoadCurrentCharacter(trace->cp_offset(),
2404  bounds_check_trace->backtrack(),
2405  !preload_has_checked_bounds,
2406  details->characters());
2407  }
2408 
2409 
2410  bool need_mask = true;
2411 
2412  if (details->characters() == 1) {
2413  // If number of characters preloaded is 1 then we used a byte or 16 bit
2414  // load so the value is already masked down.
2415  uint32_t char_mask;
2416  if (compiler->one_byte()) {
2417  char_mask = String::kMaxOneByteCharCode;
2418  } else {
2419  char_mask = String::kMaxUtf16CodeUnit;
2420  }
2421  if ((mask & char_mask) == char_mask) need_mask = false;
2422  mask &= char_mask;
2423  } else {
2424  // For 2-character preloads in one-byte mode or 1-character preloads in
2425  // two-byte mode we also use a 16 bit load with zero extend.
2426  static const uint32_t kTwoByteMask = 0xFFFF;
2427  static const uint32_t kFourByteMask = 0xFFFFFFFF;
2428  if (details->characters() == 2 && compiler->one_byte()) {
2429  if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2430  } else if (details->characters() == 1 && !compiler->one_byte()) {
2431  if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2432  } else {
2433  if (mask == kFourByteMask) need_mask = false;
2434  }
2435  }
2436 
2437  if (fall_through_on_failure) {
2438  if (need_mask) {
2439  assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2440  } else {
2441  assembler->CheckCharacter(value, on_possible_success);
2442  }
2443  } else {
2444  if (need_mask) {
2445  assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2446  } else {
2447  assembler->CheckNotCharacter(value, trace->backtrack());
2448  }
2449  }
2450  return true;
2451 }
2452 
2453 
2454 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2455 // super-class in the .h file).
2456 //
2457 // We iterate along the text object, building up for each character a
2458 // mask and value that can be used to test for a quick failure to match.
2459 // The masks and values for the positions will be combined into a single
2460 // machine word for the current character width in order to be used in
2461 // generating a quick check.
2462 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2463  RegExpCompiler* compiler,
2464  int characters_filled_in,
2465  bool not_at_start) {
2466  // Do not collect any quick check details if the text node reads backward,
2467  // since it reads in the opposite direction than we use for quick checks.
2468  if (read_backward()) return;
2469  Isolate* isolate = compiler->macro_assembler()->isolate();
2470  DCHECK(characters_filled_in < details->characters());
2471  int characters = details->characters();
2472  int char_mask;
2473  if (compiler->one_byte()) {
2474  char_mask = String::kMaxOneByteCharCode;
2475  } else {
2476  char_mask = String::kMaxUtf16CodeUnit;
2477  }
2478  for (int k = 0; k < elements()->length(); k++) {
2479  TextElement elm = elements()->at(k);
2480  if (elm.text_type() == TextElement::ATOM) {
2481  Vector<const uc16> quarks = elm.atom()->data();
2482  for (int i = 0; i < characters && i < quarks.length(); i++) {
2483  QuickCheckDetails::Position* pos =
2484  details->positions(characters_filled_in);
2485  uc16 c = quarks[i];
2486  if (elm.atom()->ignore_case()) {
2487  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2488  int length = GetCaseIndependentLetters(isolate, c,
2489  compiler->one_byte(), chars);
2490  if (length == 0) {
2491  // This can happen because all case variants are non-Latin1, but we
2492  // know the input is Latin1.
2493  details->set_cannot_match();
2494  pos->determines_perfectly = false;
2495  return;
2496  }
2497  if (length == 1) {
2498  // This letter has no case equivalents, so it's nice and simple
2499  // and the mask-compare will determine definitely whether we have
2500  // a match at this character position.
2501  pos->mask = char_mask;
2502  pos->value = c;
2503  pos->determines_perfectly = true;
2504  } else {
2505  uint32_t common_bits = char_mask;
2506  uint32_t bits = chars[0];
2507  for (int j = 1; j < length; j++) {
2508  uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2509  common_bits ^= differing_bits;
2510  bits &= common_bits;
2511  }
2512  // If length is 2 and common bits has only one zero in it then
2513  // our mask and compare instruction will determine definitely
2514  // whether we have a match at this character position. Otherwise
2515  // it can only be an approximate check.
2516  uint32_t one_zero = (common_bits | ~char_mask);
2517  if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2518  pos->determines_perfectly = true;
2519  }
2520  pos->mask = common_bits;
2521  pos->value = bits;
2522  }
2523  } else {
2524  // Don't ignore case. Nice simple case where the mask-compare will
2525  // determine definitely whether we have a match at this character
2526  // position.
2527  if (c > char_mask) {
2528  details->set_cannot_match();
2529  pos->determines_perfectly = false;
2530  return;
2531  }
2532  pos->mask = char_mask;
2533  pos->value = c;
2534  pos->determines_perfectly = true;
2535  }
2536  characters_filled_in++;
2537  DCHECK(characters_filled_in <= details->characters());
2538  if (characters_filled_in == details->characters()) {
2539  return;
2540  }
2541  }
2542  } else {
2543  QuickCheckDetails::Position* pos =
2544  details->positions(characters_filled_in);
2545  RegExpCharacterClass* tree = elm.char_class();
2546  ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2547  DCHECK(!ranges->is_empty());
2548  if (tree->is_negated()) {
2549  // A quick check uses multi-character mask and compare. There is no
2550  // useful way to incorporate a negative char class into this scheme
2551  // so we just conservatively create a mask and value that will always
2552  // succeed.
2553  pos->mask = 0;
2554  pos->value = 0;
2555  } else {
2556  int first_range = 0;
2557  while (ranges->at(first_range).from() > char_mask) {
2558  first_range++;
2559  if (first_range == ranges->length()) {
2560  details->set_cannot_match();
2561  pos->determines_perfectly = false;
2562  return;
2563  }
2564  }
2565  CharacterRange range = ranges->at(first_range);
2566  uc16 from = range.from();
2567  uc16 to = range.to();
2568  if (to > char_mask) {
2569  to = char_mask;
2570  }
2571  uint32_t differing_bits = (from ^ to);
2572  // A mask and compare is only perfect if the differing bits form a
2573  // number like 00011111 with one single block of trailing 1s.
2574  if ((differing_bits & (differing_bits + 1)) == 0 &&
2575  from + differing_bits == to) {
2576  pos->determines_perfectly = true;
2577  }
2578  uint32_t common_bits = ~SmearBitsRight(differing_bits);
2579  uint32_t bits = (from & common_bits);
2580  for (int i = first_range + 1; i < ranges->length(); i++) {
2581  CharacterRange range = ranges->at(i);
2582  uc16 from = range.from();
2583  uc16 to = range.to();
2584  if (from > char_mask) continue;
2585  if (to > char_mask) to = char_mask;
2586  // Here we are combining more ranges into the mask and compare
2587  // value. With each new range the mask becomes more sparse and
2588  // so the chances of a false positive rise. A character class
2589  // with multiple ranges is assumed never to be equivalent to a
2590  // mask and compare operation.
2591  pos->determines_perfectly = false;
2592  uint32_t new_common_bits = (from ^ to);
2593  new_common_bits = ~SmearBitsRight(new_common_bits);
2594  common_bits &= new_common_bits;
2595  bits &= new_common_bits;
2596  uint32_t differing_bits = (from & common_bits) ^ bits;
2597  common_bits ^= differing_bits;
2598  bits &= common_bits;
2599  }
2600  pos->mask = common_bits;
2601  pos->value = bits;
2602  }
2603  characters_filled_in++;
2604  DCHECK(characters_filled_in <= details->characters());
2605  if (characters_filled_in == details->characters()) {
2606  return;
2607  }
2608  }
2609  }
2610  DCHECK(characters_filled_in != details->characters());
2611  if (!details->cannot_match()) {
2612  on_success()-> GetQuickCheckDetails(details,
2613  compiler,
2614  characters_filled_in,
2615  true);
2616  }
2617 }
2618 
2619 
2620 void QuickCheckDetails::Clear() {
2621  for (int i = 0; i < characters_; i++) {
2622  positions_[i].mask = 0;
2623  positions_[i].value = 0;
2624  positions_[i].determines_perfectly = false;
2625  }
2626  characters_ = 0;
2627 }
2628 
2629 
2630 void QuickCheckDetails::Advance(int by, bool one_byte) {
2631  if (by >= characters_ || by < 0) {
2632  DCHECK_IMPLIES(by < 0, characters_ == 0);
2633  Clear();
2634  return;
2635  }
2636  DCHECK_LE(characters_ - by, 4);
2637  DCHECK_LE(characters_, 4);
2638  for (int i = 0; i < characters_ - by; i++) {
2639  positions_[i] = positions_[by + i];
2640  }
2641  for (int i = characters_ - by; i < characters_; i++) {
2642  positions_[i].mask = 0;
2643  positions_[i].value = 0;
2644  positions_[i].determines_perfectly = false;
2645  }
2646  characters_ -= by;
2647  // We could change mask_ and value_ here but we would never advance unless
2648  // they had already been used in a check and they won't be used again because
2649  // it would gain us nothing. So there's no point.
2650 }
2651 
2652 
2653 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2654  DCHECK(characters_ == other->characters_);
2655  if (other->cannot_match_) {
2656  return;
2657  }
2658  if (cannot_match_) {
2659  *this = *other;
2660  return;
2661  }
2662  for (int i = from_index; i < characters_; i++) {
2663  QuickCheckDetails::Position* pos = positions(i);
2664  QuickCheckDetails::Position* other_pos = other->positions(i);
2665  if (pos->mask != other_pos->mask ||
2666  pos->value != other_pos->value ||
2667  !other_pos->determines_perfectly) {
2668  // Our mask-compare operation will be approximate unless we have the
2669  // exact same operation on both sides of the alternation.
2670  pos->determines_perfectly = false;
2671  }
2672  pos->mask &= other_pos->mask;
2673  pos->value &= pos->mask;
2674  other_pos->value &= pos->mask;
2675  uc16 differing_bits = (pos->value ^ other_pos->value);
2676  pos->mask &= ~differing_bits;
2677  pos->value &= pos->mask;
2678  }
2679 }
2680 
2681 
2683  public:
2684  explicit VisitMarker(NodeInfo* info) : info_(info) {
2685  DCHECK(!info->visited);
2686  info->visited = true;
2687  }
2688  ~VisitMarker() {
2689  info_->visited = false;
2690  }
2691  private:
2692  NodeInfo* info_;
2693 };
2694 
2695 RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
2696  if (info()->replacement_calculated) return replacement();
2697  if (depth < 0) return this;
2698  DCHECK(!info()->visited);
2699  VisitMarker marker(info());
2700  return FilterSuccessor(depth - 1);
2701 }
2702 
2703 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
2704  RegExpNode* next = on_success_->FilterOneByte(depth - 1);
2705  if (next == nullptr) return set_replacement(nullptr);
2706  on_success_ = next;
2707  return set_replacement(this);
2708 }
2709 
2710 // We need to check for the following characters: 0x39C 0x3BC 0x178.
2711 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2712  // TODO(dcarney): this could be a lot more efficient.
2713  return range.Contains(0x039C) || range.Contains(0x03BC) ||
2714  range.Contains(0x0178);
2715 }
2716 
2717 
2718 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2719  for (int i = 0; i < ranges->length(); i++) {
2720  // TODO(dcarney): this could be a lot more efficient.
2721  if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2722  }
2723  return false;
2724 }
2725 
2726 RegExpNode* TextNode::FilterOneByte(int depth) {
2727  if (info()->replacement_calculated) return replacement();
2728  if (depth < 0) return this;
2729  DCHECK(!info()->visited);
2730  VisitMarker marker(info());
2731  int element_count = elements()->length();
2732  for (int i = 0; i < element_count; i++) {
2733  TextElement elm = elements()->at(i);
2734  if (elm.text_type() == TextElement::ATOM) {
2735  Vector<const uc16> quarks = elm.atom()->data();
2736  for (int j = 0; j < quarks.length(); j++) {
2737  uint16_t c = quarks[j];
2738  if (elm.atom()->ignore_case()) {
2739  c = unibrow::Latin1::TryConvertToLatin1(c);
2740  }
2741  if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
2742  // Replace quark in case we converted to Latin-1.
2743  uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
2744  writable_quarks[j] = c;
2745  }
2746  } else {
2747  DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2748  RegExpCharacterClass* cc = elm.char_class();
2749  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2750  CharacterRange::Canonicalize(ranges);
2751  // Now they are in order so we only need to look at the first.
2752  int range_count = ranges->length();
2753  if (cc->is_negated()) {
2754  if (range_count != 0 &&
2755  ranges->at(0).from() == 0 &&
2756  ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2757  // This will be handled in a later filter.
2758  if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2759  continue;
2760  return set_replacement(nullptr);
2761  }
2762  } else {
2763  if (range_count == 0 ||
2764  ranges->at(0).from() > String::kMaxOneByteCharCode) {
2765  // This will be handled in a later filter.
2766  if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2767  continue;
2768  return set_replacement(nullptr);
2769  }
2770  }
2771  }
2772  }
2773  return FilterSuccessor(depth - 1);
2774 }
2775 
2776 RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
2777  if (info()->replacement_calculated) return replacement();
2778  if (depth < 0) return this;
2779  if (info()->visited) return this;
2780  {
2781  VisitMarker marker(info());
2782 
2783  RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2784  // If we can't continue after the loop then there is no sense in doing the
2785  // loop.
2786  if (continue_replacement == nullptr) return set_replacement(nullptr);
2787  }
2788 
2789  return ChoiceNode::FilterOneByte(depth - 1);
2790 }
2791 
2792 RegExpNode* ChoiceNode::FilterOneByte(int depth) {
2793  if (info()->replacement_calculated) return replacement();
2794  if (depth < 0) return this;
2795  if (info()->visited) return this;
2796  VisitMarker marker(info());
2797  int choice_count = alternatives_->length();
2798 
2799  for (int i = 0; i < choice_count; i++) {
2800  GuardedAlternative alternative = alternatives_->at(i);
2801  if (alternative.guards() != nullptr &&
2802  alternative.guards()->length() != 0) {
2803  set_replacement(this);
2804  return this;
2805  }
2806  }
2807 
2808  int surviving = 0;
2809  RegExpNode* survivor = nullptr;
2810  for (int i = 0; i < choice_count; i++) {
2811  GuardedAlternative alternative = alternatives_->at(i);
2812  RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2813  DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2814  if (replacement != nullptr) {
2815  alternatives_->at(i).set_node(replacement);
2816  surviving++;
2817  survivor = replacement;
2818  }
2819  }
2820  if (surviving < 2) return set_replacement(survivor);
2821 
2822  set_replacement(this);
2823  if (surviving == choice_count) {
2824  return this;
2825  }
2826  // Only some of the nodes survived the filtering. We need to rebuild the
2827  // alternatives list.
2828  ZoneList<GuardedAlternative>* new_alternatives =
2829  new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2830  for (int i = 0; i < choice_count; i++) {
2831  RegExpNode* replacement =
2832  alternatives_->at(i).node()->FilterOneByte(depth - 1);
2833  if (replacement != nullptr) {
2834  alternatives_->at(i).set_node(replacement);
2835  new_alternatives->Add(alternatives_->at(i), zone());
2836  }
2837  }
2838  alternatives_ = new_alternatives;
2839  return this;
2840 }
2841 
2842 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
2843  if (info()->replacement_calculated) return replacement();
2844  if (depth < 0) return this;
2845  if (info()->visited) return this;
2846  VisitMarker marker(info());
2847  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2848  // afterwards.
2849  RegExpNode* node = alternatives_->at(1).node();
2850  RegExpNode* replacement = node->FilterOneByte(depth - 1);
2851  if (replacement == nullptr) return set_replacement(nullptr);
2852  alternatives_->at(1).set_node(replacement);
2853 
2854  RegExpNode* neg_node = alternatives_->at(0).node();
2855  RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2856  // If the negative lookahead is always going to fail then
2857  // we don't need to check it.
2858  if (neg_replacement == nullptr) return set_replacement(replacement);
2859  alternatives_->at(0).set_node(neg_replacement);
2860  return set_replacement(this);
2861 }
2862 
2863 
2864 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2865  RegExpCompiler* compiler,
2866  int characters_filled_in,
2867  bool not_at_start) {
2868  if (body_can_be_zero_length_ || info()->visited) return;
2869  VisitMarker marker(info());
2870  return ChoiceNode::GetQuickCheckDetails(details,
2871  compiler,
2872  characters_filled_in,
2873  not_at_start);
2874 }
2875 
2876 
2877 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2878  BoyerMooreLookahead* bm, bool not_at_start) {
2879  if (body_can_be_zero_length_ || budget <= 0) {
2880  bm->SetRest(offset);
2881  SaveBMInfo(bm, not_at_start, offset);
2882  return;
2883  }
2884  ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2885  SaveBMInfo(bm, not_at_start, offset);
2886 }
2887 
2888 
2889 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2890  RegExpCompiler* compiler,
2891  int characters_filled_in,
2892  bool not_at_start) {
2893  not_at_start = (not_at_start || not_at_start_);
2894  int choice_count = alternatives_->length();
2895  DCHECK_LT(0, choice_count);
2896  alternatives_->at(0).node()->GetQuickCheckDetails(details,
2897  compiler,
2898  characters_filled_in,
2899  not_at_start);
2900  for (int i = 1; i < choice_count; i++) {
2901  QuickCheckDetails new_details(details->characters());
2902  RegExpNode* node = alternatives_->at(i).node();
2903  node->GetQuickCheckDetails(&new_details, compiler,
2904  characters_filled_in,
2905  not_at_start);
2906  // Here we merge the quick match details of the two branches.
2907  details->Merge(&new_details, characters_filled_in);
2908  }
2909 }
2910 
2911 
2912 // Check for [0-9A-Z_a-z].
2913 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2914  Label* word,
2915  Label* non_word,
2916  bool fall_through_on_word) {
2917  if (assembler->CheckSpecialCharacterClass(
2918  fall_through_on_word ? 'w' : 'W',
2919  fall_through_on_word ? non_word : word)) {
2920  // Optimized implementation available.
2921  return;
2922  }
2923  assembler->CheckCharacterGT('z', non_word);
2924  assembler->CheckCharacterLT('0', non_word);
2925  assembler->CheckCharacterGT('a' - 1, word);
2926  assembler->CheckCharacterLT('9' + 1, word);
2927  assembler->CheckCharacterLT('A', non_word);
2928  assembler->CheckCharacterLT('Z' + 1, word);
2929  if (fall_through_on_word) {
2930  assembler->CheckNotCharacter('_', non_word);
2931  } else {
2932  assembler->CheckCharacter('_', word);
2933  }
2934 }
2935 
2936 
2937 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2938 // that matches newline or the start of input).
2939 static void EmitHat(RegExpCompiler* compiler,
2940  RegExpNode* on_success,
2941  Trace* trace) {
2942  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2943  // We will be loading the previous character into the current character
2944  // register.
2945  Trace new_trace(*trace);
2946  new_trace.InvalidateCurrentCharacter();
2947 
2948  Label ok;
2949  if (new_trace.cp_offset() == 0) {
2950  // The start of input counts as a newline in this context, so skip to
2951  // ok if we are at the start.
2952  assembler->CheckAtStart(&ok);
2953  }
2954  // We already checked that we are not at the start of input so it must be
2955  // OK to load the previous character.
2956  assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2957  new_trace.backtrack(),
2958  false);
2959  if (!assembler->CheckSpecialCharacterClass('n',
2960  new_trace.backtrack())) {
2961  // Newline means \n, \r, 0x2028 or 0x2029.
2962  if (!compiler->one_byte()) {
2963  assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2964  }
2965  assembler->CheckCharacter('\n', &ok);
2966  assembler->CheckNotCharacter('\r', new_trace.backtrack());
2967  }
2968  assembler->Bind(&ok);
2969  on_success->Emit(compiler, &new_trace);
2970 }
2971 
2972 
2973 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2974 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2975  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2976  Isolate* isolate = assembler->isolate();
2977  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2978  bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2979  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2980  if (lookahead == nullptr) {
2981  int eats_at_least =
2982  Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
2983  kRecursionBudget,
2984  not_at_start));
2985  if (eats_at_least >= 1) {
2986  BoyerMooreLookahead* bm =
2987  new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
2988  FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2989  if (bm->at(0)->is_non_word())
2990  next_is_word_character = Trace::FALSE_VALUE;
2991  if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2992  }
2993  } else {
2994  if (lookahead->at(0)->is_non_word())
2995  next_is_word_character = Trace::FALSE_VALUE;
2996  if (lookahead->at(0)->is_word())
2997  next_is_word_character = Trace::TRUE_VALUE;
2998  }
2999  bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3000  if (next_is_word_character == Trace::UNKNOWN) {
3001  Label before_non_word;
3002  Label before_word;
3003  if (trace->characters_preloaded() != 1) {
3004  assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3005  }
3006  // Fall through on non-word.
3007  EmitWordCheck(assembler, &before_word, &before_non_word, false);
3008  // Next character is not a word character.
3009  assembler->Bind(&before_non_word);
3010  Label ok;
3011  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3012  assembler->GoTo(&ok);
3013 
3014  assembler->Bind(&before_word);
3015  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3016  assembler->Bind(&ok);
3017  } else if (next_is_word_character == Trace::TRUE_VALUE) {
3018  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3019  } else {
3020  DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3021  BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3022  }
3023 }
3024 
3025 
3026 void AssertionNode::BacktrackIfPrevious(
3027  RegExpCompiler* compiler,
3028  Trace* trace,
3029  AssertionNode::IfPrevious backtrack_if_previous) {
3030  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3031  Trace new_trace(*trace);
3032  new_trace.InvalidateCurrentCharacter();
3033 
3034  Label fall_through, dummy;
3035 
3036  Label* non_word = backtrack_if_previous == kIsNonWord ?
3037  new_trace.backtrack() :
3038  &fall_through;
3039  Label* word = backtrack_if_previous == kIsNonWord ?
3040  &fall_through :
3041  new_trace.backtrack();
3042 
3043  if (new_trace.cp_offset() == 0) {
3044  // The start of input counts as a non-word character, so the question is
3045  // decided if we are at the start.
3046  assembler->CheckAtStart(non_word);
3047  }
3048  // We already checked that we are not at the start of input so it must be
3049  // OK to load the previous character.
3050  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3051  EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3052 
3053  assembler->Bind(&fall_through);
3054  on_success()->Emit(compiler, &new_trace);
3055 }
3056 
3057 
3058 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3059  RegExpCompiler* compiler,
3060  int filled_in,
3061  bool not_at_start) {
3062  if (assertion_type_ == AT_START && not_at_start) {
3063  details->set_cannot_match();
3064  return;
3065  }
3066  return on_success()->GetQuickCheckDetails(details,
3067  compiler,
3068  filled_in,
3069  not_at_start);
3070 }
3071 
3072 
3073 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3074  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3075  switch (assertion_type_) {
3076  case AT_END: {
3077  Label ok;
3078  assembler->CheckPosition(trace->cp_offset(), &ok);
3079  assembler->GoTo(trace->backtrack());
3080  assembler->Bind(&ok);
3081  break;
3082  }
3083  case AT_START: {
3084  if (trace->at_start() == Trace::FALSE_VALUE) {
3085  assembler->GoTo(trace->backtrack());
3086  return;
3087  }
3088  if (trace->at_start() == Trace::UNKNOWN) {
3089  assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3090  Trace at_start_trace = *trace;
3091  at_start_trace.set_at_start(Trace::TRUE_VALUE);
3092  on_success()->Emit(compiler, &at_start_trace);
3093  return;
3094  }
3095  }
3096  break;
3097  case AFTER_NEWLINE:
3098  EmitHat(compiler, on_success(), trace);
3099  return;
3100  case AT_BOUNDARY:
3101  case AT_NON_BOUNDARY: {
3102  EmitBoundaryCheck(compiler, trace);
3103  return;
3104  }
3105  }
3106  on_success()->Emit(compiler, trace);
3107 }
3108 
3109 
3110 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3111  if (quick_check == nullptr) return false;
3112  if (offset >= quick_check->characters()) return false;
3113  return quick_check->positions(offset)->determines_perfectly;
3114 }
3115 
3116 
3117 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3118  if (index > *checked_up_to) {
3119  *checked_up_to = index;
3120  }
3121 }
3122 
3123 
3124 // We call this repeatedly to generate code for each pass over the text node.
3125 // The passes are in increasing order of difficulty because we hope one
3126 // of the first passes will fail in which case we are saved the work of the
3127 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3128 // we will check the '%' in the first pass, the case independent 'a' in the
3129 // second pass and the character class in the last pass.
3130 //
3131 // The passes are done from right to left, so for example to test for /bar/
3132 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3133 // and then a 'b' with offset 0. This means we can avoid the end-of-input
3134 // bounds check most of the time. In the example we only need to check for
3135 // end-of-input when loading the putative 'r'.
3136 //
3137 // A slight complication involves the fact that the first character may already
3138 // be fetched into a register by the previous node. In this case we want to
3139 // do the test for that character first. We do this in separate passes. The
3140 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3141 // pass has been performed then subsequent passes will have true in
3142 // first_element_checked to indicate that that character does not need to be
3143 // checked again.
3144 //
3145 // In addition to all this we are passed a Trace, which can
3146 // contain an AlternativeGeneration object. In this AlternativeGeneration
3147 // object we can see details of any quick check that was already passed in
3148 // order to get to the code we are now generating. The quick check can involve
3149 // loading characters, which means we do not need to recheck the bounds
3150 // up to the limit the quick check already checked. In addition the quick
3151 // check can have involved a mask and compare operation which may simplify
3152 // or obviate the need for further checks at some character positions.
3153 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3154  TextEmitPassType pass,
3155  bool preloaded,
3156  Trace* trace,
3157  bool first_element_checked,
3158  int* checked_up_to) {
3159  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3160  Isolate* isolate = assembler->isolate();
3161  bool one_byte = compiler->one_byte();
3162  Label* backtrack = trace->backtrack();
3163  QuickCheckDetails* quick_check = trace->quick_check_performed();
3164  int element_count = elements()->length();
3165  int backward_offset = read_backward() ? -Length() : 0;
3166  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3167  TextElement elm = elements()->at(i);
3168  int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3169  if (elm.text_type() == TextElement::ATOM) {
3170  if (SkipPass(pass, elm.atom()->ignore_case())) continue;
3171  Vector<const uc16> quarks = elm.atom()->data();
3172  for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3173  if (first_element_checked && i == 0 && j == 0) continue;
3174  if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3175  EmitCharacterFunction* emit_function = nullptr;
3176  uc16 quark = quarks[j];
3177  if (elm.atom()->ignore_case()) {
3178  // Everywhere else we assume that a non-Latin-1 character cannot match
3179  // a Latin-1 character. Avoid the cases where this is assumption is
3180  // invalid by using the Latin1 equivalent instead.
3181  quark = unibrow::Latin1::TryConvertToLatin1(quark);
3182  }
3183  switch (pass) {
3184  case NON_LATIN1_MATCH:
3185  DCHECK(one_byte);
3186  if (quark > String::kMaxOneByteCharCode) {
3187  assembler->GoTo(backtrack);
3188  return;
3189  }
3190  break;
3191  case NON_LETTER_CHARACTER_MATCH:
3192  emit_function = &EmitAtomNonLetter;
3193  break;
3194  case SIMPLE_CHARACTER_MATCH:
3195  emit_function = &EmitSimpleCharacter;
3196  break;
3197  case CASE_CHARACTER_MATCH:
3198  emit_function = &EmitAtomLetter;
3199  break;
3200  default:
3201  break;
3202  }
3203  if (emit_function != nullptr) {
3204  bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3205  bool bound_checked =
3206  emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
3207  bounds_check, preloaded);
3208  if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3209  }
3210  }
3211  } else {
3212  DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3213  if (pass == CHARACTER_CLASS_MATCH) {
3214  if (first_element_checked && i == 0) continue;
3215  if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3216  RegExpCharacterClass* cc = elm.char_class();
3217  bool bounds_check = *checked_up_to < cp_offset || read_backward();
3218  EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3219  bounds_check, preloaded, zone());
3220  UpdateBoundsCheck(cp_offset, checked_up_to);
3221  }
3222  }
3223  }
3224 }
3225 
3226 
3227 int TextNode::Length() {
3228  TextElement elm = elements()->last();
3229  DCHECK_LE(0, elm.cp_offset());
3230  return elm.cp_offset() + elm.length();
3231 }
3232 
3233 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
3234  if (ignore_case) {
3235  return pass == SIMPLE_CHARACTER_MATCH;
3236  } else {
3237  return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3238  }
3239 }
3240 
3241 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3242  ZoneList<CharacterRange>* ranges,
3243  bool read_backward,
3244  RegExpNode* on_success,
3245  JSRegExp::Flags flags) {
3246  DCHECK_NOT_NULL(ranges);
3247  ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3248  elms->Add(TextElement::CharClass(
3249  new (zone) RegExpCharacterClass(zone, ranges, flags)),
3250  zone);
3251  return new (zone) TextNode(elms, read_backward, on_success);
3252 }
3253 
3254 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3255  CharacterRange trail,
3256  bool read_backward,
3257  RegExpNode* on_success,
3258  JSRegExp::Flags flags) {
3259  ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3260  ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3261  ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3262  elms->Add(TextElement::CharClass(
3263  new (zone) RegExpCharacterClass(zone, lead_ranges, flags)),
3264  zone);
3265  elms->Add(TextElement::CharClass(
3266  new (zone) RegExpCharacterClass(zone, trail_ranges, flags)),
3267  zone);
3268  return new (zone) TextNode(elms, read_backward, on_success);
3269 }
3270 
3271 
3272 // This generates the code to match a text node. A text node can contain
3273 // straight character sequences (possibly to be matched in a case-independent
3274 // way) and character classes. For efficiency we do not do this in a single
3275 // pass from left to right. Instead we pass over the text node several times,
3276 // emitting code for some character positions every time. See the comment on
3277 // TextEmitPass for details.
3278 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3279  LimitResult limit_result = LimitVersions(compiler, trace);
3280  if (limit_result == DONE) return;
3281  DCHECK(limit_result == CONTINUE);
3282 
3283  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3284  compiler->SetRegExpTooBig();
3285  return;
3286  }
3287 
3288  if (compiler->one_byte()) {
3289  int dummy = 0;
3290  TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3291  }
3292 
3293  bool first_elt_done = false;
3294  int bound_checked_to = trace->cp_offset() - 1;
3295  bound_checked_to += trace->bound_checked_up_to();
3296 
3297  // If a character is preloaded into the current character register then
3298  // check that now.
3299  if (trace->characters_preloaded() == 1) {
3300  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3301  TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
3302  false, &bound_checked_to);
3303  }
3304  first_elt_done = true;
3305  }
3306 
3307  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3308  TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
3309  first_elt_done, &bound_checked_to);
3310  }
3311 
3312  Trace successor_trace(*trace);
3313  // If we advance backward, we may end up at the start.
3314  successor_trace.AdvanceCurrentPositionInTrace(
3315  read_backward() ? -Length() : Length(), compiler);
3316  successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3317  : Trace::FALSE_VALUE);
3318  RecursionCheck rc(compiler);
3319  on_success()->Emit(compiler, &successor_trace);
3320 }
3321 
3322 
3323 void Trace::InvalidateCurrentCharacter() {
3324  characters_preloaded_ = 0;
3325 }
3326 
3327 
3328 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3329  // We don't have an instruction for shifting the current character register
3330  // down or for using a shifted value for anything so lets just forget that
3331  // we preloaded any characters into it.
3332  characters_preloaded_ = 0;
3333  // Adjust the offsets of the quick check performed information. This
3334  // information is used to find out what we already determined about the
3335  // characters by means of mask and compare.
3336  quick_check_performed_.Advance(by, compiler->one_byte());
3337  cp_offset_ += by;
3338  if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3339  compiler->SetRegExpTooBig();
3340  cp_offset_ = 0;
3341  }
3342  bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3343 }
3344 
3345 
3346 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3347  int element_count = elements()->length();
3348  for (int i = 0; i < element_count; i++) {
3349  TextElement elm = elements()->at(i);
3350  if (elm.text_type() == TextElement::CHAR_CLASS) {
3351  RegExpCharacterClass* cc = elm.char_class();
3352 #ifdef V8_INTL_SUPPORT
3353  bool case_equivalents_already_added =
3354  NeedsUnicodeCaseEquivalents(cc->flags());
3355 #else
3356  bool case_equivalents_already_added = false;
3357 #endif
3358  if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
3359  // None of the standard character classes is different in the case
3360  // independent case and it slows us down if we don't know that.
3361  if (cc->is_standard(zone())) continue;
3362  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3363  CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
3364  is_one_byte);
3365  }
3366  }
3367  }
3368 }
3369 
3370 
3371 int TextNode::GreedyLoopTextLength() { return Length(); }
3372 
3373 
3374 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3375  RegExpCompiler* compiler) {
3376  if (read_backward()) return nullptr;
3377  if (elements()->length() != 1) return nullptr;
3378  TextElement elm = elements()->at(0);
3379  if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
3380  RegExpCharacterClass* node = elm.char_class();
3381  ZoneList<CharacterRange>* ranges = node->ranges(zone());
3382  CharacterRange::Canonicalize(ranges);
3383  if (node->is_negated()) {
3384  return ranges->length() == 0 ? on_success() : nullptr;
3385  }
3386  if (ranges->length() != 1) return nullptr;
3387  uint32_t max_char;
3388  if (compiler->one_byte()) {
3389  max_char = String::kMaxOneByteCharCode;
3390  } else {
3391  max_char = String::kMaxUtf16CodeUnit;
3392  }
3393  return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
3394 }
3395 
3396 
3397 // Finds the fixed match length of a sequence of nodes that goes from
3398 // this alternative and back to this choice node. If there are variable
3399 // length nodes or other complications in the way then return a sentinel
3400 // value indicating that a greedy loop cannot be constructed.
3401 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3402  GuardedAlternative* alternative) {
3403  int length = 0;
3404  RegExpNode* node = alternative->node();
3405  // Later we will generate code for all these text nodes using recursion
3406  // so we have to limit the max number.
3407  int recursion_depth = 0;
3408  while (node != this) {
3409  if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3410  return kNodeIsTooComplexForGreedyLoops;
3411  }
3412  int node_length = node->GreedyLoopTextLength();
3413  if (node_length == kNodeIsTooComplexForGreedyLoops) {
3414  return kNodeIsTooComplexForGreedyLoops;
3415  }
3416  length += node_length;
3417  SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3418  node = seq_node->on_success();
3419  }
3420  return read_backward() ? -length : length;
3421 }
3422 
3423 
3424 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3425  DCHECK_NULL(loop_node_);
3426  AddAlternative(alt);
3427  loop_node_ = alt.node();
3428 }
3429 
3430 
3431 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3432  DCHECK_NULL(continue_node_);
3433  AddAlternative(alt);
3434  continue_node_ = alt.node();
3435 }
3436 
3437 
3438 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3439  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3440  if (trace->stop_node() == this) {
3441  // Back edge of greedy optimized loop node graph.
3442  int text_length =
3443  GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3444  DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
3445  // Update the counter-based backtracking info on the stack. This is an
3446  // optimization for greedy loops (see below).
3447  DCHECK(trace->cp_offset() == text_length);
3448  macro_assembler->AdvanceCurrentPosition(text_length);
3449  macro_assembler->GoTo(trace->loop_label());
3450  return;
3451  }
3452  DCHECK_NULL(trace->stop_node());
3453  if (!trace->is_trivial()) {
3454  trace->Flush(compiler, this);
3455  return;
3456  }
3457  ChoiceNode::Emit(compiler, trace);
3458 }
3459 
3460 
3461 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3462  int eats_at_least) {
3463  int preload_characters = Min(4, eats_at_least);
3464  DCHECK_LE(preload_characters, 4);
3465  if (compiler->macro_assembler()->CanReadUnaligned()) {
3466  bool one_byte = compiler->one_byte();
3467  if (one_byte) {
3468  // We can't preload 3 characters because there is no machine instruction
3469  // to do that. We can't just load 4 because we could be reading
3470  // beyond the end of the string, which could cause a memory fault.
3471  if (preload_characters == 3) preload_characters = 2;
3472  } else {
3473  if (preload_characters > 2) preload_characters = 2;
3474  }
3475  } else {
3476  if (preload_characters > 1) preload_characters = 1;
3477  }
3478  return preload_characters;
3479 }
3480 
3481 
3482 // This class is used when generating the alternatives in a choice node. It
3483 // records the way the alternative is being code generated.
3485  public:
3487  : possible_success(),
3488  expects_preload(false),
3489  after(),
3490  quick_check_details() { }
3491  Label possible_success;
3492  bool expects_preload;
3493  Label after;
3494  QuickCheckDetails quick_check_details;
3495 };
3496 
3497 
3498 // Creates a list of AlternativeGenerations. If the list has a reasonable
3499 // size then it is on the stack, otherwise the excess is on the heap.
3501  public:
3502  AlternativeGenerationList(int count, Zone* zone)
3503  : alt_gens_(count, zone) {
3504  for (int i = 0; i < count && i < kAFew; i++) {
3505  alt_gens_.Add(a_few_alt_gens_ + i, zone);
3506  }
3507  for (int i = kAFew; i < count; i++) {
3508  alt_gens_.Add(new AlternativeGeneration(), zone);
3509  }
3510  }
3512  for (int i = kAFew; i < alt_gens_.length(); i++) {
3513  delete alt_gens_[i];
3514  alt_gens_[i] = nullptr;
3515  }
3516  }
3517 
3518  AlternativeGeneration* at(int i) {
3519  return alt_gens_[i];
3520  }
3521 
3522  private:
3523  static const int kAFew = 10;
3525  AlternativeGeneration a_few_alt_gens_[kAFew];
3526 };
3527 
3528 
3529 static const uc32 kRangeEndMarker = 0x110000;
3530 
3531 // The '2' variant is has inclusive from and exclusive to.
3532 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3533 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3534 static const int kSpaceRanges[] = {
3535  '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
3536  0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3537  0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3538 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3539 
3540 static const int kWordRanges[] = {
3541  '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3542 static const int kWordRangeCount = arraysize(kWordRanges);
3543 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3544 static const int kDigitRangeCount = arraysize(kDigitRanges);
3545 static const int kSurrogateRanges[] = {
3546  kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3547 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3548 static const int kLineTerminatorRanges[] = {
3549  0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3550 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3551 
3552 void BoyerMoorePositionInfo::Set(int character) {
3553  SetInterval(Interval(character, character));
3554 }
3555 
3556 
3557 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3558  s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3559  w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3560  d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3561  surrogate_ =
3562  AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3563  if (interval.to() - interval.from() >= kMapSize - 1) {
3564  if (map_count_ != kMapSize) {
3565  map_count_ = kMapSize;
3566  for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3567  }
3568  return;
3569  }
3570  for (int i = interval.from(); i <= interval.to(); i++) {
3571  int mod_character = (i & kMask);
3572  if (!map_->at(mod_character)) {
3573  map_count_++;
3574  map_->at(mod_character) = true;
3575  }
3576  if (map_count_ == kMapSize) return;
3577  }
3578 }
3579 
3580 
3581 void BoyerMoorePositionInfo::SetAll() {
3582  s_ = w_ = d_ = kLatticeUnknown;
3583  if (map_count_ != kMapSize) {
3584  map_count_ = kMapSize;
3585  for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3586  }
3587 }
3588 
3589 
3590 BoyerMooreLookahead::BoyerMooreLookahead(
3591  int length, RegExpCompiler* compiler, Zone* zone)
3592  : length_(length),
3593  compiler_(compiler) {
3594  if (compiler->one_byte()) {
3595  max_char_ = String::kMaxOneByteCharCode;
3596  } else {
3597  max_char_ = String::kMaxUtf16CodeUnit;
3598  }
3599  bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3600  for (int i = 0; i < length; i++) {
3601  bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3602  }
3603 }
3604 
3605 
3606 // Find the longest range of lookahead that has the fewest number of different
3607 // characters that can occur at a given position. Since we are optimizing two
3608 // different parameters at once this is a tradeoff.
3609 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3610  int biggest_points = 0;
3611  // If more than 32 characters out of 128 can occur it is unlikely that we can
3612  // be lucky enough to step forwards much of the time.
3613  const int kMaxMax = 32;
3614  for (int max_number_of_chars = 4;
3615  max_number_of_chars < kMaxMax;
3616  max_number_of_chars *= 2) {
3617  biggest_points =
3618  FindBestInterval(max_number_of_chars, biggest_points, from, to);
3619  }
3620  if (biggest_points == 0) return false;
3621  return true;
3622 }
3623 
3624 
3625 // Find the highest-points range between 0 and length_ where the character
3626 // information is not too vague. 'Too vague' means that there are more than
3627 // max_number_of_chars that can occur at this position. Calculates the number
3628 // of points as the product of width-of-the-range and
3629 // probability-of-finding-one-of-the-characters, where the probability is
3630 // calculated using the frequency distribution of the sample subject string.
3631 int BoyerMooreLookahead::FindBestInterval(
3632  int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3633  int biggest_points = old_biggest_points;
3634  static const int kSize = RegExpMacroAssembler::kTableSize;
3635  for (int i = 0; i < length_; ) {
3636  while (i < length_ && Count(i) > max_number_of_chars) i++;
3637  if (i == length_) break;
3638  int remembered_from = i;
3639  bool union_map[kSize];
3640  for (int j = 0; j < kSize; j++) union_map[j] = false;
3641  while (i < length_ && Count(i) <= max_number_of_chars) {
3642  BoyerMoorePositionInfo* map = bitmaps_->at(i);
3643  for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3644  i++;
3645  }
3646  int frequency = 0;
3647  for (int j = 0; j < kSize; j++) {
3648  if (union_map[j]) {
3649  // Add 1 to the frequency to give a small per-character boost for
3650  // the cases where our sampling is not good enough and many
3651  // characters have a frequency of zero. This means the frequency
3652  // can theoretically be up to 2*kSize though we treat it mostly as
3653  // a fraction of kSize.
3654  frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3655  }
3656  }
3657  // We use the probability of skipping times the distance we are skipping to
3658  // judge the effectiveness of this. Actually we have a cut-off: By
3659  // dividing by 2 we switch off the skipping if the probability of skipping
3660  // is less than 50%. This is because the multibyte mask-and-compare
3661  // skipping in quickcheck is more likely to do well on this case.
3662  bool in_quickcheck_range =
3663  ((i - remembered_from < 4) ||
3664  (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3665  // Called 'probability' but it is only a rough estimate and can actually
3666  // be outside the 0-kSize range.
3667  int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3668  int points = (i - remembered_from) * probability;
3669  if (points > biggest_points) {
3670  *from = remembered_from;
3671  *to = i - 1;
3672  biggest_points = points;
3673  }
3674  }
3675  return biggest_points;
3676 }
3677 
3678 
3679 // Take all the characters that will not prevent a successful match if they
3680 // occur in the subject string in the range between min_lookahead and
3681 // max_lookahead (inclusive) measured from the current position. If the
3682 // character at max_lookahead offset is not one of these characters, then we
3683 // can safely skip forwards by the number of characters in the range.
3684 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3685  int max_lookahead,
3686  Handle<ByteArray> boolean_skip_table) {
3687  const int kSize = RegExpMacroAssembler::kTableSize;
3688 
3689  const int kSkipArrayEntry = 0;
3690  const int kDontSkipArrayEntry = 1;
3691 
3692  for (int i = 0; i < kSize; i++) {
3693  boolean_skip_table->set(i, kSkipArrayEntry);
3694  }
3695  int skip = max_lookahead + 1 - min_lookahead;
3696 
3697  for (int i = max_lookahead; i >= min_lookahead; i--) {
3698  BoyerMoorePositionInfo* map = bitmaps_->at(i);
3699  for (int j = 0; j < kSize; j++) {
3700  if (map->at(j)) {
3701  boolean_skip_table->set(j, kDontSkipArrayEntry);
3702  }
3703  }
3704  }
3705 
3706  return skip;
3707 }
3708 
3709 
3710 // See comment above on the implementation of GetSkipTable.
3711 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3712  const int kSize = RegExpMacroAssembler::kTableSize;
3713 
3714  int min_lookahead = 0;
3715  int max_lookahead = 0;
3716 
3717  if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3718 
3719  bool found_single_character = false;
3720  int single_character = 0;
3721  for (int i = max_lookahead; i >= min_lookahead; i--) {
3722  BoyerMoorePositionInfo* map = bitmaps_->at(i);
3723  if (map->map_count() > 1 ||
3724  (found_single_character && map->map_count() != 0)) {
3725  found_single_character = false;
3726  break;
3727  }
3728  for (int j = 0; j < kSize; j++) {
3729  if (map->at(j)) {
3730  found_single_character = true;
3731  single_character = j;
3732  break;
3733  }
3734  }
3735  }
3736 
3737  int lookahead_width = max_lookahead + 1 - min_lookahead;
3738 
3739  if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3740  // The mask-compare can probably handle this better.
3741  return;
3742  }
3743 
3744  if (found_single_character) {
3745  Label cont, again;
3746  masm->Bind(&again);
3747  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3748  if (max_char_ > kSize) {
3749  masm->CheckCharacterAfterAnd(single_character,
3750  RegExpMacroAssembler::kTableMask,
3751  &cont);
3752  } else {
3753  masm->CheckCharacter(single_character, &cont);
3754  }
3755  masm->AdvanceCurrentPosition(lookahead_width);
3756  masm->GoTo(&again);
3757  masm->Bind(&cont);
3758  return;
3759  }
3760 
3761  Factory* factory = masm->isolate()->factory();
3762  Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3763  int skip_distance = GetSkipTable(
3764  min_lookahead, max_lookahead, boolean_skip_table);
3765  DCHECK_NE(0, skip_distance);
3766 
3767  Label cont, again;
3768  masm->Bind(&again);
3769  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3770  masm->CheckBitInTable(boolean_skip_table, &cont);
3771  masm->AdvanceCurrentPosition(skip_distance);
3772  masm->GoTo(&again);
3773  masm->Bind(&cont);
3774 }
3775 
3776 
3777 /* Code generation for choice nodes.
3778  *
3779  * We generate quick checks that do a mask and compare to eliminate a
3780  * choice. If the quick check succeeds then it jumps to the continuation to
3781  * do slow checks and check subsequent nodes. If it fails (the common case)
3782  * it falls through to the next choice.
3783  *
3784  * Here is the desired flow graph. Nodes directly below each other imply
3785  * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3786  * 3 doesn't have a quick check so we have to call the slow check.
3787  * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3788  * regexp continuation is generated directly after the Sn node, up to the
3789  * next GoTo if we decide to reuse some already generated code. Some
3790  * nodes expect preload_characters to be preloaded into the current
3791  * character register. R nodes do this preloading. Vertices are marked
3792  * F for failures and S for success (possible success in the case of quick
3793  * nodes). L, V, < and > are used as arrow heads.
3794  *
3795  * ----------> R
3796  * |
3797  * V
3798  * Q1 -----> S1
3799  * | S /
3800  * F| /
3801  * | F/
3802  * | /
3803  * | R
3804  * | /
3805  * V L
3806  * Q2 -----> S2
3807  * | S /
3808  * F| /
3809  * | F/
3810  * | /
3811  * | R
3812  * | /
3813  * V L
3814  * S3
3815  * |
3816  * F|
3817  * |
3818  * R
3819  * |
3820  * backtrack V
3821  * <----------Q4
3822  * \ F |
3823  * \ |S
3824  * \ F V
3825  * \-----S4
3826  *
3827  * For greedy loops we push the current position, then generate the code that
3828  * eats the input specially in EmitGreedyLoop. The other choice (the
3829  * continuation) is generated by the normal code in EmitChoices, and steps back
3830  * in the input to the starting position when it fails to match. The loop code
3831  * looks like this (U is the unwind code that steps back in the greedy loop).
3832  *
3833  * _____
3834  * / \
3835  * V |
3836  * ----------> S1 |
3837  * /| |
3838  * / |S |
3839  * F/ \_____/
3840  * /
3841  * |<-----
3842  * | \
3843  * V |S
3844  * Q2 ---> U----->backtrack
3845  * | F /
3846  * S| /
3847  * V F /
3848  * S2--/
3849  */
3850 
3851 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3852  counter_backtrack_trace_.set_backtrack(&label_);
3853  if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3854 }
3855 
3856 
3857 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3858 #ifdef DEBUG
3859  int choice_count = alternatives_->length();
3860  for (int i = 0; i < choice_count - 1; i++) {
3861  GuardedAlternative alternative = alternatives_->at(i);
3862  ZoneList<Guard*>* guards = alternative.guards();
3863  int guard_count = (guards == nullptr) ? 0 : guards->length();
3864  for (int j = 0; j < guard_count; j++) {
3865  DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3866  }
3867  }
3868 #endif
3869 }
3870 
3871 
3872 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3873  Trace* current_trace,
3874  PreloadState* state) {
3875  if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3876  // Save some time by looking at most one machine word ahead.
3877  state->eats_at_least_ =
3878  EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3879  current_trace->at_start() == Trace::FALSE_VALUE);
3880  }
3881  state->preload_characters_ =
3882  CalculatePreloadCharacters(compiler, state->eats_at_least_);
3883 
3884  state->preload_is_current_ =
3885  (current_trace->characters_preloaded() == state->preload_characters_);
3886  state->preload_has_checked_bounds_ = state->preload_is_current_;
3887 }
3888 
3889 
3890 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3891  int choice_count = alternatives_->length();
3892 
3893  if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3894  alternatives_->at(0).node()->Emit(compiler, trace);
3895  return;
3896  }
3897 
3898  AssertGuardsMentionRegisters(trace);
3899 
3900  LimitResult limit_result = LimitVersions(compiler, trace);
3901  if (limit_result == DONE) return;
3902  DCHECK(limit_result == CONTINUE);
3903 
3904  // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3905  // other choice nodes we only flush if we are out of code size budget.
3906  if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3907  trace->Flush(compiler, this);
3908  return;
3909  }
3910 
3911  RecursionCheck rc(compiler);
3912 
3913  PreloadState preload;
3914  preload.init();
3915  GreedyLoopState greedy_loop_state(not_at_start());
3916 
3917  int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3918  AlternativeGenerationList alt_gens(choice_count, zone());
3919 
3920  if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3921  trace = EmitGreedyLoop(compiler,
3922  trace,
3923  &alt_gens,
3924  &preload,
3925  &greedy_loop_state,
3926  text_length);
3927  } else {
3928  // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3929  // match the traces produced pre-cleanup.
3930  Label second_choice;
3931  compiler->macro_assembler()->Bind(&second_choice);
3932 
3933  preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3934 
3935  EmitChoices(compiler,
3936  &alt_gens,
3937  0,
3938  trace,
3939  &preload);
3940  }
3941 
3942  // At this point we need to generate slow checks for the alternatives where
3943  // the quick check was inlined. We can recognize these because the associated
3944  // label was bound.
3945  int new_flush_budget = trace->flush_budget() / choice_count;
3946  for (int i = 0; i < choice_count; i++) {
3947  AlternativeGeneration* alt_gen = alt_gens.at(i);
3948  Trace new_trace(*trace);
3949  // If there are actions to be flushed we have to limit how many times
3950  // they are flushed. Take the budget of the parent trace and distribute
3951  // it fairly amongst the children.
3952  if (new_trace.actions() != nullptr) {
3953  new_trace.set_flush_budget(new_flush_budget);
3954  }
3955  bool next_expects_preload =
3956  i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3957  EmitOutOfLineContinuation(compiler,
3958  &new_trace,
3959  alternatives_->at(i),
3960  alt_gen,
3961  preload.preload_characters_,
3962  next_expects_preload);
3963  }
3964 }
3965 
3966 
3967 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3968  Trace* trace,
3969  AlternativeGenerationList* alt_gens,
3970  PreloadState* preload,
3971  GreedyLoopState* greedy_loop_state,
3972  int text_length) {
3973  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3974  // Here we have special handling for greedy loops containing only text nodes
3975  // and other simple nodes. These are handled by pushing the current
3976  // position on the stack and then incrementing the current position each
3977  // time around the switch. On backtrack we decrement the current position
3978  // and check it against the pushed value. This avoids pushing backtrack
3979  // information for each iteration of the loop, which could take up a lot of
3980  // space.
3981  DCHECK(trace->stop_node() == nullptr);
3982  macro_assembler->PushCurrentPosition();
3983  Label greedy_match_failed;
3984  Trace greedy_match_trace;
3985  if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3986  greedy_match_trace.set_backtrack(&greedy_match_failed);
3987  Label loop_label;
3988  macro_assembler->Bind(&loop_label);
3989  greedy_match_trace.set_stop_node(this);
3990  greedy_match_trace.set_loop_label(&loop_label);
3991  alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3992  macro_assembler->Bind(&greedy_match_failed);
3993 
3994  Label second_choice; // For use in greedy matches.
3995  macro_assembler->Bind(&second_choice);
3996 
3997  Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3998 
3999  EmitChoices(compiler,
4000  alt_gens,
4001  1,
4002  new_trace,
4003  preload);
4004 
4005  macro_assembler->Bind(greedy_loop_state->label());
4006  // If we have unwound to the bottom then backtrack.
4007  macro_assembler->CheckGreedyLoop(trace->backtrack());
4008  // Otherwise try the second priority at an earlier position.
4009  macro_assembler->AdvanceCurrentPosition(-text_length);
4010  macro_assembler->GoTo(&second_choice);
4011  return new_trace;
4012 }
4013 
4014 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4015  Trace* trace) {
4016  int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4017  if (alternatives_->length() != 2) return eats_at_least;
4018 
4019  GuardedAlternative alt1 = alternatives_->at(1);
4020  if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
4021  return eats_at_least;
4022  }
4023  RegExpNode* eats_anything_node = alt1.node();
4024  if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4025  return eats_at_least;
4026  }
4027 
4028  // Really we should be creating a new trace when we execute this function,
4029  // but there is no need, because the code it generates cannot backtrack, and
4030  // we always arrive here with a trivial trace (since it's the entry to a
4031  // loop. That also implies that there are no preloaded characters, which is
4032  // good, because it means we won't be violating any assumptions by
4033  // overwriting those characters with new load instructions.
4034  DCHECK(trace->is_trivial());
4035 
4036  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4037  Isolate* isolate = macro_assembler->isolate();
4038  // At this point we know that we are at a non-greedy loop that will eat
4039  // any character one at a time. Any non-anchored regexp has such a
4040  // loop prepended to it in order to find where it starts. We look for
4041  // a pattern of the form ...abc... where we can look 6 characters ahead
4042  // and step forwards 3 if the character is not one of abc. Abc need
4043  // not be atoms, they can be any reasonably limited character class or
4044  // small alternation.
4045  BoyerMooreLookahead* bm = bm_info(false);
4046  if (bm == nullptr) {
4047  eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4048  EatsAtLeast(kMaxLookaheadForBoyerMoore,
4049  kRecursionBudget,
4050  false));
4051  if (eats_at_least >= 1) {
4052  bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4053  compiler,
4054  zone());
4055  GuardedAlternative alt0 = alternatives_->at(0);
4056  alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4057  }
4058  }
4059  if (bm != nullptr) {
4060  bm->EmitSkipInstructions(macro_assembler);
4061  }
4062  return eats_at_least;
4063 }
4064 
4065 
4066 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4067  AlternativeGenerationList* alt_gens,
4068  int first_choice,
4069  Trace* trace,
4070  PreloadState* preload) {
4071  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4072  SetUpPreLoad(compiler, trace, preload);
4073 
4074  // For now we just call all choices one after the other. The idea ultimately
4075  // is to use the Dispatch table to try only the relevant ones.
4076  int choice_count = alternatives_->length();
4077 
4078  int new_flush_budget = trace->flush_budget() / choice_count;
4079 
4080  for (int i = first_choice; i < choice_count; i++) {
4081  bool is_last = i == choice_count - 1;
4082  bool fall_through_on_failure = !is_last;
4083  GuardedAlternative alternative = alternatives_->at(i);
4084  AlternativeGeneration* alt_gen = alt_gens->at(i);
4085  alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4086  ZoneList<Guard*>* guards = alternative.guards();
4087  int guard_count = (guards == nullptr) ? 0 : guards->length();
4088  Trace new_trace(*trace);
4089  new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4090  preload->preload_characters_ :
4091  0);
4092  if (preload->preload_has_checked_bounds_) {
4093  new_trace.set_bound_checked_up_to(preload->preload_characters_);
4094  }
4095  new_trace.quick_check_performed()->Clear();
4096  if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4097  if (!is_last) {
4098  new_trace.set_backtrack(&alt_gen->after);
4099  }
4100  alt_gen->expects_preload = preload->preload_is_current_;
4101  bool generate_full_check_inline = false;
4102  if (compiler->optimize() &&
4103  try_to_emit_quick_check_for_alternative(i == 0) &&
4104  alternative.node()->EmitQuickCheck(
4105  compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4106  &alt_gen->possible_success, &alt_gen->quick_check_details,
4107  fall_through_on_failure)) {
4108  // Quick check was generated for this choice.
4109  preload->preload_is_current_ = true;
4110  preload->preload_has_checked_bounds_ = true;
4111  // If we generated the quick check to fall through on possible success,
4112  // we now need to generate the full check inline.
4113  if (!fall_through_on_failure) {
4114  macro_assembler->Bind(&alt_gen->possible_success);
4115  new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4116  new_trace.set_characters_preloaded(preload->preload_characters_);
4117  new_trace.set_bound_checked_up_to(preload->preload_characters_);
4118  generate_full_check_inline = true;
4119  }
4120  } else if (alt_gen->quick_check_details.cannot_match()) {
4121  if (!fall_through_on_failure) {
4122  macro_assembler->GoTo(trace->backtrack());
4123  }
4124  continue;
4125  } else {
4126  // No quick check was generated. Put the full code here.
4127  // If this is not the first choice then there could be slow checks from
4128  // previous cases that go here when they fail. There's no reason to
4129  // insist that they preload characters since the slow check we are about
4130  // to generate probably can't use it.
4131  if (i != first_choice) {
4132  alt_gen->expects_preload = false;
4133  new_trace.InvalidateCurrentCharacter();
4134  }
4135  generate_full_check_inline = true;
4136  }
4137  if (generate_full_check_inline) {
4138  if (new_trace.actions() != nullptr) {
4139  new_trace.set_flush_budget(new_flush_budget);
4140  }
4141  for (int j = 0; j < guard_count; j++) {
4142  GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4143  }
4144  alternative.node()->Emit(compiler, &new_trace);
4145  preload->preload_is_current_ = false;
4146  }
4147  macro_assembler->Bind(&alt_gen->after);
4148  }
4149 }
4150 
4151 
4152 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4153  Trace* trace,
4154  GuardedAlternative alternative,
4155  AlternativeGeneration* alt_gen,
4156  int preload_characters,
4157  bool next_expects_preload) {
4158  if (!alt_gen->possible_success.is_linked()) return;
4159 
4160  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4161  macro_assembler->Bind(&alt_gen->possible_success);
4162  Trace out_of_line_trace(*trace);
4163  out_of_line_trace.set_characters_preloaded(preload_characters);
4164  out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4165  if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4166  ZoneList<Guard*>* guards = alternative.guards();
4167  int guard_count = (guards == nullptr) ? 0 : guards->length();
4168  if (next_expects_preload) {
4169  Label reload_current_char;
4170  out_of_line_trace.set_backtrack(&reload_current_char);
4171  for (int j = 0; j < guard_count; j++) {
4172  GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4173  }
4174  alternative.node()->Emit(compiler, &out_of_line_trace);
4175  macro_assembler->Bind(&reload_current_char);
4176  // Reload the current character, since the next quick check expects that.
4177  // We don't need to check bounds here because we only get into this
4178  // code through a quick check which already did the checked load.
4179  macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
4180  preload_characters);
4181  macro_assembler->GoTo(&(alt_gen->after));
4182  } else {
4183  out_of_line_trace.set_backtrack(&(alt_gen->after));
4184  for (int j = 0; j < guard_count; j++) {
4185  GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4186  }
4187  alternative.node()->Emit(compiler, &out_of_line_trace);
4188  }
4189 }
4190 
4191 
4192 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4193  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4194  LimitResult limit_result = LimitVersions(compiler, trace);
4195  if (limit_result == DONE) return;
4196  DCHECK(limit_result == CONTINUE);
4197 
4198  RecursionCheck rc(compiler);
4199 
4200  switch (action_type_) {
4201  case STORE_POSITION: {
4202  Trace::DeferredCapture
4203  new_capture(data_.u_position_register.reg,
4204  data_.u_position_register.is_capture,
4205  trace);
4206  Trace new_trace = *trace;
4207  new_trace.add_action(&new_capture);
4208  on_success()->Emit(compiler, &new_trace);
4209  break;
4210  }
4211  case INCREMENT_REGISTER: {
4212  Trace::DeferredIncrementRegister
4213  new_increment(data_.u_increment_register.reg);
4214  Trace new_trace = *trace;
4215  new_trace.add_action(&new_increment);
4216  on_success()->Emit(compiler, &new_trace);
4217  break;
4218  }
4219  case SET_REGISTER: {
4220  Trace::DeferredSetRegister
4221  new_set(data_.u_store_register.reg, data_.u_store_register.value);
4222  Trace new_trace = *trace;
4223  new_trace.add_action(&new_set);
4224  on_success()->Emit(compiler, &new_trace);
4225  break;
4226  }
4227  case CLEAR_CAPTURES: {
4228  Trace::DeferredClearCaptures
4229  new_capture(Interval(data_.u_clear_captures.range_from,
4230  data_.u_clear_captures.range_to));
4231  Trace new_trace = *trace;
4232  new_trace.add_action(&new_capture);
4233  on_success()->Emit(compiler, &new_trace);
4234  break;
4235  }
4236  case BEGIN_SUBMATCH:
4237  if (!trace->is_trivial()) {
4238  trace->Flush(compiler, this);
4239  } else {
4240  assembler->WriteCurrentPositionToRegister(
4241  data_.u_submatch.current_position_register, 0);
4242  assembler->WriteStackPointerToRegister(
4243  data_.u_submatch.stack_pointer_register);
4244  on_success()->Emit(compiler, trace);
4245  }
4246  break;
4247  case EMPTY_MATCH_CHECK: {
4248  int start_pos_reg = data_.u_empty_match_check.start_register;
4249  int stored_pos = 0;
4250  int rep_reg = data_.u_empty_match_check.repetition_register;
4251  bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4252  bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4253  if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4254  // If we know we haven't advanced and there is no minimum we
4255  // can just backtrack immediately.
4256  assembler->GoTo(trace->backtrack());
4257  } else if (know_dist && stored_pos < trace->cp_offset()) {
4258  // If we know we've advanced we can generate the continuation
4259  // immediately.
4260  on_success()->Emit(compiler, trace);
4261  } else if (!trace->is_trivial()) {
4262  trace->Flush(compiler, this);
4263  } else {
4264  Label skip_empty_check;
4265  // If we have a minimum number of repetitions we check the current
4266  // number first and skip the empty check if it's not enough.
4267  if (has_minimum) {
4268  int limit = data_.u_empty_match_check.repetition_limit;
4269  assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4270  }
4271  // If the match is empty we bail out, otherwise we fall through
4272  // to the on-success continuation.
4273  assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4274  trace->backtrack());
4275  assembler->Bind(&skip_empty_check);
4276  on_success()->Emit(compiler, trace);
4277  }
4278  break;
4279  }
4280  case POSITIVE_SUBMATCH_SUCCESS: {
4281  if (!trace->is_trivial()) {
4282  trace->Flush(compiler, this);
4283  return;
4284  }
4285  assembler->ReadCurrentPositionFromRegister(
4286  data_.u_submatch.current_position_register);
4287  assembler->ReadStackPointerFromRegister(
4288  data_.u_submatch.stack_pointer_register);
4289  int clear_register_count = data_.u_submatch.clear_register_count;
4290  if (clear_register_count == 0) {
4291  on_success()->Emit(compiler, trace);
4292  return;
4293  }
4294  int clear_registers_from = data_.u_submatch.clear_register_from;
4295  Label clear_registers_backtrack;
4296  Trace new_trace = *trace;
4297  new_trace.set_backtrack(&clear_registers_backtrack);
4298  on_success()->Emit(compiler, &new_trace);
4299 
4300  assembler->Bind(&clear_registers_backtrack);
4301  int clear_registers_to = clear_registers_from + clear_register_count - 1;
4302  assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4303 
4304  DCHECK(trace->backtrack() == nullptr);
4305  assembler->Backtrack();
4306  return;
4307  }
4308  default:
4309  UNREACHABLE();
4310  }
4311 }
4312 
4313 
4314 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4315  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4316  if (!trace->is_trivial()) {
4317  trace->Flush(compiler, this);
4318  return;
4319  }
4320 
4321  LimitResult limit_result = LimitVersions(compiler, trace);
4322  if (limit_result == DONE) return;
4323  DCHECK(limit_result == CONTINUE);
4324 
4325  RecursionCheck rc(compiler);
4326 
4327  DCHECK_EQ(start_reg_ + 1, end_reg_);
4328  if (IgnoreCase(flags_)) {
4329  assembler->CheckNotBackReferenceIgnoreCase(
4330  start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
4331  } else {
4332  assembler->CheckNotBackReference(start_reg_, read_backward(),
4333  trace->backtrack());
4334  }
4335  // We are going to advance backward, so we may end up at the start.
4336  if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4337 
4338  // Check that the back reference does not end inside a surrogate pair.
4339  if (IsUnicode(flags_) && !compiler->one_byte()) {
4340  assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4341  }
4342  on_success()->Emit(compiler, trace);
4343 }
4344 
4345 
4346 // -------------------------------------------------------------------
4347 // Dot/dotty output
4348 
4349 
4350 #ifdef DEBUG
4351 
4352 
4353 class DotPrinter: public NodeVisitor {
4354  public:
4355  DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4356  : os_(os),
4357  ignore_case_(ignore_case) {}
4358  void PrintNode(const char* label, RegExpNode* node);
4359  void Visit(RegExpNode* node);
4360  void PrintAttributes(RegExpNode* from);
4361  void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4362 #define DECLARE_VISIT(Type) \
4363  virtual void Visit##Type(Type##Node* that);
4364 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4365 #undef DECLARE_VISIT
4366  private:
4367  std::ostream& os_;
4368  bool ignore_case_;
4369 };
4370 
4371 
4372 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4373  os_ << "digraph G {\n graph [label=\"";
4374  for (int i = 0; label[i]; i++) {
4375  switch (label[i]) {
4376  case '\\':
4377  os_ << "\\\\";
4378  break;
4379  case '"':
4380  os_ << "\"";
4381  break;
4382  default:
4383  os_ << label[i];
4384  break;
4385  }
4386  }
4387  os_ << "\"];\n";
4388  Visit(node);
4389  os_ << "}" << std::endl;
4390 }
4391 
4392 
4393 void DotPrinter::Visit(RegExpNode* node) {
4394  if (node->info()->visited) return;
4395  node->info()->visited = true;
4396  node->Accept(this);
4397 }
4398 
4399 
4400 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4401  os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4402  Visit(on_failure);
4403 }
4404 
4405 
4406 class TableEntryBodyPrinter {
4407  public:
4408  TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4409  : os_(os),
4410  choice_(choice) {}
4411  void Call(uc16 from, DispatchTable::Entry entry) {
4412  OutSet* out_set = entry.out_set();
4413  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4414  if (out_set->Get(i)) {
4415  os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4416  << choice()->alternatives()->at(i).node() << ";\n";
4417  }
4418  }
4419  }
4420  private:
4421  ChoiceNode* choice() { return choice_; }
4422  std::ostream& os_;
4423  ChoiceNode* choice_;
4424 };
4425 
4426 
4427 class TableEntryHeaderPrinter {
4428  public:
4429  explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4430  : first_(true),
4431  os_(os) {}
4432  void Call(uc16 from, DispatchTable::Entry entry) {
4433  if (first_) {
4434  first_ = false;
4435  } else {
4436  os_ << "|";
4437  }
4438  os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4439  OutSet* out_set = entry.out_set();
4440  int priority = 0;
4441  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4442  if (out_set->Get(i)) {
4443  if (priority > 0) os_ << "|";
4444  os_ << "<s" << from << "o" << i << "> " << priority;
4445  priority++;
4446  }
4447  }
4448  os_ << "}}";
4449  }
4450 
4451  private:
4452  bool first_;
4453  std::ostream& os_;
4454 };
4455 
4456 
4457 class AttributePrinter {
4458  public:
4459  explicit AttributePrinter(std::ostream& os) // NOLINT
4460  : os_(os),
4461  first_(true) {}
4462  void PrintSeparator() {
4463  if (first_) {
4464  first_ = false;
4465  } else {
4466  os_ << "|";
4467  }
4468  }
4469  void PrintBit(const char* name, bool value) {
4470  if (!value) return;
4471  PrintSeparator();
4472  os_ << "{" << name << "}";
4473  }
4474  void PrintPositive(const char* name, int value) {
4475  if (value < 0) return;
4476  PrintSeparator();
4477  os_ << "{" << name << "|" << value << "}";
4478  }
4479 
4480  private:
4481  std::ostream& os_;
4482  bool first_;
4483 };
4484 
4485 
4486 void DotPrinter::PrintAttributes(RegExpNode* that) {
4487  os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4488  << "margin=0.1, fontsize=10, label=\"{";
4489  AttributePrinter printer(os_);
4490  NodeInfo* info = that->info();
4491  printer.PrintBit("NI", info->follows_newline_interest);
4492  printer.PrintBit("WI", info->follows_word_interest);
4493  printer.PrintBit("SI", info->follows_start_interest);
4494  Label* label = that->label();
4495  if (label->is_bound())
4496  printer.PrintPositive("@", label->pos());
4497  os_ << "}\"];\n"
4498  << " a" << that << " -> n" << that
4499  << " [style=dashed, color=grey, arrowhead=none];\n";
4500 }
4501 
4502 
4503 static const bool kPrintDispatchTable = false;
4504 void DotPrinter::VisitChoice(ChoiceNode* that) {
4505  if (kPrintDispatchTable) {
4506  os_ << " n" << that << " [shape=Mrecord, label=\"";
4507  TableEntryHeaderPrinter header_printer(os_);
4508  that->GetTable(ignore_case_)->ForEach(&header_printer);
4509  os_ << "\"]\n";
4510  PrintAttributes(that);
4511  TableEntryBodyPrinter body_printer(os_, that);
4512  that->GetTable(ignore_case_)->ForEach(&body_printer);
4513  } else {
4514  os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4515  for (int i = 0; i < that->alternatives()->length(); i++) {
4516  GuardedAlternative alt = that->alternatives()->at(i);
4517  os_ << " n" << that << " -> n" << alt.node();
4518  }
4519  }
4520  for (int i = 0; i < that->alternatives()->length(); i++) {
4521  GuardedAlternative alt = that->alternatives()->at(i);
4522  alt.node()->Accept(this);
4523  }
4524 }
4525 
4526 
4527 void DotPrinter::VisitText(TextNode* that) {
4528  Zone* zone = that->zone();
4529  os_ << " n" << that << " [label=\"";
4530  for (int i = 0; i < that->elements()->length(); i++) {
4531  if (i > 0) os_ << " ";
4532  TextElement elm = that->elements()->at(i);
4533  switch (elm.text_type()) {
4534  case TextElement::ATOM: {
4535  Vector<const uc16> data = elm.atom()->data();
4536  for (int i = 0; i < data.length(); i++) {
4537  os_ << static_cast<char>(data[i]);
4538  }
4539  break;
4540  }
4541  case TextElement::CHAR_CLASS: {
4542  RegExpCharacterClass* node = elm.char_class();
4543  os_ << "[";
4544  if (node->is_negated()) os_ << "^";
4545  for (int j = 0; j < node->ranges(zone)->length(); j++) {
4546  CharacterRange range = node->ranges(zone)->at(j);
4547  os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4548  }
4549  os_ << "]";
4550  break;
4551  }
4552  default:
4553  UNREACHABLE();
4554  }
4555  }
4556  os_ << "\", shape=box, peripheries=2];\n";
4557  PrintAttributes(that);
4558  os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4559  Visit(that->on_success());
4560 }
4561 
4562 
4563 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4564  os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4565  << that->end_register() << "\", shape=doubleoctagon];\n";
4566  PrintAttributes(that);
4567  os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4568  Visit(that->on_success());
4569 }
4570 
4571 
4572 void DotPrinter::VisitEnd(EndNode* that) {
4573  os_ << " n" << that << " [style=bold, shape=point];\n";
4574  PrintAttributes(that);
4575 }
4576 
4577 
4578 void DotPrinter::VisitAssertion(AssertionNode* that) {
4579  os_ << " n" << that << " [";
4580  switch (that->assertion_type()) {
4581  case AssertionNode::AT_END:
4582  os_ << "label=\"$\", shape=septagon";
4583  break;
4584  case AssertionNode::AT_START:
4585  os_ << "label=\"^\", shape=septagon";
4586  break;
4587  case AssertionNode::AT_BOUNDARY:
4588  os_ << "label=\"\\b\", shape=septagon";
4589  break;
4590  case AssertionNode::AT_NON_BOUNDARY:
4591  os_ << "label=\"\\B\", shape=septagon";
4592  break;
4593  case AssertionNode::AFTER_NEWLINE:
4594  os_ << "label=\"(?<=\\n)\", shape=septagon";
4595  break;
4596  }
4597  os_ << "];\n";
4598  PrintAttributes(that);
4599  RegExpNode* successor = that->on_success();
4600  os_ << " n" << that << " -> n" << successor << ";\n";
4601  Visit(successor);
4602 }
4603 
4604 
4605 void DotPrinter::VisitAction(ActionNode* that) {
4606  os_ << " n" << that << " [";
4607  switch (that->action_type_) {
4608  case ActionNode::SET_REGISTER:
4609  os_ << "label=\"$" << that->data_.u_store_register.reg
4610  << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4611  break;
4612  case ActionNode::INCREMENT_REGISTER:
4613  os_ << "label=\"$" << that->data_.u_increment_register.reg
4614  << "++\", shape=octagon";
4615  break;
4616  case ActionNode::STORE_POSITION:
4617  os_ << "label=\"$" << that->data_.u_position_register.reg
4618  << ":=$pos\", shape=octagon";
4619  break;
4620  case ActionNode::BEGIN_SUBMATCH:
4621  os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4622  << ":=$pos,begin\", shape=septagon";
4623  break;
4624  case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4625  os_ << "label=\"escape\", shape=septagon";
4626  break;
4627  case ActionNode::EMPTY_MATCH_CHECK:
4628  os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4629  << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4630  << "<" << that->data_.u_empty_match_check.repetition_limit
4631  << "?\", shape=septagon";
4632  break;
4633  case ActionNode::CLEAR_CAPTURES: {
4634  os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4635  << " to $" << that->data_.u_clear_captures.range_to
4636  << "\", shape=septagon";
4637  break;
4638  }
4639  }
4640  os_ << "];\n";
4641  PrintAttributes(that);
4642  RegExpNode* successor = that->on_success();
4643  os_ << " n" << that << " -> n" << successor << ";\n";
4644  Visit(successor);
4645 }
4646 
4647 
4648 class DispatchTableDumper {
4649  public:
4650  explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4651  void Call(uc16 key, DispatchTable::Entry entry);
4652  private:
4653  std::ostream& os_;
4654 };
4655 
4656 
4657 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4658  os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4659  OutSet* set = entry.out_set();
4660  bool first = true;
4661  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4662  if (set->Get(i)) {
4663  if (first) {
4664  first = false;
4665  } else {
4666  os_ << ", ";
4667  }
4668  os_ << i;
4669  }
4670  }
4671  os_ << "}\n";
4672 }
4673 
4674 
4675 void DispatchTable::Dump() {
4676  OFStream os(stderr);
4677  DispatchTableDumper dumper(os);
4678  tree()->ForEach(&dumper);
4679 }
4680 
4681 
4682 void RegExpEngine::DotPrint(const char* label,
4683  RegExpNode* node,
4684  bool ignore_case) {
4685  StdoutStream os;
4686  DotPrinter printer(os, ignore_case);
4687  printer.PrintNode(label, node);
4688 }
4689 
4690 
4691 #endif // DEBUG
4692 
4693 
4694 // -------------------------------------------------------------------
4695 // Tree to graph conversion
4696 
4697 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4698  RegExpNode* on_success) {
4699  ZoneList<TextElement>* elms =
4700  new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4701  elms->Add(TextElement::Atom(this), compiler->zone());
4702  return new (compiler->zone())
4703  TextNode(elms, compiler->read_backward(), on_success);
4704 }
4705 
4706 
4707 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4708  RegExpNode* on_success) {
4709  return new (compiler->zone())
4710  TextNode(elements(), compiler->read_backward(), on_success);
4711 }
4712 
4713 
4714 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4715  const int* special_class,
4716  int length) {
4717  length--; // Remove final marker.
4718  DCHECK_EQ(kRangeEndMarker, special_class[length]);
4719  DCHECK_NE(0, ranges->length());
4720  DCHECK_NE(0, length);
4721  DCHECK_NE(0, special_class[0]);
4722  if (ranges->length() != (length >> 1) + 1) {
4723  return false;
4724  }
4725  CharacterRange range = ranges->at(0);
4726  if (range.from() != 0) {
4727  return false;
4728  }
4729  for (int i = 0; i < length; i += 2) {
4730  if (special_class[i] != (range.to() + 1)) {
4731  return false;
4732  }
4733  range = ranges->at((i >> 1) + 1);
4734  if (special_class[i+1] != range.from()) {
4735  return false;
4736  }
4737  }
4738  if (range.to() != String::kMaxCodePoint) {
4739  return false;
4740  }
4741  return true;
4742 }
4743 
4744 
4745 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4746  const int* special_class,
4747  int length) {
4748  length--; // Remove final marker.
4749  DCHECK_EQ(kRangeEndMarker, special_class[length]);
4750  if (ranges->length() * 2 != length) {
4751  return false;
4752  }
4753  for (int i = 0; i < length; i += 2) {
4754  CharacterRange range = ranges->at(i >> 1);
4755  if (range.from() != special_class[i] ||
4756  range.to() != special_class[i + 1] - 1) {
4757  return false;
4758  }
4759  }
4760  return true;
4761 }
4762 
4763 
4764 bool RegExpCharacterClass::is_standard(Zone* zone) {
4765  // TODO(lrn): Remove need for this function, by not throwing away information
4766  // along the way.
4767  if (is_negated()) {
4768  return false;
4769  }
4770  if (set_.is_standard()) {
4771  return true;
4772  }
4773  if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4774  set_.set_standard_set_type('s');
4775  return true;
4776  }
4777  if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4778  set_.set_standard_set_type('S');
4779  return true;
4780  }
4781  if (CompareInverseRanges(set_.ranges(zone),
4782  kLineTerminatorRanges,
4783  kLineTerminatorRangeCount)) {
4784  set_.set_standard_set_type('.');
4785  return true;
4786  }
4787  if (CompareRanges(set_.ranges(zone),
4788  kLineTerminatorRanges,
4789  kLineTerminatorRangeCount)) {
4790  set_.set_standard_set_type('n');
4791  return true;
4792  }
4793  if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4794  set_.set_standard_set_type('w');
4795  return true;
4796  }
4797  if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4798  set_.set_standard_set_type('W');
4799  return true;
4800  }
4801  return false;
4802 }
4803 
4804 
4805 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4806  ZoneList<CharacterRange>* base)
4807  : zone_(zone),
4808  table_(zone),
4809  bmp_(nullptr),
4810  lead_surrogates_(nullptr),
4811  trail_surrogates_(nullptr),
4812  non_bmp_(nullptr) {
4813  // The unicode range splitter categorizes given character ranges into:
4814  // - Code points from the BMP representable by one code unit.
4815  // - Code points outside the BMP that need to be split into surrogate pairs.
4816  // - Lone lead surrogates.
4817  // - Lone trail surrogates.
4818  // Lone surrogates are valid code points, even though no actual characters.
4819  // They require special matching to make sure we do not split surrogate pairs.
4820  // We use the dispatch table to accomplish this. The base range is split up
4821  // by the table by the overlay ranges, and the Call callback is used to
4822  // filter and collect ranges for each category.
4823  for (int i = 0; i < base->length(); i++) {
4824  table_.AddRange(base->at(i), kBase, zone_);
4825  }
4826  // Add overlay ranges.
4827  table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4828  kBmpCodePoints, zone_);
4829  table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4830  kLeadSurrogates, zone_);
4831  table_.AddRange(
4832  CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4833  kTrailSurrogates, zone_);
4834  table_.AddRange(
4835  CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4836  kBmpCodePoints, zone_);
4837  table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4838  kNonBmpCodePoints, zone_);
4839  table_.ForEach(this);
4840 }
4841 
4842 
4843 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4844  OutSet* outset = entry.out_set();
4845  if (!outset->Get(kBase)) return;
4846  ZoneList<CharacterRange>** target = nullptr;
4847  if (outset->Get(kBmpCodePoints)) {
4848  target = &bmp_;
4849  } else if (outset->Get(kLeadSurrogates)) {
4850  target = &lead_surrogates_;
4851  } else if (outset->Get(kTrailSurrogates)) {
4852  target = &trail_surrogates_;
4853  } else {
4854  DCHECK(outset->Get(kNonBmpCodePoints));
4855  target = &non_bmp_;
4856  }
4857  if (*target == nullptr)
4858  *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4859  (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4860 }
4861 
4862 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4863  RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4864  ZoneList<CharacterRange>* bmp = splitter->bmp();
4865  if (bmp == nullptr) return;
4866  JSRegExp::Flags default_flags = JSRegExp::Flags();
4867  result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4868  compiler->zone(), bmp, compiler->read_backward(), on_success,
4869  default_flags)));
4870 }
4871 
4872 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4873  RegExpNode* on_success,
4874  UnicodeRangeSplitter* splitter) {
4875  ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4876  if (non_bmp == nullptr) return;
4877  DCHECK(!compiler->one_byte());
4878  Zone* zone = compiler->zone();
4879  JSRegExp::Flags default_flags = JSRegExp::Flags();
4880  CharacterRange::Canonicalize(non_bmp);
4881  for (int i = 0; i < non_bmp->length(); i++) {
4882  // Match surrogate pair.
4883  // E.g. [\u10005-\u11005] becomes
4884  // \ud800[\udc05-\udfff]|
4885  // [\ud801-\ud803][\udc00-\udfff]|
4886  // \ud804[\udc00-\udc05]
4887  uc32 from = non_bmp->at(i).from();
4888  uc32 to = non_bmp->at(i).to();
4889  uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4890  uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4891  uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4892  uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4893  if (from_l == to_l) {
4894  // The lead surrogate is the same.
4895  result->AddAlternative(
4896  GuardedAlternative(TextNode::CreateForSurrogatePair(
4897  zone, CharacterRange::Singleton(from_l),
4898  CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4899  on_success, default_flags)));
4900  } else {
4901  if (from_t != kTrailSurrogateStart) {
4902  // Add [from_l][from_t-\udfff]
4903  result->AddAlternative(
4904  GuardedAlternative(TextNode::CreateForSurrogatePair(
4905  zone, CharacterRange::Singleton(from_l),
4906  CharacterRange::Range(from_t, kTrailSurrogateEnd),
4907  compiler->read_backward(), on_success, default_flags)));
4908  from_l++;
4909  }
4910  if (to_t != kTrailSurrogateEnd) {
4911  // Add [to_l][\udc00-to_t]
4912  result->AddAlternative(
4913  GuardedAlternative(TextNode::CreateForSurrogatePair(
4914  zone, CharacterRange::Singleton(to_l),
4915  CharacterRange::Range(kTrailSurrogateStart, to_t),
4916  compiler->read_backward(), on_success, default_flags)));
4917  to_l--;
4918  }
4919  if (from_l <= to_l) {
4920  // Add [from_l-to_l][\udc00-\udfff]
4921  result->AddAlternative(
4922  GuardedAlternative(TextNode::CreateForSurrogatePair(
4923  zone, CharacterRange::Range(from_l, to_l),
4924  CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4925  compiler->read_backward(), on_success, default_flags)));
4926  }
4927  }
4928  }
4929 }
4930 
4931 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4932  RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
4933  ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
4934  JSRegExp::Flags flags) {
4935  Zone* zone = compiler->zone();
4936  RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4937  zone, match, read_backward, on_success, flags);
4938  int stack_register = compiler->UnicodeLookaroundStackRegister();
4939  int position_register = compiler->UnicodeLookaroundPositionRegister();
4940  RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4941  position_register);
4942  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4943  zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
4944  return lookaround.ForMatch(negative_match);
4945 }
4946 
4947 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4948  RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
4949  ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
4950  bool read_backward, JSRegExp::Flags flags) {
4951  Zone* zone = compiler->zone();
4952  int stack_register = compiler->UnicodeLookaroundStackRegister();
4953  int position_register = compiler->UnicodeLookaroundPositionRegister();
4954  RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4955  position_register);
4956  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4957  zone, lookahead, read_backward, lookaround.on_match_success(), flags);
4958  return TextNode::CreateForCharacterRanges(
4959  zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
4960 }
4961 
4962 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4963  RegExpNode* on_success,
4964  UnicodeRangeSplitter* splitter) {
4965  JSRegExp::Flags default_flags = JSRegExp::Flags();
4966  ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
4967  if (lead_surrogates == nullptr) return;
4968  Zone* zone = compiler->zone();
4969  // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4970  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
4971  zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
4972 
4973  RegExpNode* match;
4974  if (compiler->read_backward()) {
4975  // Reading backward. Assert that reading forward, there is no trail
4976  // surrogate, and then backward match the lead surrogate.
4977  match = NegativeLookaroundAgainstReadDirectionAndMatch(
4978  compiler, trail_surrogates, lead_surrogates, on_success, true,
4979  default_flags);
4980  } else {
4981  // Reading forward. Forward match the lead surrogate and assert that
4982  // no trail surrogate follows.
4983  match = MatchAndNegativeLookaroundInReadDirection(
4984  compiler, lead_surrogates, trail_surrogates, on_success, false,
4985  default_flags);
4986  }
4987  result->AddAlternative(GuardedAlternative(match));
4988 }
4989 
4990 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4991  RegExpNode* on_success,
4992  UnicodeRangeSplitter* splitter) {
4993  JSRegExp::Flags default_flags = JSRegExp::Flags();
4994  ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
4995  if (trail_surrogates == nullptr) return;
4996  Zone* zone = compiler->zone();
4997  // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
4998  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
4999  zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5000 
5001  RegExpNode* match;
5002  if (compiler->read_backward()) {
5003  // Reading backward. Backward match the trail surrogate and assert that no
5004  // lead surrogate precedes it.
5005  match = MatchAndNegativeLookaroundInReadDirection(
5006  compiler, trail_surrogates, lead_surrogates, on_success, true,
5007  default_flags);
5008  } else {
5009  // Reading forward. Assert that reading backward, there is no lead
5010  // surrogate, and then forward match the trail surrogate.
5011  match = NegativeLookaroundAgainstReadDirectionAndMatch(
5012  compiler, lead_surrogates, trail_surrogates, on_success, false,
5013  default_flags);
5014  }
5015  result->AddAlternative(GuardedAlternative(match));
5016 }
5017 
5018 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5019  RegExpNode* on_success) {
5020  // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5021  DCHECK(!compiler->read_backward());
5022  Zone* zone = compiler->zone();
5023  // Advance any character. If the character happens to be a lead surrogate and
5024  // we advanced into the middle of a surrogate pair, it will work out, as
5025  // nothing will match from there. We will have to advance again, consuming
5026  // the associated trail surrogate.
5027  ZoneList<CharacterRange>* range = CharacterRange::List(
5028  zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5029  JSRegExp::Flags default_flags = JSRegExp::Flags();
5030  return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
5031  default_flags);
5032 }
5033 
5034 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5035 #ifdef V8_INTL_SUPPORT
5036  DCHECK(CharacterRange::IsCanonical(ranges));
5037 
5038  // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
5039  // See also https://crbug.com/v8/6727.
5040  // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
5041  // which we use frequently internally. But large ranges can also easily be
5042  // created by the user. We might want to have a more general caching mechanism
5043  // for such ranges.
5044  if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
5045 
5046  // Use ICU to compute the case fold closure over the ranges.
5047  icu::UnicodeSet set;
5048  for (int i = 0; i < ranges->length(); i++) {
5049  set.add(ranges->at(i).from(), ranges->at(i).to());
5050  }
5051  ranges->Clear();
5052  set.closeOver(USET_CASE_INSENSITIVE);
5053  // Full case mapping map single characters to multiple characters.
5054  // Those are represented as strings in the set. Remove them so that
5055  // we end up with only simple and common case mappings.
5056  set.removeAllStrings();
5057  for (int i = 0; i < set.getRangeCount(); i++) {
5058  ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5059  zone);
5060  }
5061  // No errors and everything we collected have been ranges.
5062  CharacterRange::Canonicalize(ranges);
5063 #endif // V8_INTL_SUPPORT
5064 }
5065 
5066 
5067 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5068  RegExpNode* on_success) {
5069  set_.Canonicalize();
5070  Zone* zone = compiler->zone();
5071  ZoneList<CharacterRange>* ranges = this->ranges(zone);
5072  if (NeedsUnicodeCaseEquivalents(flags_)) {
5073  AddUnicodeCaseEquivalents(ranges, zone);
5074  }
5075  if (IsUnicode(flags_) && !compiler->one_byte() &&
5076  !contains_split_surrogate()) {
5077  if (is_negated()) {
5078  ZoneList<CharacterRange>* negated =
5079  new (zone) ZoneList<CharacterRange>(2, zone);
5080  CharacterRange::Negate(ranges, negated, zone);
5081  ranges = negated;
5082  }
5083  if (ranges->length() == 0) {
5084  JSRegExp::Flags default_flags;
5085  RegExpCharacterClass* fail =
5086  new (zone) RegExpCharacterClass(zone, ranges, default_flags);
5087  return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5088  }
5089  if (standard_type() == '*') {
5090  return UnanchoredAdvance(compiler, on_success);
5091  } else {
5092  ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5093  UnicodeRangeSplitter splitter(zone, ranges);
5094  AddBmpCharacters(compiler, result, on_success, &splitter);
5095  AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5096  AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5097  AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5098  return result;
5099  }
5100  } else {
5101  return new (zone) TextNode(this, compiler->read_backward(), on_success);
5102  }
5103 }
5104 
5105 
5106 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5107  RegExpAtom* atom1 = (*a)->AsAtom();
5108  RegExpAtom* atom2 = (*b)->AsAtom();
5109  uc16 character1 = atom1->data().at(0);
5110  uc16 character2 = atom2->data().at(0);
5111  if (character1 < character2) return -1;
5112  if (character1 > character2) return 1;
5113  return 0;
5114 }
5115 
5116 
5117 static unibrow::uchar Canonical(
5119  unibrow::uchar c) {
5120  unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5121  int length = canonicalize->get(c, '\0', chars);
5122  DCHECK_LE(length, 1);
5123  unibrow::uchar canonical = c;
5124  if (length == 1) canonical = chars[0];
5125  return canonical;
5126 }
5127 
5128 
5129 int CompareFirstCharCaseIndependent(
5131  RegExpTree* const* a, RegExpTree* const* b) {
5132  RegExpAtom* atom1 = (*a)->AsAtom();
5133  RegExpAtom* atom2 = (*b)->AsAtom();
5134  unibrow::uchar character1 = atom1->data().at(0);
5135  unibrow::uchar character2 = atom2->data().at(0);
5136  if (character1 == character2) return 0;
5137  if (character1 >= 'a' || character2 >= 'a') {
5138  character1 = Canonical(canonicalize, character1);
5139  character2 = Canonical(canonicalize, character2);
5140  }
5141  return static_cast<int>(character1) - static_cast<int>(character2);
5142 }
5143 
5144 
5145 // We can stable sort runs of atoms, since the order does not matter if they
5146 // start with different characters.
5147 // Returns true if any consecutive atoms were found.
5148 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5149  ZoneList<RegExpTree*>* alternatives = this->alternatives();
5150  int length = alternatives->length();
5151  bool found_consecutive_atoms = false;
5152  for (int i = 0; i < length; i++) {
5153  while (i < length) {
5154  RegExpTree* alternative = alternatives->at(i);
5155  if (alternative->IsAtom()) break;
5156  i++;
5157  }
5158  // i is length or it is the index of an atom.
5159  if (i == length) break;
5160  int first_atom = i;
5161  JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
5162  i++;
5163  while (i < length) {
5164  RegExpTree* alternative = alternatives->at(i);
5165  if (!alternative->IsAtom()) break;
5166  if (alternative->AsAtom()->flags() != flags) break;
5167  i++;
5168  }
5169  // Sort atoms to get ones with common prefixes together.
5170  // This step is more tricky if we are in a case-independent regexp,
5171  // because it would change /is|I/ to /I|is/, and order matters when
5172  // the regexp parts don't match only disjoint starting points. To fix
5173  // this we have a version of CompareFirstChar that uses case-
5174  // independent character classes for comparison.
5175  DCHECK_LT(first_atom, alternatives->length());
5176  DCHECK_LE(i, alternatives->length());
5177  DCHECK_LE(first_atom, i);
5178  if (IgnoreCase(flags)) {
5180  compiler->isolate()->regexp_macro_assembler_canonicalize();
5181  auto compare_closure =
5182  [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5183  return CompareFirstCharCaseIndependent(canonicalize, a, b);
5184  };
5185  alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5186  } else {
5187  alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5188  }
5189  if (i - first_atom > 1) found_consecutive_atoms = true;
5190  }
5191  return found_consecutive_atoms;
5192 }
5193 
5194 
5195 // Optimizes ab|ac|az to a(?:b|c|d).
5196 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5197  Zone* zone = compiler->zone();
5198  ZoneList<RegExpTree*>* alternatives = this->alternatives();
5199  int length = alternatives->length();
5200 
5201  int write_posn = 0;
5202  int i = 0;
5203  while (i < length) {
5204  RegExpTree* alternative = alternatives->at(i);
5205  if (!alternative->IsAtom()) {
5206  alternatives->at(write_posn++) = alternatives->at(i);
5207  i++;
5208  continue;
5209  }
5210  RegExpAtom* const atom = alternative->AsAtom();
5211  JSRegExp::Flags flags = atom->flags();
5212  unibrow::uchar common_prefix = atom->data().at(0);
5213  int first_with_prefix = i;
5214  int prefix_length = atom->length();
5215  i++;
5216  while (i < length) {
5217  alternative = alternatives->at(i);
5218  if (!alternative->IsAtom()) break;
5219  RegExpAtom* const atom = alternative->AsAtom();
5220  if (atom->flags() != flags) break;
5221  unibrow::uchar new_prefix = atom->data().at(0);
5222  if (new_prefix != common_prefix) {
5223  if (!IgnoreCase(flags)) break;
5225  compiler->isolate()->regexp_macro_assembler_canonicalize();
5226  new_prefix = Canonical(canonicalize, new_prefix);
5227  common_prefix = Canonical(canonicalize, common_prefix);
5228  if (new_prefix != common_prefix) break;
5229  }
5230  prefix_length = Min(prefix_length, atom->length());
5231  i++;
5232  }
5233  if (i > first_with_prefix + 2) {
5234  // Found worthwhile run of alternatives with common prefix of at least one
5235  // character. The sorting function above did not sort on more than one
5236  // character for reasons of correctness, but there may still be a longer
5237  // common prefix if the terms were similar or presorted in the input.
5238  // Find out how long the common prefix is.
5239  int run_length = i - first_with_prefix;
5240  RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
5241  for (int j = 1; j < run_length && prefix_length > 1; j++) {
5242  RegExpAtom* old_atom =
5243  alternatives->at(j + first_with_prefix)->AsAtom();
5244  for (int k = 1; k < prefix_length; k++) {
5245  if (atom->data().at(k) != old_atom->data().at(k)) {
5246  prefix_length = k;
5247  break;
5248  }
5249  }
5250  }
5251  RegExpAtom* prefix = new (zone)
5252  RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
5253  ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5254  pair->Add(prefix, zone);
5255  ZoneList<RegExpTree*>* suffixes =
5256  new (zone) ZoneList<RegExpTree*>(run_length, zone);
5257  for (int j = 0; j < run_length; j++) {
5258  RegExpAtom* old_atom =
5259  alternatives->at(j + first_with_prefix)->AsAtom();
5260  int len = old_atom->length();
5261  if (len == prefix_length) {
5262  suffixes->Add(new (zone) RegExpEmpty(), zone);
5263  } else {
5264  RegExpTree* suffix = new (zone) RegExpAtom(
5265  old_atom->data().SubVector(prefix_length, old_atom->length()),
5266  flags);
5267  suffixes->Add(suffix, zone);
5268  }
5269  }
5270  pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5271  alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5272  } else {
5273  // Just copy any non-worthwhile alternatives.
5274  for (int j = first_with_prefix; j < i; j++) {
5275  alternatives->at(write_posn++) = alternatives->at(j);
5276  }
5277  }
5278  }
5279  alternatives->Rewind(write_posn); // Trim end of array.
5280 }
5281 
5282 
5283 // Optimizes b|c|z to [bcz].
5284 void RegExpDisjunction::FixSingleCharacterDisjunctions(
5285  RegExpCompiler* compiler) {
5286  Zone* zone = compiler->zone();
5287  ZoneList<RegExpTree*>* alternatives = this->alternatives();
5288  int length = alternatives->length();
5289 
5290  int write_posn = 0;
5291  int i = 0;
5292  while (i < length) {
5293  RegExpTree* alternative = alternatives->at(i);
5294  if (!alternative->IsAtom()) {
5295  alternatives->at(write_posn++) = alternatives->at(i);
5296  i++;
5297  continue;
5298  }
5299  RegExpAtom* const atom = alternative->AsAtom();
5300  if (atom->length() != 1) {
5301  alternatives->at(write_posn++) = alternatives->at(i);
5302  i++;
5303  continue;
5304  }
5305  JSRegExp::Flags flags = atom->flags();
5306  DCHECK_IMPLIES(IsUnicode(flags),
5307  !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5308  bool contains_trail_surrogate =
5309  unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5310  int first_in_run = i;
5311  i++;
5312  // Find a run of single-character atom alternatives that have identical
5313  // flags (case independence and unicode-ness).
5314  while (i < length) {
5315  alternative = alternatives->at(i);
5316  if (!alternative->IsAtom()) break;
5317  RegExpAtom* const atom = alternative->AsAtom();
5318  if (atom->length() != 1) break;
5319  if (atom->flags() != flags) break;
5320  DCHECK_IMPLIES(IsUnicode(flags),
5321  !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5322  contains_trail_surrogate |=
5323  unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5324  i++;
5325  }
5326  if (i > first_in_run + 1) {
5327  // Found non-trivial run of single-character alternatives.
5328  int run_length = i - first_in_run;
5329  ZoneList<CharacterRange>* ranges =
5330  new (zone) ZoneList<CharacterRange>(2, zone);
5331  for (int j = 0; j < run_length; j++) {
5332  RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5333  DCHECK_EQ(old_atom->length(), 1);
5334  ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5335  }
5336  RegExpCharacterClass::CharacterClassFlags character_class_flags;
5337  if (IsUnicode(flags) && contains_trail_surrogate) {
5338  character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5339  }
5340  alternatives->at(write_posn++) = new (zone)
5341  RegExpCharacterClass(zone, ranges, flags, character_class_flags);
5342  } else {
5343  // Just copy any trivial alternatives.
5344  for (int j = first_in_run; j < i; j++) {
5345  alternatives->at(write_posn++) = alternatives->at(j);
5346  }
5347  }
5348  }
5349  alternatives->Rewind(write_posn); // Trim end of array.
5350 }
5351 
5352 
5353 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5354  RegExpNode* on_success) {
5355  ZoneList<RegExpTree*>* alternatives = this->alternatives();
5356 
5357  if (alternatives->length() > 2) {
5358  bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5359  if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5360  FixSingleCharacterDisjunctions(compiler);
5361  if (alternatives->length() == 1) {
5362  return alternatives->at(0)->ToNode(compiler, on_success);
5363  }
5364  }
5365 
5366  int length = alternatives->length();
5367 
5368  ChoiceNode* result =
5369  new(compiler->zone()) ChoiceNode(length, compiler->zone());
5370  for (int i = 0; i < length; i++) {
5371  GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5372  on_success));
5373  result->AddAlternative(alternative);
5374  }
5375  return result;
5376 }
5377 
5378 
5379 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5380  RegExpNode* on_success) {
5381  return ToNode(min(),
5382  max(),
5383  is_greedy(),
5384  body(),
5385  compiler,
5386  on_success);
5387 }
5388 
5389 
5390 // Scoped object to keep track of how much we unroll quantifier loops in the
5391 // regexp graph generator.
5393  public:
5394  static const int kMaxExpansionFactor = 6;
5395  RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5396  : compiler_(compiler),
5397  saved_expansion_factor_(compiler->current_expansion_factor()),
5398  ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5399  DCHECK_LT(0, factor);
5400  if (ok_to_expand_) {
5401  if (factor > kMaxExpansionFactor) {
5402  // Avoid integer overflow of the current expansion factor.
5403  ok_to_expand_ = false;
5404  compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5405  } else {
5406  int new_factor = saved_expansion_factor_ * factor;
5407  ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5408  compiler->set_current_expansion_factor(new_factor);
5409  }
5410  }
5411  }
5412 
5414  compiler_->set_current_expansion_factor(saved_expansion_factor_);
5415  }
5416 
5417  bool ok_to_expand() { return ok_to_expand_; }
5418 
5419  private:
5420  RegExpCompiler* compiler_;
5421  int saved_expansion_factor_;
5422  bool ok_to_expand_;
5423 
5424  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5425 };
5426 
5427 
5428 RegExpNode* RegExpQuantifier::ToNode(int min,
5429  int max,
5430  bool is_greedy,
5431  RegExpTree* body,
5432  RegExpCompiler* compiler,
5433  RegExpNode* on_success,
5434  bool not_at_start) {
5435  // x{f, t} becomes this:
5436  //
5437  // (r++)<-.
5438  // | `
5439  // | (x)
5440  // v ^
5441  // (r=0)-->(?)---/ [if r < t]
5442  // |
5443  // [if r >= f] \----> ...
5444  //
5445 
5446  // 15.10.2.5 RepeatMatcher algorithm.
5447  // The parser has already eliminated the case where max is 0. In the case
5448  // where max_match is zero the parser has removed the quantifier if min was
5449  // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5450 
5451  // If we know that we cannot match zero length then things are a little
5452  // simpler since we don't need to make the special zero length match check
5453  // from step 2.1. If the min and max are small we can unroll a little in
5454  // this case.
5455  static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5456  static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5457  if (max == 0) return on_success; // This can happen due to recursion.
5458  bool body_can_be_empty = (body->min_match() == 0);
5459  int body_start_reg = RegExpCompiler::kNoRegister;
5460  Interval capture_registers = body->CaptureRegisters();
5461  bool needs_capture_clearing = !capture_registers.is_empty();
5462  Zone* zone = compiler->zone();
5463 
5464  if (body_can_be_empty) {
5465  body_start_reg = compiler->AllocateRegister();
5466  } else if (compiler->optimize() && !needs_capture_clearing) {
5467  // Only unroll if there are no captures and the body can't be
5468  // empty.
5469  {
5470  RegExpExpansionLimiter limiter(
5471  compiler, min + ((max != min) ? 1 : 0));
5472  if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5473  int new_max = (max == kInfinity) ? max : max - min;
5474  // Recurse once to get the loop or optional matches after the fixed
5475  // ones.
5476  RegExpNode* answer = ToNode(
5477  0, new_max, is_greedy, body, compiler, on_success, true);
5478  // Unroll the forced matches from 0 to min. This can cause chains of
5479  // TextNodes (which the parser does not generate). These should be
5480  // combined if it turns out they hinder good code generation.
5481  for (int i = 0; i < min; i++) {
5482  answer = body->ToNode(compiler, answer);
5483  }
5484  return answer;
5485  }
5486  }
5487  if (max <= kMaxUnrolledMaxMatches && min == 0) {
5488  DCHECK_LT(0, max); // Due to the 'if' above.
5489  RegExpExpansionLimiter limiter(compiler, max);
5490  if (limiter.ok_to_expand()) {
5491  // Unroll the optional matches up to max.
5492  RegExpNode* answer = on_success;
5493  for (int i = 0; i < max; i++) {
5494  ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5495  if (is_greedy) {
5496  alternation->AddAlternative(
5497  GuardedAlternative(body->ToNode(compiler, answer)));
5498  alternation->AddAlternative(GuardedAlternative(on_success));
5499  } else {
5500  alternation->AddAlternative(GuardedAlternative(on_success));
5501  alternation->AddAlternative(
5502  GuardedAlternative(body->ToNode(compiler, answer)));
5503  }
5504  answer = alternation;
5505  if (not_at_start && !compiler->read_backward()) {
5506  alternation->set_not_at_start();
5507  }
5508  }
5509  return answer;
5510  }
5511  }
5512  }
5513  bool has_min = min > 0;
5514  bool has_max = max < RegExpTree::kInfinity;
5515  bool needs_counter = has_min || has_max;
5516  int reg_ctr = needs_counter
5517  ? compiler->AllocateRegister()
5518  : RegExpCompiler::kNoRegister;
5519  LoopChoiceNode* center = new (zone)
5520  LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5521  if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5522  RegExpNode* loop_return = needs_counter
5523  ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5524  : static_cast<RegExpNode*>(center);
5525  if (body_can_be_empty) {
5526  // If the body can be empty we need to check if it was and then
5527  // backtrack.
5528  loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5529  reg_ctr,
5530  min,
5531  loop_return);
5532  }
5533  RegExpNode* body_node = body->ToNode(compiler, loop_return);
5534  if (body_can_be_empty) {
5535  // If the body can be empty we need to store the start position
5536  // so we can bail out if it was empty.
5537  body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5538  }
5539  if (needs_capture_clearing) {
5540  // Before entering the body of this loop we need to clear captures.
5541  body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5542  }
5543  GuardedAlternative body_alt(body_node);
5544  if (has_max) {
5545  Guard* body_guard =
5546  new(zone) Guard(reg_ctr, Guard::LT, max);
5547  body_alt.AddGuard(body_guard, zone);
5548  }
5549  GuardedAlternative rest_alt(on_success);
5550  if (has_min) {
5551  Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5552  rest_alt.AddGuard(rest_guard, zone);
5553  }
5554  if (is_greedy) {
5555  center->AddLoopAlternative(body_alt);
5556  center->AddContinueAlternative(rest_alt);
5557  } else {
5558  center->AddContinueAlternative(rest_alt);
5559  center->AddLoopAlternative(body_alt);
5560  }
5561  if (needs_counter) {
5562  return ActionNode::SetRegister(reg_ctr, 0, center);
5563  } else {
5564  return center;
5565  }
5566 }
5567 
5568 namespace {
5569 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5570 // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
5571 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5572  RegExpNode* on_success,
5573  RegExpAssertion::AssertionType type,
5574  JSRegExp::Flags flags) {
5575  DCHECK(NeedsUnicodeCaseEquivalents(flags));
5576  Zone* zone = compiler->zone();
5577  ZoneList<CharacterRange>* word_range =
5578  new (zone) ZoneList<CharacterRange>(2, zone);
5579  CharacterRange::AddClassEscape('w', word_range, true, zone);
5580  int stack_register = compiler->UnicodeLookaroundStackRegister();
5581  int position_register = compiler->UnicodeLookaroundPositionRegister();
5582  ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5583  // Add two choices. The (non-)boundary could start with a word or
5584  // a non-word-character.
5585  for (int i = 0; i < 2; i++) {
5586  bool lookbehind_for_word = i == 0;
5587  bool lookahead_for_word =
5588  (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5589  // Look to the left.
5590  RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5591  stack_register, position_register);
5592  RegExpNode* backward = TextNode::CreateForCharacterRanges(
5593  zone, word_range, true, lookbehind.on_match_success(), flags);
5594  // Look to the right.
5595  RegExpLookaround::Builder lookahead(lookahead_for_word,
5596  lookbehind.ForMatch(backward),
5597  stack_register, position_register);
5598  RegExpNode* forward = TextNode::CreateForCharacterRanges(
5599  zone, word_range, false, lookahead.on_match_success(), flags);
5600  result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5601  }
5602  return result;
5603 }
5604 } // anonymous namespace
5605 
5606 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5607  RegExpNode* on_success) {
5608  NodeInfo info;
5609  Zone* zone = compiler->zone();
5610 
5611  switch (assertion_type()) {
5612  case START_OF_LINE:
5613  return AssertionNode::AfterNewline(on_success);
5614  case START_OF_INPUT:
5615  return AssertionNode::AtStart(on_success);
5616  case BOUNDARY:
5617  return NeedsUnicodeCaseEquivalents(flags_)
5618  ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
5619  flags_)
5620  : AssertionNode::AtBoundary(on_success);
5621  case NON_BOUNDARY:
5622  return NeedsUnicodeCaseEquivalents(flags_)
5623  ? BoundaryAssertionAsLookaround(compiler, on_success,
5624  NON_BOUNDARY, flags_)
5625  : AssertionNode::AtNonBoundary(on_success);
5626  case END_OF_INPUT:
5627  return AssertionNode::AtEnd(on_success);
5628  case END_OF_LINE: {
5629  // Compile $ in multiline regexps as an alternation with a positive
5630  // lookahead in one side and an end-of-input on the other side.
5631  // We need two registers for the lookahead.
5632  int stack_pointer_register = compiler->AllocateRegister();
5633  int position_register = compiler->AllocateRegister();
5634  // The ChoiceNode to distinguish between a newline and end-of-input.
5635  ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5636  // Create a newline atom.
5637  ZoneList<CharacterRange>* newline_ranges =
5638  new(zone) ZoneList<CharacterRange>(3, zone);
5639  CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5640  JSRegExp::Flags default_flags = JSRegExp::Flags();
5641  RegExpCharacterClass* newline_atom =
5642  new (zone) RegExpCharacterClass('n', default_flags);
5643  TextNode* newline_matcher = new (zone) TextNode(
5644  newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5645  stack_pointer_register, position_register,
5646  0, // No captures inside.
5647  -1, // Ignored if no captures.
5648  on_success));
5649  // Create an end-of-input matcher.
5650  RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5651  stack_pointer_register,
5652  position_register,
5653  newline_matcher);
5654  // Add the two alternatives to the ChoiceNode.
5655  GuardedAlternative eol_alternative(end_of_line);
5656  result->AddAlternative(eol_alternative);
5657  GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5658  result->AddAlternative(end_alternative);
5659  return result;
5660  }
5661  default:
5662  UNREACHABLE();
5663  }
5664  return on_success;
5665 }
5666 
5667 
5668 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5669  RegExpNode* on_success) {
5670  return new (compiler->zone())
5671  BackReferenceNode(RegExpCapture::StartRegister(index()),
5672  RegExpCapture::EndRegister(index()), flags_,
5673  compiler->read_backward(), on_success);
5674 }
5675 
5676 
5677 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5678  RegExpNode* on_success) {
5679  return on_success;
5680 }
5681 
5682 
5683 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5684  int stack_pointer_register,
5685  int position_register,
5686  int capture_register_count,
5687  int capture_register_start)
5688  : is_positive_(is_positive),
5689  on_success_(on_success),
5690  stack_pointer_register_(stack_pointer_register),
5691  position_register_(position_register) {
5692  if (is_positive_) {
5693  on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5694  stack_pointer_register, position_register, capture_register_count,
5695  capture_register_start, on_success_);
5696  } else {
5697  Zone* zone = on_success_->zone();
5698  on_match_success_ = new (zone) NegativeSubmatchSuccess(
5699  stack_pointer_register, position_register, capture_register_count,
5700  capture_register_start, zone);
5701  }
5702 }
5703 
5704 
5705 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5706  if (is_positive_) {
5707  return ActionNode::BeginSubmatch(stack_pointer_register_,
5708  position_register_, match);
5709  } else {
5710  Zone* zone = on_success_->zone();
5711  // We use a ChoiceNode to represent the negative lookaround. The first
5712  // alternative is the negative match. On success, the end node backtracks.
5713  // On failure, the second alternative is tried and leads to success.
5714  // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5715  // first exit when calculating quick checks.
5716  ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5717  GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5718  return ActionNode::BeginSubmatch(stack_pointer_register_,
5719  position_register_, choice_node);
5720  }
5721 }
5722 
5723 
5724 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5725  RegExpNode* on_success) {
5726  int stack_pointer_register = compiler->AllocateRegister();
5727  int position_register = compiler->AllocateRegister();
5728 
5729  const int registers_per_capture = 2;
5730  const int register_of_first_capture = 2;
5731  int register_count = capture_count_ * registers_per_capture;
5732  int register_start =
5733  register_of_first_capture + capture_from_ * registers_per_capture;
5734 
5735  RegExpNode* result;
5736  bool was_reading_backward = compiler->read_backward();
5737  compiler->set_read_backward(type() == LOOKBEHIND);
5738  Builder builder(is_positive(), on_success, stack_pointer_register,
5739  position_register, register_count, register_start);
5740  RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5741  result = builder.ForMatch(match);
5742  compiler->set_read_backward(was_reading_backward);
5743  return result;
5744 }
5745 
5746 
5747 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5748  RegExpNode* on_success) {
5749  return ToNode(body(), index(), compiler, on_success);
5750 }
5751 
5752 
5753 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5754  int index,
5755  RegExpCompiler* compiler,
5756  RegExpNode* on_success) {
5757  DCHECK_NOT_NULL(body);
5758  int start_reg = RegExpCapture::StartRegister(index);
5759  int end_reg = RegExpCapture::EndRegister(index);
5760  if (compiler->read_backward()) std::swap(start_reg, end_reg);
5761  RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5762  RegExpNode* body_node = body->ToNode(compiler, store_end);
5763  return ActionNode::StorePosition(start_reg, true, body_node);
5764 }
5765 
5766 
5767 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5768  RegExpNode* on_success) {
5769  ZoneList<RegExpTree*>* children = nodes();
5770  RegExpNode* current = on_success;
5771  if (compiler->read_backward()) {
5772  for (int i = 0; i < children->length(); i++) {
5773  current = children->at(i)->ToNode(compiler, current);
5774  }
5775  } else {
5776  for (int i = children->length() - 1; i >= 0; i--) {
5777  current = children->at(i)->ToNode(compiler, current);
5778  }
5779  }
5780  return current;
5781 }
5782 
5783 
5784 static void AddClass(const int* elmv,
5785  int elmc,
5786  ZoneList<CharacterRange>* ranges,
5787  Zone* zone) {
5788  elmc--;
5789  DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5790  for (int i = 0; i < elmc; i += 2) {
5791  DCHECK(elmv[i] < elmv[i + 1]);
5792  ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5793  }
5794 }
5795 
5796 
5797 static void AddClassNegated(const int *elmv,
5798  int elmc,
5799  ZoneList<CharacterRange>* ranges,
5800  Zone* zone) {
5801  elmc--;
5802  DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5803  DCHECK_NE(0x0000, elmv[0]);
5804  DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
5805  uc16 last = 0x0000;
5806  for (int i = 0; i < elmc; i += 2) {
5807  DCHECK(last <= elmv[i] - 1);
5808  DCHECK(elmv[i] < elmv[i + 1]);
5809  ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5810  last = elmv[i + 1];
5811  }
5812  ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5813 }
5814 
5815 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5816  bool add_unicode_case_equivalents,
5817  Zone* zone) {
5818  if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5819  // See #sec-runtime-semantics-wordcharacters-abstract-operation
5820  // In case of unicode and ignore_case, we need to create the closure over
5821  // case equivalent characters before negating.
5822  ZoneList<CharacterRange>* new_ranges =
5823  new (zone) ZoneList<CharacterRange>(2, zone);
5824  AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5825  AddUnicodeCaseEquivalents(new_ranges, zone);
5826  if (type == 'W') {
5827  ZoneList<CharacterRange>* negated =
5828  new (zone) ZoneList<CharacterRange>(2, zone);
5829  CharacterRange::Negate(new_ranges, negated, zone);
5830  new_ranges = negated;
5831  }
5832  ranges->AddAll(*new_ranges, zone);
5833  return;
5834  }
5835  AddClassEscape(type, ranges, zone);
5836 }
5837 
5838 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5839  Zone* zone) {
5840  switch (type) {
5841  case 's':
5842  AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5843  break;
5844  case 'S':
5845  AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5846  break;
5847  case 'w':
5848  AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5849  break;
5850  case 'W':
5851  AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5852  break;
5853  case 'd':
5854  AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5855  break;
5856  case 'D':
5857  AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5858  break;
5859  case '.':
5860  AddClassNegated(kLineTerminatorRanges,
5861  kLineTerminatorRangeCount,
5862  ranges,
5863  zone);
5864  break;
5865  // This is not a character range as defined by the spec but a
5866  // convenient shorthand for a character class that matches any
5867  // character.
5868  case '*':
5869  ranges->Add(CharacterRange::Everything(), zone);
5870  break;
5871  // This is the set of characters matched by the $ and ^ symbols
5872  // in multiline mode.
5873  case 'n':
5874  AddClass(kLineTerminatorRanges,
5875  kLineTerminatorRangeCount,
5876  ranges,
5877  zone);
5878  break;
5879  default:
5880  UNREACHABLE();
5881  }
5882 }
5883 
5884 
5885 Vector<const int> CharacterRange::GetWordBounds() {
5886  return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5887 }
5888 
5889 // static
5890 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5891  ZoneList<CharacterRange>* ranges,
5892  bool is_one_byte) {
5893  CharacterRange::Canonicalize(ranges);
5894  int range_count = ranges->length();
5895  for (int i = 0; i < range_count; i++) {
5896  CharacterRange range = ranges->at(i);
5897  uc32 bottom = range.from();
5898  if (bottom > String::kMaxUtf16CodeUnit) continue;
5899  uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5900  // Nothing to be done for surrogates.
5901  if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
5902  if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5903  if (bottom > String::kMaxOneByteCharCode) continue;
5904  if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5905  }
5906  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5907  if (top == bottom) {
5908  // If this is a singleton we just expand the one character.
5909  int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5910  for (int i = 0; i < length; i++) {
5911  uc32 chr = chars[i];
5912  if (chr != bottom) {
5913  ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5914  }
5915  }
5916  } else {
5917  // If this is a range we expand the characters block by block, expanding
5918  // contiguous subranges (blocks) one at a time. The approach is as
5919  // follows. For a given start character we look up the remainder of the
5920  // block that contains it (represented by the end point), for instance we
5921  // find 'z' if the character is 'c'. A block is characterized by the
5922  // property that all characters uncanonicalize in the same way, except
5923  // that each entry in the result is incremented by the distance from the
5924  // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5925  // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5926  // we've found the end point we look up its uncanonicalization and
5927  // produce a range for each element. For instance for [c-f] we look up
5928  // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5929  // it is not already contained in the input, so [c-f] will be skipped but
5930  // [C-F] will be added. If this range is not completely contained in a
5931  // block we do this for all the blocks covered by the range (handling
5932  // characters that is not in a block as a "singleton block").
5933  unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5934  int pos = bottom;
5935  while (pos <= top) {
5936  int length =
5937  isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5938  uc32 block_end;
5939  if (length == 0) {
5940  block_end = pos;
5941  } else {
5942  DCHECK_EQ(1, length);
5943  block_end = equivalents[0];
5944  }
5945  int end = (block_end > top) ? top : block_end;
5946  length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5947  equivalents);
5948  for (int i = 0; i < length; i++) {
5949  uc32 c = equivalents[i];
5950  uc32 range_from = c - (block_end - pos);
5951  uc32 range_to = c - (block_end - end);
5952  if (!(bottom <= range_from && range_to <= top)) {
5953  ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5954  }
5955  }
5956  pos = end + 1;
5957  }
5958  }
5959  }
5960 }
5961 
5962 
5963 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5964  DCHECK_NOT_NULL(ranges);
5965  int n = ranges->length();
5966  if (n <= 1) return true;
5967  int max = ranges->at(0).to();
5968  for (int i = 1; i < n; i++) {
5969  CharacterRange next_range = ranges->at(i);
5970  if (next_range.from() <= max + 1) return false;
5971  max = next_range.to();
5972  }
5973  return true;
5974 }
5975 
5976 
5977 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5978  if (ranges_ == nullptr) {
5979  ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5980  CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
5981  }
5982  return ranges_;
5983 }
5984 
5985 
5986 // Move a number of elements in a zonelist to another position
5987 // in the same list. Handles overlapping source and target areas.
5988 static void MoveRanges(ZoneList<CharacterRange>* list,
5989  int from,
5990  int to,
5991  int count) {
5992  // Ranges are potentially overlapping.
5993  if (from < to) {
5994  for (int i = count - 1; i >= 0; i--) {
5995  list->at(to + i) = list->at(from + i);
5996  }
5997  } else {
5998  for (int i = 0; i < count; i++) {
5999  list->at(to + i) = list->at(from + i);
6000  }
6001  }
6002 }
6003 
6004 
6005 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6006  int count,
6007  CharacterRange insert) {
6008  // Inserts a range into list[0..count[, which must be sorted
6009  // by from value and non-overlapping and non-adjacent, using at most
6010  // list[0..count] for the result. Returns the number of resulting
6011  // canonicalized ranges. Inserting a range may collapse existing ranges into
6012  // fewer ranges, so the return value can be anything in the range 1..count+1.
6013  uc32 from = insert.from();
6014  uc32 to = insert.to();
6015  int start_pos = 0;
6016  int end_pos = count;
6017  for (int i = count - 1; i >= 0; i--) {
6018  CharacterRange current = list->at(i);
6019  if (current.from() > to + 1) {
6020  end_pos = i;
6021  } else if (current.to() + 1 < from) {
6022  start_pos = i + 1;
6023  break;
6024  }
6025  }
6026 
6027  // Inserted range overlaps, or is adjacent to, ranges at positions
6028  // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6029  // not affected by the insertion.
6030  // If start_pos == end_pos, the range must be inserted before start_pos.
6031  // if start_pos < end_pos, the entire range from start_pos to end_pos
6032  // must be merged with the insert range.
6033 
6034  if (start_pos == end_pos) {
6035  // Insert between existing ranges at position start_pos.
6036  if (start_pos < count) {
6037  MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6038  }
6039  list->at(start_pos) = insert;
6040  return count + 1;
6041  }
6042  if (start_pos + 1 == end_pos) {
6043  // Replace single existing range at position start_pos.
6044  CharacterRange to_replace = list->at(start_pos);
6045  int new_from = Min(to_replace.from(), from);
6046  int new_to = Max(to_replace.to(), to);
6047  list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6048  return count;
6049  }
6050  // Replace a number of existing ranges from start_pos to end_pos - 1.
6051  // Move the remaining ranges down.
6052 
6053  int new_from = Min(list->at(start_pos).from(), from);
6054  int new_to = Max(list->at(end_pos - 1).to(), to);
6055  if (end_pos < count) {
6056  MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6057  }
6058  list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6059  return count - (end_pos - start_pos) + 1;
6060 }
6061 
6062 
6063 void CharacterSet::Canonicalize() {
6064  // Special/default classes are always considered canonical. The result
6065  // of calling ranges() will be sorted.
6066  if (ranges_ == nullptr) return;
6067  CharacterRange::Canonicalize(ranges_);
6068 }
6069 
6070 
6071 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6072  if (character_ranges->length() <= 1) return;
6073  // Check whether ranges are already canonical (increasing, non-overlapping,
6074  // non-adjacent).
6075  int n = character_ranges->length();
6076  int max = character_ranges->at(0).to();
6077  int i = 1;
6078  while (i < n) {
6079  CharacterRange current = character_ranges->at(i);
6080  if (current.from() <= max + 1) {
6081  break;
6082  }
6083  max = current.to();
6084  i++;
6085  }
6086  // Canonical until the i'th range. If that's all of them, we are done.
6087  if (i == n) return;
6088 
6089  // The ranges at index i and forward are not canonicalized. Make them so by
6090  // doing the equivalent of insertion sort (inserting each into the previous
6091  // list, in order).
6092  // Notice that inserting a range can reduce the number of ranges in the
6093  // result due to combining of adjacent and overlapping ranges.
6094  int read = i; // Range to insert.
6095  int num_canonical = i; // Length of canonicalized part of list.
6096  do {
6097  num_canonical = InsertRangeInCanonicalList(character_ranges,
6098  num_canonical,
6099  character_ranges->at(read));
6100  read++;
6101  } while (read < n);
6102  character_ranges->Rewind(num_canonical);
6103 
6104  DCHECK(CharacterRange::IsCanonical(character_ranges));
6105 }
6106 
6107 
6108 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6109  ZoneList<CharacterRange>* negated_ranges,
6110  Zone* zone) {
6111  DCHECK(CharacterRange::IsCanonical(ranges));
6112  DCHECK_EQ(0, negated_ranges->length());
6113  int range_count = ranges->length();
6114  uc32 from = 0;
6115  int i = 0;
6116  if (range_count > 0 && ranges->at(0).from() == 0) {
6117  from = ranges->at(0).to() + 1;
6118  i = 1;
6119  }
6120  while (i < range_count) {
6121  CharacterRange range = ranges->at(i);
6122  negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6123  from = range.to() + 1;
6124  i++;
6125  }
6126  if (from < String::kMaxCodePoint) {
6127  negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6128  zone);
6129  }
6130 }
6131 
6132 
6133 // -------------------------------------------------------------------
6134 // Splay tree
6135 
6136 
6137 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6138  if (Get(value))
6139  return this;
6140  if (successors(zone) != nullptr) {
6141  for (int i = 0; i < successors(zone)->length(); i++) {
6142  OutSet* successor = successors(zone)->at(i);
6143  if (successor->Get(value))
6144  return successor;
6145  }
6146  } else {
6147  successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6148  }
6149  OutSet* result = new(zone) OutSet(first_, remaining_);
6150  result->Set(value, zone);
6151  successors(zone)->Add(result, zone);
6152  return result;
6153 }
6154 
6155 
6156 void OutSet::Set(unsigned value, Zone *zone) {
6157  if (value < kFirstLimit) {
6158  first_ |= (1 << value);
6159  } else {
6160  if (remaining_ == nullptr)
6161  remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6162  if (remaining_->is_empty() || !remaining_->Contains(value))
6163  remaining_->Add(value, zone);
6164  }
6165 }
6166 
6167 
6168 bool OutSet::Get(unsigned value) const {
6169  if (value < kFirstLimit) {
6170  return (first_ & (1 << value)) != 0;
6171  } else if (remaining_ == nullptr) {
6172  return false;
6173  } else {
6174  return remaining_->Contains(value);
6175  }
6176 }
6177 
6178 
6179 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6180 
6181 
6182 void DispatchTable::AddRange(CharacterRange full_range, int value,
6183  Zone* zone) {
6184  CharacterRange current = full_range;
6185  if (tree()->is_empty()) {
6186  // If this is the first range we just insert into the table.
6187  ZoneSplayTree<Config>::Locator loc;
6188  bool inserted = tree()->Insert(current.from(), &loc);
6189  DCHECK(inserted);
6190  USE(inserted);
6191  loc.set_value(Entry(current.from(), current.to(),
6192  empty()->Extend(value, zone)));
6193  return;
6194  }
6195  // First see if there is a range to the left of this one that
6196  // overlaps.
6197  ZoneSplayTree<Config>::Locator loc;
6198  if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6199  Entry* entry = &loc.value();
6200  // If we've found a range that overlaps with this one, and it
6201  // starts strictly to the left of this one, we have to fix it
6202  // because the following code only handles ranges that start on
6203  // or after the start point of the range we're adding.
6204  if (entry->from() < current.from() && entry->to() >= current.from()) {
6205  // Snap the overlapping range in half around the start point of
6206  // the range we're adding.
6207  CharacterRange left =
6208  CharacterRange::Range(entry->from(), current.from() - 1);
6209  CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6210  // The left part of the overlapping range doesn't overlap.
6211  // Truncate the whole entry to be just the left part.
6212  entry->set_to(left.to());
6213  // The right part is the one that overlaps. We add this part
6214  // to the map and let the next step deal with merging it with
6215  // the range we're adding.
6216  ZoneSplayTree<Config>::Locator loc;
6217  bool inserted = tree()->Insert(right.from(), &loc);
6218  DCHECK(inserted);
6219  USE(inserted);
6220  loc.set_value(Entry(right.from(),
6221  right.to(),
6222  entry->out_set()));
6223  }
6224  }
6225  while (current.is_valid()) {
6226  if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6227  (loc.value().from() <= current.to()) &&
6228  (loc.value().to() >= current.from())) {
6229  Entry* entry = &loc.value();
6230  // We have overlap. If there is space between the start point of
6231  // the range we're adding and where the overlapping range starts
6232  // then we have to add a range covering just that space.
6233  if (current.from() < entry->from()) {
6234  ZoneSplayTree<Config>::Locator ins;
6235  bool inserted = tree()->Insert(current.from(), &ins);
6236  DCHECK(inserted);
6237  USE(inserted);
6238  ins.set_value(Entry(current.from(),
6239  entry->from() - 1,
6240  empty()->Extend(value, zone)));
6241  current.set_from(entry->from());
6242  }
6243  DCHECK_EQ(current.from(), entry->from());
6244  // If the overlapping range extends beyond the one we want to add
6245  // we have to snap the right part off and add it separately.
6246  if (entry->to() > current.to()) {
6247  ZoneSplayTree<Config>::Locator ins;
6248  bool inserted = tree()->Insert(current.to() + 1, &ins);
6249  DCHECK(inserted);
6250  USE(inserted);
6251  ins.set_value(Entry(current.to() + 1,
6252  entry->to(),
6253  entry->out_set()));
6254  entry->set_to(current.to());
6255  }
6256  DCHECK(entry->to() <= current.to());
6257  // The overlapping range is now completely contained by the range
6258  // we're adding so we can just update it and move the start point
6259  // of the range we're adding just past it.
6260  entry->AddValue(value, zone);
6261  DCHECK(entry->to() + 1 > current.from());
6262  current.set_from(entry->to() + 1);
6263  } else {
6264  // There is no overlap so we can just add the range
6265  ZoneSplayTree<Config>::Locator ins;
6266  bool inserted = tree()->Insert(current.from(), &ins);
6267  DCHECK(inserted);
6268  USE(inserted);
6269  ins.set_value(Entry(current.from(),
6270  current.to(),
6271  empty()->Extend(value, zone)));
6272  break;
6273  }
6274  }
6275 }
6276 
6277 
6278 OutSet* DispatchTable::Get(uc32 value) {
6279  ZoneSplayTree<Config>::Locator loc;
6280  if (!tree()->FindGreatestLessThan(value, &loc))
6281  return empty();
6282  Entry* entry = &loc.value();
6283  if (value <= entry->to())
6284  return entry->out_set();
6285  else
6286  return empty();
6287 }
6288 
6289 
6290 // -------------------------------------------------------------------
6291 // Analysis
6292 
6293 
6294 void Analysis::EnsureAnalyzed(RegExpNode* that) {
6295  StackLimitCheck check(isolate());
6296  if (check.HasOverflowed()) {
6297  fail("Stack overflow");
6298  return;
6299  }
6300  if (that->info()->been_analyzed || that->info()->being_analyzed)
6301  return;
6302  that->info()->being_analyzed = true;
6303  that->Accept(this);
6304  that->info()->being_analyzed = false;
6305  that->info()->been_analyzed = true;
6306 }
6307 
6308 
6309 void Analysis::VisitEnd(EndNode* that) {
6310  // nothing to do
6311 }
6312 
6313 
6314 void TextNode::CalculateOffsets() {
6315  int element_count = elements()->length();
6316  // Set up the offsets of the elements relative to the start. This is a fixed
6317  // quantity since a TextNode can only contain fixed-width things.
6318  int cp_offset = 0;
6319  for (int i = 0; i < element_count; i++) {
6320  TextElement& elm = elements()->at(i);
6321  elm.set_cp_offset(cp_offset);
6322  cp_offset += elm.length();
6323  }
6324 }
6325 
6326 
6327 void Analysis::VisitText(TextNode* that) {
6328  that->MakeCaseIndependent(isolate(), is_one_byte_);
6329  EnsureAnalyzed(that->on_success());
6330  if (!has_failed()) {
6331  that->CalculateOffsets();
6332  }
6333 }
6334 
6335 
6336 void Analysis::VisitAction(ActionNode* that) {
6337  RegExpNode* target = that->on_success();
6338  EnsureAnalyzed(target);
6339  if (!has_failed()) {
6340  // If the next node is interested in what it follows then this node
6341  // has to be interested too so it can pass the information on.
6342  that->info()->AddFromFollowing(target->info());
6343  }
6344 }
6345 
6346 
6347 void Analysis::VisitChoice(ChoiceNode* that) {
6348  NodeInfo* info = that->info();
6349  for (int i = 0; i < that->alternatives()->length(); i++) {
6350  RegExpNode* node = that->alternatives()->at(i).node();
6351  EnsureAnalyzed(node);
6352  if (has_failed()) return;
6353  // Anything the following nodes need to know has to be known by
6354  // this node also, so it can pass it on.
6355  info->AddFromFollowing(node->info());
6356  }
6357 }
6358 
6359 
6360 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6361  NodeInfo* info = that->info();
6362  for (int i = 0; i < that->alternatives()->length(); i++) {
6363  RegExpNode* node = that->alternatives()->at(i).node();
6364  if (node != that->loop_node()) {
6365  EnsureAnalyzed(node);
6366  if (has_failed()) return;
6367  info->AddFromFollowing(node->info());
6368  }
6369  }
6370  // Check the loop last since it may need the value of this node
6371  // to get a correct result.
6372  EnsureAnalyzed(that->loop_node());
6373  if (!has_failed()) {
6374  info->AddFromFollowing(that->loop_node()->info());
6375  }
6376 }
6377 
6378 
6379 void Analysis::VisitBackReference(BackReferenceNode* that) {
6380  EnsureAnalyzed(that->on_success());
6381 }
6382 
6383 
6384 void Analysis::VisitAssertion(AssertionNode* that) {
6385  EnsureAnalyzed(that->on_success());
6386 }
6387 
6388 
6389 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6390  BoyerMooreLookahead* bm,
6391  bool not_at_start) {
6392  // Working out the set of characters that a backreference can match is too
6393  // hard, so we just say that any character can match.
6394  bm->SetRest(offset);
6395  SaveBMInfo(bm, not_at_start, offset);
6396 }
6397 
6398 
6399 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6400  RegExpMacroAssembler::kTableSize);
6401 
6402 
6403 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6404  BoyerMooreLookahead* bm, bool not_at_start) {
6405  ZoneList<GuardedAlternative>* alts = alternatives();
6406  budget = (budget - 1) / alts->length();
6407  for (int i = 0; i < alts->length(); i++) {
6408  GuardedAlternative& alt = alts->at(i);
6409  if (alt.guards() != nullptr && alt.guards()->length() != 0) {
6410  bm->SetRest(offset); // Give up trying to fill in info.
6411  SaveBMInfo(bm, not_at_start, offset);
6412  return;
6413  }
6414  alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6415  }
6416  SaveBMInfo(bm, not_at_start, offset);
6417 }
6418 
6419 
6420 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6421  BoyerMooreLookahead* bm, bool not_at_start) {
6422  if (initial_offset >= bm->length()) return;
6423  int offset = initial_offset;
6424  int max_char = bm->max_char();
6425  for (int i = 0; i < elements()->length(); i++) {
6426  if (offset >= bm->length()) {
6427  if (initial_offset == 0) set_bm_info(not_at_start, bm);
6428  return;
6429  }
6430  TextElement text = elements()->at(i);
6431  if (text.text_type() == TextElement::ATOM) {
6432  RegExpAtom* atom = text.atom();
6433  for (int j = 0; j < atom->length(); j++, offset++) {
6434  if (offset >= bm->length()) {
6435  if (initial_offset == 0) set_bm_info(not_at_start, bm);
6436  return;
6437  }
6438  uc16 character = atom->data()[j];
6439  if (IgnoreCase(atom->flags())) {
6440  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6441  int length = GetCaseIndependentLetters(
6442  isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6443  chars);
6444  for (int j = 0; j < length; j++) {
6445  bm->Set(offset, chars[j]);
6446  }
6447  } else {
6448  if (character <= max_char) bm->Set(offset, character);
6449  }
6450  }
6451  } else {
6452  DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6453  RegExpCharacterClass* char_class = text.char_class();
6454  ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6455  if (char_class->is_negated()) {
6456  bm->SetAll(offset);
6457  } else {
6458  for (int k = 0; k < ranges->length(); k++) {
6459  CharacterRange& range = ranges->at(k);
6460  if (range.from() > max_char) continue;
6461  int to = Min(max_char, static_cast<int>(range.to()));
6462  bm->SetInterval(offset, Interval(range.from(), to));
6463  }
6464  }
6465  offset++;
6466  }
6467  }
6468  if (offset >= bm->length()) {
6469  if (initial_offset == 0) set_bm_info(not_at_start, bm);
6470  return;
6471  }
6472  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6473  true); // Not at start after a text node.
6474  if (initial_offset == 0) set_bm_info(not_at_start, bm);
6475 }
6476 
6477 
6478 // -------------------------------------------------------------------
6479 // Dispatch table construction
6480 
6481 
6482 void DispatchTableConstructor::VisitEnd(EndNode* that) {
6483  AddRange(CharacterRange::Everything());
6484 }
6485 
6486 
6487 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6488  node->set_being_calculated(true);
6489  ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6490  for (int i = 0; i < alternatives->length(); i++) {
6491  set_choice_index(i);
6492  alternatives->at(i).node()->Accept(this);
6493  }
6494  node->set_being_calculated(false);
6495 }
6496 
6497 
6499  public:
6500  explicit AddDispatchRange(DispatchTableConstructor* constructor)
6501  : constructor_(constructor) { }
6502  void Call(uc32 from, DispatchTable::Entry entry);
6503  private:
6504  DispatchTableConstructor* constructor_;
6505 };
6506 
6507 
6508 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6509  constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6510 }
6511 
6512 
6513 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6514  if (node->being_calculated())
6515  return;
6516  DispatchTable* table = node->GetTable(ignore_case_);
6517  AddDispatchRange adder(this);
6518  table->ForEach(&adder);
6519 }
6520 
6521 
6522 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6523  // TODO(160): Find the node that we refer back to and propagate its start
6524  // set back to here. For now we just accept anything.
6525  AddRange(CharacterRange::Everything());
6526 }
6527 
6528 
6529 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6530  RegExpNode* target = that->on_success();
6531  target->Accept(this);
6532 }
6533 
6534 
6535 static int CompareRangeByFrom(const CharacterRange* a,
6536  const CharacterRange* b) {
6537  return Compare<uc16>(a->from(), b->from());
6538 }
6539 
6540 
6541 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6542  ranges->Sort(CompareRangeByFrom);
6543  uc16 last = 0;
6544  for (int i = 0; i < ranges->length(); i++) {
6545  CharacterRange range = ranges->at(i);
6546  if (last < range.from())
6547  AddRange(CharacterRange::Range(last, range.from() - 1));
6548  if (range.to() >= last) {
6549  if (range.to() == String::kMaxCodePoint) {
6550  return;
6551  } else {
6552  last = range.to() + 1;
6553  }
6554  }
6555  }
6556  AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6557 }
6558 
6559 
6560 void DispatchTableConstructor::VisitText(TextNode* that) {
6561  TextElement elm = that->elements()->at(0);
6562  switch (elm.text_type()) {
6563  case TextElement::ATOM: {
6564  uc16 c = elm.atom()->data()[0];
6565  AddRange(CharacterRange::Range(c, c));
6566  break;
6567  }
6568  case TextElement::CHAR_CLASS: {
6569  RegExpCharacterClass* tree = elm.char_class();
6570  ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6571  if (tree->is_negated()) {
6572  AddInverse(ranges);
6573  } else {
6574  for (int i = 0; i < ranges->length(); i++)
6575  AddRange(ranges->at(i));
6576  }
6577  break;
6578  }
6579  default: {
6580  UNIMPLEMENTED();
6581  }
6582  }
6583 }
6584 
6585 
6586 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6587  RegExpNode* target = that->on_success();
6588  target->Accept(this);
6589 }
6590 
6591 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6592  RegExpNode* on_success,
6593  JSRegExp::Flags flags) {
6594  // If the regexp matching starts within a surrogate pair, step back
6595  // to the lead surrogate and start matching from there.
6596  DCHECK(!compiler->read_backward());
6597  Zone* zone = compiler->zone();
6598  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6599  zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6600  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6601  zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6602 
6603  ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6604 
6605  int stack_register = compiler->UnicodeLookaroundStackRegister();
6606  int position_register = compiler->UnicodeLookaroundPositionRegister();
6607  RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6608  zone, lead_surrogates, true, on_success, flags);
6609  RegExpLookaround::Builder builder(true, step_back, stack_register,
6610  position_register);
6611  RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6612  zone, trail_surrogates, false, builder.on_match_success(), flags);
6613 
6614  optional_step_back->AddAlternative(
6615  GuardedAlternative(builder.ForMatch(match_trail)));
6616  optional_step_back->AddAlternative(GuardedAlternative(on_success));
6617 
6618  return optional_step_back;
6619 }
6620 
6621 
6622 RegExpEngine::CompilationResult RegExpEngine::Compile(
6623  Isolate* isolate, Zone* zone, RegExpCompileData* data,
6624  JSRegExp::Flags flags, Handle<String> pattern,
6625  Handle<String> sample_subject, bool is_one_byte) {
6626  if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6627  return IrregexpRegExpTooBig(isolate);
6628  }
6629  bool is_sticky = IsSticky(flags);
6630  bool is_global = IsGlobal(flags);
6631  bool is_unicode = IsUnicode(flags);
6632  RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
6633 
6634  if (compiler.optimize())
6635  compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
6636 
6637  // Sample some characters from the middle of the string.
6638  static const int kSampleSize = 128;
6639 
6640  sample_subject = String::Flatten(isolate, sample_subject);
6641  int chars_sampled = 0;
6642  int half_way = (sample_subject->length() - kSampleSize) / 2;
6643  for (int i = Max(0, half_way);
6644  i < sample_subject->length() && chars_sampled < kSampleSize;
6645  i++, chars_sampled++) {
6646  compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6647  }
6648 
6649  // Wrap the body of the regexp in capture #0.
6650  RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6651  0,
6652  &compiler,
6653  compiler.accept());
6654  RegExpNode* node = captured_body;
6655  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6656  bool is_start_anchored = data->tree->IsAnchoredAtStart();
6657  int max_length = data->tree->max_match();
6658  if (!is_start_anchored && !is_sticky) {
6659  // Add a .*? at the beginning, outside the body capture, unless
6660  // this expression is anchored at the beginning or sticky.
6661  JSRegExp::Flags default_flags = JSRegExp::Flags();
6662  RegExpNode* loop_node = RegExpQuantifier::ToNode(
6663  0, RegExpTree::kInfinity, false,
6664  new (zone) RegExpCharacterClass('*', default_flags), &compiler,
6665  captured_body, data->contains_anchor);
6666 
6667  if (data->contains_anchor) {
6668  // Unroll loop once, to take care of the case that might start
6669  // at the start of input.
6670  ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6671  first_step_node->AddAlternative(GuardedAlternative(captured_body));
6672  first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6673  new (zone) RegExpCharacterClass('*', default_flags), false,
6674  loop_node)));
6675  node = first_step_node;
6676  } else {
6677  node = loop_node;
6678  }
6679  }
6680  if (is_one_byte) {
6681  node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6682  // Do it again to propagate the new nodes to places where they were not
6683  // put because they had not been calculated yet.
6684  if (node != nullptr) {
6685  node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6686  }
6687  } else if (is_unicode && (is_global || is_sticky)) {
6688  node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags);
6689  }
6690 
6691  if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
6692  data->node = node;
6693  Analysis analysis(isolate, is_one_byte);
6694  analysis.EnsureAnalyzed(node);
6695  if (analysis.has_failed()) {
6696  const char* error_message = analysis.error_message();
6697  return CompilationResult(isolate, error_message);
6698  }
6699 
6700  // Create the correct assembler for the architecture.
6701 #ifndef V8_INTERPRETED_REGEXP
6702  // Native regexp implementation.
6703 
6704  NativeRegExpMacroAssembler::Mode mode =
6705  is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6706  : NativeRegExpMacroAssembler::UC16;
6707 
6708 #if V8_TARGET_ARCH_IA32
6709  RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6710  (data->capture_count + 1) * 2);
6711 #elif V8_TARGET_ARCH_X64
6712  RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6713  (data->capture_count + 1) * 2);
6714 #elif V8_TARGET_ARCH_ARM
6715  RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6716  (data->capture_count + 1) * 2);
6717 #elif V8_TARGET_ARCH_ARM64
6718  RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6719  (data->capture_count + 1) * 2);
6720 #elif V8_TARGET_ARCH_S390
6721  RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6722  (data->capture_count + 1) * 2);
6723 #elif V8_TARGET_ARCH_PPC
6724  RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6725  (data->capture_count + 1) * 2);
6726 #elif V8_TARGET_ARCH_MIPS
6727  RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6728  (data->capture_count + 1) * 2);
6729 #elif V8_TARGET_ARCH_MIPS64
6730  RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6731  (data->capture_count + 1) * 2);
6732 #else
6733 #error "Unsupported architecture"
6734 #endif
6735 
6736 #else // V8_INTERPRETED_REGEXP
6737  // Interpreted regexp implementation.
6738  EmbeddedVector<byte, 1024> codes;
6739  RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6740 #endif // V8_INTERPRETED_REGEXP
6741 
6742  macro_assembler.set_slow_safe(TooMuchRegExpCode(isolate, pattern));
6743 
6744  // Inserted here, instead of in Assembler, because it depends on information
6745  // in the AST that isn't replicated in the Node structure.
6746  static const int kMaxBacksearchLimit = 1024;
6747  if (is_end_anchored && !is_start_anchored && !is_sticky &&
6748  max_length < kMaxBacksearchLimit) {
6749  macro_assembler.SetCurrentPositionFromEnd(max_length);
6750  }
6751 
6752  if (is_global) {
6753  RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6754  if (data->tree->min_match() > 0) {
6755  mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6756  } else if (is_unicode) {
6757  mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6758  }
6759  macro_assembler.set_global_mode(mode);
6760  }
6761 
6762  return compiler.Assemble(isolate, &macro_assembler, node, data->capture_count,
6763  pattern);
6764 }
6765 
6766 bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
6767  Heap* heap = isolate->heap();
6768  bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6769  if (heap->isolate()->total_regexp_code_generated() >
6770  RegExpImpl::kRegExpCompiledLimit &&
6771  heap->CommittedMemoryExecutable() >
6772  RegExpImpl::kRegExpExecutableMemoryLimit) {
6773  too_much = true;
6774  }
6775  return too_much;
6776 }
6777 
6778 Object* RegExpResultsCache::Lookup(Heap* heap, String key_string,
6779  Object* key_pattern,
6780  FixedArray* last_match_cache,
6781  ResultsCacheType type) {
6782  FixedArray cache;
6783  if (!key_string->IsInternalizedString()) return Smi::kZero;
6784  if (type == STRING_SPLIT_SUBSTRINGS) {
6785  DCHECK(key_pattern->IsString());
6786  if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6787  cache = heap->string_split_cache();
6788  } else {
6789  DCHECK(type == REGEXP_MULTIPLE_INDICES);
6790  DCHECK(key_pattern->IsFixedArray());
6791  cache = heap->regexp_multiple_cache();
6792  }
6793 
6794  uint32_t hash = key_string->Hash();
6795  uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6796  ~(kArrayEntriesPerCacheEntry - 1));
6797  if (cache->get(index + kStringOffset) != key_string ||
6798  cache->get(index + kPatternOffset) != key_pattern) {
6799  index =
6800  ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6801  if (cache->get(index + kStringOffset) != key_string ||
6802  cache->get(index + kPatternOffset) != key_pattern) {
6803  return Smi::kZero;
6804  }
6805  }
6806 
6807  *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6808  return cache->get(index + kArrayOffset);
6809 }
6810 
6811 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6812  Handle<Object> key_pattern,
6813  Handle<FixedArray> value_array,
6814  Handle<FixedArray> last_match_cache,
6815  ResultsCacheType type) {
6816  Factory* factory = isolate->factory();
6817  Handle<FixedArray> cache;
6818  if (!key_string->IsInternalizedString()) return;
6819  if (type == STRING_SPLIT_SUBSTRINGS) {
6820  DCHECK(key_pattern->IsString());
6821  if (!key_pattern->IsInternalizedString()) return;
6822  cache = factory->string_split_cache();
6823  } else {
6824  DCHECK(type == REGEXP_MULTIPLE_INDICES);
6825  DCHECK(key_pattern->IsFixedArray());
6826  cache = factory->regexp_multiple_cache();
6827  }
6828 
6829  uint32_t hash = key_string->Hash();
6830  uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6831  ~(kArrayEntriesPerCacheEntry - 1));
6832  if (cache->get(index + kStringOffset) == Smi::kZero) {
6833  cache->set(index + kStringOffset, *key_string);
6834  cache->set(index + kPatternOffset, *key_pattern);
6835  cache->set(index + kArrayOffset, *value_array);
6836  cache->set(index + kLastMatchOffset, *last_match_cache);
6837  } else {
6838  uint32_t index2 =
6839  ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6840  if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6841  cache->set(index2 + kStringOffset, *key_string);
6842  cache->set(index2 + kPatternOffset, *key_pattern);
6843  cache->set(index2 + kArrayOffset, *value_array);
6844  cache->set(index2 + kLastMatchOffset, *last_match_cache);
6845  } else {
6846  cache->set(index2 + kStringOffset, Smi::kZero);
6847  cache->set(index2 + kPatternOffset, Smi::kZero);
6848  cache->set(index2 + kArrayOffset, Smi::kZero);
6849  cache->set(index2 + kLastMatchOffset, Smi::kZero);
6850  cache->set(index + kStringOffset, *key_string);
6851  cache->set(index + kPatternOffset, *key_pattern);
6852  cache->set(index + kArrayOffset, *value_array);
6853  cache->set(index + kLastMatchOffset, *last_match_cache);
6854  }
6855  }
6856  // If the array is a reasonably short list of substrings, convert it into a
6857  // list of internalized strings.
6858  if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6859  for (int i = 0; i < value_array->length(); i++) {
6860  Handle<String> str(String::cast(value_array->get(i)), isolate);
6861  Handle<String> internalized_str = factory->InternalizeString(str);
6862  value_array->set(i, *internalized_str);
6863  }
6864  }
6865  // Convert backing store to a copy-on-write array.
6866  value_array->set_map_no_write_barrier(
6867  ReadOnlyRoots(isolate).fixed_cow_array_map());
6868 }
6869 
6870 void RegExpResultsCache::Clear(FixedArray cache) {
6871  for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6872  cache->set(i, Smi::kZero);
6873  }
6874 }
6875 
6876 } // namespace internal
6877 } // namespace v8
Definition: libplatform.h:13