V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
regexp-macro-assembler.cc
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-macro-assembler.h"
6 
7 #include "src/assembler.h"
8 #include "src/isolate-inl.h"
9 #include "src/regexp/regexp-stack.h"
10 #include "src/simulator.h"
11 #include "src/unicode-inl.h"
12 
13 #ifdef V8_INTL_SUPPORT
14 #include "unicode/uchar.h"
15 #endif // V8_INTL_SUPPORT
16 
17 namespace v8 {
18 namespace internal {
19 
20 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
21  : slow_safe_compiler_(false),
22  global_mode_(NOT_GLOBAL),
23  isolate_(isolate),
24  zone_(zone) {}
25 
26 RegExpMacroAssembler::~RegExpMacroAssembler() = default;
27 
28 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
29  Address byte_offset2,
30  size_t byte_length,
31  Isolate* isolate) {
33  isolate->regexp_macro_assembler_canonicalize();
34  // This function is not allowed to cause a garbage collection.
35  // A GC might move the calling generated code and invalidate the
36  // return address on the stack.
37  DCHECK_EQ(0, byte_length % 2);
38  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
39  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
40  size_t length = byte_length >> 1;
41 
42 #ifdef V8_INTL_SUPPORT
43  if (isolate == nullptr) {
44  for (size_t i = 0; i < length; i++) {
45  uc32 c1 = substring1[i];
46  uc32 c2 = substring2[i];
47  if (unibrow::Utf16::IsLeadSurrogate(c1)) {
48  // Non-BMP characters do not have case-equivalents in the BMP.
49  // Both have to be non-BMP for them to be able to match.
50  if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
51  if (i + 1 < length) {
52  uc16 c1t = substring1[i + 1];
53  uc16 c2t = substring2[i + 1];
54  if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
55  unibrow::Utf16::IsTrailSurrogate(c2t)) {
56  c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
57  c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
58  i++;
59  }
60  }
61  }
62  c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
63  c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
64  if (c1 != c2) return 0;
65  }
66  return 1;
67  }
68 #endif // V8_INTL_SUPPORT
69  DCHECK_NOT_NULL(isolate);
70  for (size_t i = 0; i < length; i++) {
71  unibrow::uchar c1 = substring1[i];
72  unibrow::uchar c2 = substring2[i];
73  if (c1 != c2) {
74  unibrow::uchar s1[1] = {c1};
75  canonicalize->get(c1, '\0', s1);
76  if (s1[0] != c2) {
77  unibrow::uchar s2[1] = {c2};
78  canonicalize->get(c2, '\0', s2);
79  if (s1[0] != s2[0]) {
80  return 0;
81  }
82  }
83  }
84  }
85  return 1;
86 }
87 
88 
89 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
90  Label* on_failure) {
91  Label ok;
92  // Check that current character is not a trail surrogate.
93  LoadCurrentCharacter(cp_offset, &ok);
94  CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
95  // Check that previous character is not a lead surrogate.
96  LoadCurrentCharacter(cp_offset - 1, &ok);
97  CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
98  Bind(&ok);
99 }
100 
101 void RegExpMacroAssembler::CheckPosition(int cp_offset,
102  Label* on_outside_input) {
103  LoadCurrentCharacter(cp_offset, on_outside_input, true);
104 }
105 
106 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
107  Label* on_no_match) {
108  return false;
109 }
110 
111 #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
112 
113 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
114  Zone* zone)
115  : RegExpMacroAssembler(isolate, zone) {}
116 
117 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
118 
119 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
120  return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
121 }
122 
123 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
124  String subject, int start_index) {
125  if (subject->IsConsString()) {
126  subject = ConsString::cast(subject)->first();
127  } else if (subject->IsSlicedString()) {
128  start_index += SlicedString::cast(subject)->offset();
129  subject = SlicedString::cast(subject)->parent();
130  }
131  if (subject->IsThinString()) {
132  subject = ThinString::cast(subject)->actual();
133  }
134  DCHECK_LE(0, start_index);
135  DCHECK_LE(start_index, subject->length());
136  if (subject->IsSeqOneByteString()) {
137  return reinterpret_cast<const byte*>(
138  SeqOneByteString::cast(subject)->GetChars() + start_index);
139  } else if (subject->IsSeqTwoByteString()) {
140  return reinterpret_cast<const byte*>(
141  SeqTwoByteString::cast(subject)->GetChars() + start_index);
142  } else if (subject->IsExternalOneByteString()) {
143  return reinterpret_cast<const byte*>(
144  ExternalOneByteString::cast(subject)->GetChars() + start_index);
145  } else {
146  DCHECK(subject->IsExternalTwoByteString());
147  return reinterpret_cast<const byte*>(
148  ExternalTwoByteString::cast(subject)->GetChars() + start_index);
149  }
150 }
151 
152 int NativeRegExpMacroAssembler::CheckStackGuardState(
153  Isolate* isolate, int start_index, bool is_direct_call,
154  Address* return_address, Code re_code, Address* subject,
155  const byte** input_start, const byte** input_end) {
156  AllowHeapAllocation allow_allocation;
157  DCHECK(re_code->raw_instruction_start() <= *return_address);
158  DCHECK(*return_address <= re_code->raw_instruction_end());
159  int return_value = 0;
160  // Prepare for possible GC.
161  HandleScope handles(isolate);
162  Handle<Code> code_handle(re_code, isolate);
163  Handle<String> subject_handle(String::cast(ObjectPtr(*subject)), isolate);
164  bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
165 
166  StackLimitCheck check(isolate);
167  bool js_has_overflowed = check.JsHasOverflowed();
168 
169  if (is_direct_call) {
170  // Direct calls from JavaScript can be interrupted in two ways:
171  // 1. A real stack overflow, in which case we let the caller throw the
172  // exception.
173  // 2. The stack guard was used to interrupt execution for another purpose,
174  // forcing the call through the runtime system.
175  return_value = js_has_overflowed ? EXCEPTION : RETRY;
176  } else if (js_has_overflowed) {
177  isolate->StackOverflow();
178  return_value = EXCEPTION;
179  } else {
180  Object* result = isolate->stack_guard()->HandleInterrupts();
181  if (result->IsException(isolate)) return_value = EXCEPTION;
182  }
183 
184  DisallowHeapAllocation no_gc;
185 
186  if (*code_handle != re_code) { // Return address no longer valid
187  intptr_t delta = code_handle->address() - re_code->address();
188  // Overwrite the return address on the stack.
189  *return_address += delta;
190  }
191 
192  // If we continue, we need to update the subject string addresses.
193  if (return_value == 0) {
194  // String encoding might have changed.
195  if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
196  // If we changed between an LATIN1 and an UC16 string, the specialized
197  // code cannot be used, and we need to restart regexp matching from
198  // scratch (including, potentially, compiling a new version of the code).
199  return_value = RETRY;
200  } else {
201  *subject = subject_handle->ptr();
202  intptr_t byte_length = *input_end - *input_start;
203  *input_start = StringCharacterPosition(*subject_handle, start_index);
204  *input_end = *input_start + byte_length;
205  }
206  }
207  return return_value;
208 }
209 
210 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
211  Handle<Code> regexp_code,
212  Handle<String> subject,
213  int* offsets_vector,
214  int offsets_vector_length,
215  int previous_index,
216  Isolate* isolate) {
217 
218  DCHECK(subject->IsFlat());
219  DCHECK_LE(0, previous_index);
220  DCHECK_LE(previous_index, subject->length());
221 
222  // No allocations before calling the regexp, but we can't use
223  // DisallowHeapAllocation, since regexps might be preempted, and another
224  // thread might do allocation anyway.
225 
226  String subject_ptr = *subject;
227  // Character offsets into string.
228  int start_offset = previous_index;
229  int char_length = subject_ptr->length() - start_offset;
230  int slice_offset = 0;
231 
232  // The string has been flattened, so if it is a cons string it contains the
233  // full string in the first part.
234  if (StringShape(subject_ptr).IsCons()) {
235  DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
236  subject_ptr = ConsString::cast(subject_ptr)->first();
237  } else if (StringShape(subject_ptr).IsSliced()) {
238  SlicedString slice = SlicedString::cast(subject_ptr);
239  subject_ptr = slice->parent();
240  slice_offset = slice->offset();
241  }
242  if (StringShape(subject_ptr).IsThin()) {
243  subject_ptr = ThinString::cast(subject_ptr)->actual();
244  }
245  // Ensure that an underlying string has the same representation.
246  bool is_one_byte = subject_ptr->IsOneByteRepresentation();
247  DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
248  // String is now either Sequential or External
249  int char_size_shift = is_one_byte ? 0 : 1;
250 
251  DisallowHeapAllocation no_gc;
252  const byte* input_start =
253  StringCharacterPosition(subject_ptr, start_offset + slice_offset);
254  int byte_length = char_length << char_size_shift;
255  const byte* input_end = input_start + byte_length;
256  Result res = Execute(*regexp_code,
257  *subject,
258  start_offset,
259  input_start,
260  input_end,
261  offsets_vector,
262  offsets_vector_length,
263  isolate);
264  return res;
265 }
266 
267 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
268  Code code,
269  String input, // This needs to be the unpacked (sliced, cons) string.
270  int start_offset, const byte* input_start, const byte* input_end,
271  int* output, int output_size, Isolate* isolate) {
272  // Ensure that the minimum stack has been allocated.
273  RegExpStackScope stack_scope(isolate);
274  Address stack_base = stack_scope.stack()->stack_base();
275 
276  int direct_call = 0;
277 
278  using RegexpMatcherSig = int(
279  Address input_string, int start_offset, // NOLINT(readability/casting)
280  const byte* input_start, const byte* input_end, int* output,
281  int output_size, Address stack_base, int direct_call, Isolate* isolate);
282 
283  auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
284  int result = fn.Call(input.ptr(), start_offset, input_start, input_end,
285  output, output_size, stack_base, direct_call, isolate);
286  DCHECK(result >= RETRY);
287 
288  if (result == EXCEPTION && !isolate->has_pending_exception()) {
289  // We detected a stack overflow (on the backtrack stack) in RegExp code,
290  // but haven't created the exception yet. Additionally, we allow heap
291  // allocation because even though it invalidates {input_start} and
292  // {input_end}, we are about to return anyway.
293  AllowHeapAllocation allow_allocation;
294  isolate->StackOverflow();
295  }
296  return static_cast<Result>(result);
297 }
298 
299 // clang-format off
300 const byte NativeRegExpMacroAssembler::word_character_map[] = {
301  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
302  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
303  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
304  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
305 
306  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
307  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
308  0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7'
309  0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
310 
311  0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G'
312  0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O'
313  0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W'
314  0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_'
315 
316  0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g'
317  0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o'
318  0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w'
319  0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
320  // Latin-1 range
321  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
323  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
325 
326  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
328  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
330 
331  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
333  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
334  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
335 
336  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339  0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
340 };
341 // clang-format on
342 
343 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
344  Address* stack_base,
345  Isolate* isolate) {
346  RegExpStack* regexp_stack = isolate->regexp_stack();
347  size_t size = regexp_stack->stack_capacity();
348  Address old_stack_base = regexp_stack->stack_base();
349  DCHECK(old_stack_base == *stack_base);
350  DCHECK(stack_pointer <= old_stack_base);
351  DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
352  Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
353  if (new_stack_base == kNullAddress) {
354  return kNullAddress;
355  }
356  *stack_base = new_stack_base;
357  intptr_t stack_content_size = old_stack_base - stack_pointer;
358  return new_stack_base - stack_content_size;
359 }
360 
361 #endif // V8_INTERPRETED_REGEXP
362 
363 } // namespace internal
364 } // namespace v8
Definition: libplatform.h:13