V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
regexp-parser.cc
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-parser.h"
6 
7 #include <vector>
8 
9 #include "src/char-predicates-inl.h"
10 #include "src/heap/factory.h"
11 #include "src/isolate.h"
12 #include "src/objects-inl.h"
13 #include "src/ostreams.h"
14 #include "src/regexp/jsregexp.h"
15 #include "src/regexp/property-sequences.h"
16 #include "src/utils.h"
17 
18 #ifdef V8_INTL_SUPPORT
19 #include "unicode/uniset.h"
20 #endif // V8_INTL_SUPPORT
21 
22 namespace v8 {
23 namespace internal {
24 
// Constructs a parser that reads the pattern from |in| and primes it by
// loading the first character.
//
// |error| is the handle slot that ReportError() writes the message into.
// |flags| become the top-level flags of the pattern. The capture lists start
// out null and all counters start at zero; the trailing Advance() call loads
// the first input character into current_ (or kEndMarker if there is none).
25 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
26  JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
27  : isolate_(isolate),
28  zone_(zone),
29  error_(error),
30  captures_(nullptr),
31  named_captures_(nullptr),
32  named_back_references_(nullptr),
33  in_(in),
34  current_(kEndMarker),
35  top_level_flags_(flags),
36  next_pos_(0),
37  captures_started_(0),
38  capture_count_(0),
39  has_more_(true),
40  simple_(false),
41  contains_anchor_(false),
42  is_scanned_for_captures_(false),
43  has_named_captures_(false),
44  failed_(false) {
  // Prime current_ with the first character of the input.
45  Advance();
46 }
47 
48 template <bool update_position>
49 inline uc32 RegExpParser::ReadNext() {
50  int position = next_pos_;
51  uc32 c0 = in()->Get(position);
52  position++;
53  // Read the whole surrogate pair in case of unicode flag, if possible.
54  if (unicode() && position < in()->length() &&
55  unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
56  uc16 c1 = in()->Get(position);
57  if (unibrow::Utf16::IsTrailSurrogate(c1)) {
58  c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
59  position++;
60  }
61  }
62  if (update_position) next_pos_ = position;
63  return c0;
64 }
65 
66 
67 uc32 RegExpParser::Next() {
68  if (has_next()) {
69  return ReadNext<false>();
70  } else {
71  return kEndMarker;
72  }
73 }
74 
75 void RegExpParser::Advance() {
76  if (has_next()) {
77  StackLimitCheck check(isolate());
78  if (check.HasOverflowed()) {
79  if (FLAG_abort_on_stack_or_string_length_overflow) {
80  FATAL("Aborting on stack overflow");
81  }
82  ReportError(CStrVector(
83  MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
84  } else if (zone()->excess_allocation()) {
85  ReportError(CStrVector("Regular expression too large"));
86  } else {
87  current_ = ReadNext<true>();
88  }
89  } else {
90  current_ = kEndMarker;
91  // Advance so that position() points to 1-after-the-last-character. This is
92  // important so that Reset() to this position works correctly.
93  next_pos_ = in()->length() + 1;
94  has_more_ = false;
95  }
96 }
97 
98 
99 void RegExpParser::Reset(int pos) {
100  next_pos_ = pos;
101  has_more_ = (pos < in()->length());
102  Advance();
103 }
104 
105 void RegExpParser::Advance(int dist) {
106  next_pos_ += dist - 1;
107  Advance();
108 }
109 
110 
111 bool RegExpParser::simple() { return simple_; }
112 
113 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
114  switch (c) {
115  case '^':
116  case '$':
117  case '\\':
118  case '.':
119  case '*':
120  case '+':
121  case '?':
122  case '(':
123  case ')':
124  case '[':
125  case ']':
126  case '{':
127  case '}':
128  case '|':
129  case '/':
130  return true;
131  default:
132  break;
133  }
134  return false;
135 }
136 
137 
138 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
139  if (failed_) return nullptr; // Do not overwrite any existing error.
140  failed_ = true;
141  *error_ = isolate()
142  ->factory()
143  ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
144  .ToHandleChecked();
145  // Zip to the end to make sure the no more input is read.
146  current_ = kEndMarker;
147  next_pos_ = in()->length();
148  return nullptr;
149 }
150 
// Error-propagation helper, written as the LAST thing inside a call's
// argument list, e.g. `Foo(arg CHECK_FAILED);` or `Foo(CHECK_FAILED);`.
// The macro closes the call's parenthesis, returns nullptr from the enclosing
// function if the parser has failed, and then opens `((void)0` so that the
// `)` and `;` the caller wrote still form a valid statement. Only usable in
// functions where `return nullptr;` is a valid return.
151 #define CHECK_FAILED ); \
152  if (failed_) return nullptr; \
153  ((void)0
154 
155 // Pattern ::
156 // Disjunction
157 RegExpTree* RegExpParser::ParsePattern() {
158  RegExpTree* result = ParseDisjunction(CHECK_FAILED);
159  PatchNamedBackReferences(CHECK_FAILED);
160  DCHECK(!has_more());
161  // If the result of parsing is a literal string atom, and it has the
162  // same length as the input, then the atom is identical to the input.
163  if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
164  simple_ = true;
165  }
166  return result;
167 }
168 
169 
170 // Disjunction ::
171 // Alternative
172 // Alternative | Disjunction
173 // Alternative ::
174 // [empty]
175 // Term Alternative
176 // Term ::
177 // Assertion
178 // Atom
179 // Atom Quantifier
//
// Parses a full disjunction in a single loop. Nested groups do not recurse:
// '(' switches to a new RegExpParserState (see ParseOpenParenthesis) and ')'
// folds the finished group into the previous state's builder.
//
// Each loop iteration has two phases. The first switch parses one term; a
// `continue` there means the term cannot take a quantifier (assertions, '|',
// group open), while a `break` falls through to the second switch, which
// parses an optional quantifier for the atom just added.
//
// Returns the parsed tree, or nullptr once an error has been reported.
180 RegExpTree* RegExpParser::ParseDisjunction() {
181  // Used to store current state while parsing subexpressions.
182  RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
183  0, nullptr, top_level_flags_, zone());
184  RegExpParserState* state = &initial_state;
185  // Cache the builder in a local variable for quick access.
186  RegExpBuilder* builder = initial_state.builder();
187  while (true) {
188  switch (current()) {
189  case kEndMarker:
190  if (state->IsSubexpression()) {
191  // Inside a parenthesized group when hitting end of input.
192  return ReportError(CStrVector("Unterminated group"));
193  }
194  DCHECK_EQ(INITIAL, state->group_type());
195  // Parsing completed successfully.
196  return builder->ToRegExp();
197  case ')': {
198  if (!state->IsSubexpression()) {
199  return ReportError(CStrVector("Unmatched ')'"));
200  }
201  DCHECK_NE(INITIAL, state->group_type());
202 
203  Advance();
204  // End disjunction parsing and convert builder content to new single
205  // regexp atom.
206  RegExpTree* body = builder->ToRegExp();
207 
208  int end_capture_index = captures_started();
209 
210  int capture_index = state->capture_index();
211  SubexpressionType group_type = state->group_type();
212 
213  // Build result of subexpression.
214  if (group_type == CAPTURE) {
215  if (state->IsNamedCapture()) {
216  CreateNamedCaptureAtIndex(state->capture_name(),
217  capture_index CHECK_FAILED);
218  }
219  RegExpCapture* capture = GetCapture(capture_index);
220  capture->set_body(body);
221  body = capture;
222  } else if (group_type == GROUPING) {
223  body = new (zone()) RegExpGroup(body);
224  } else {
225  DCHECK(group_type == POSITIVE_LOOKAROUND ||
226  group_type == NEGATIVE_LOOKAROUND);
227  bool is_positive = (group_type == POSITIVE_LOOKAROUND);
228  body = new (zone()) RegExpLookaround(
229  body, is_positive, end_capture_index - capture_index,
230  capture_index, state->lookaround_type());
231  }
232 
233  // Restore previous state.
234  state = state->previous_state();
235  builder = state->builder();
236 
237  builder->AddAtom(body);
238  // For compatibility with JSC and ES3, we allow quantifiers after
239  // lookaheads, and break in all cases.
240  break;
241  }
242  case '|': {
243  Advance();
244  builder->NewAlternative();
245  continue;
246  }
247  case '*':
248  case '+':
249  case '?':
  // A quantifier at the start of a term has no atom to apply to.
250  return ReportError(CStrVector("Nothing to repeat"));
251  case '^': {
252  Advance();
253  if (builder->multiline()) {
254  builder->AddAssertion(new (zone()) RegExpAssertion(
255  RegExpAssertion::START_OF_LINE, builder->flags()));
256  } else {
257  builder->AddAssertion(new (zone()) RegExpAssertion(
258  RegExpAssertion::START_OF_INPUT, builder->flags()));
259  set_contains_anchor();
260  }
261  continue;
262  }
263  case '$': {
264  Advance();
265  RegExpAssertion::AssertionType assertion_type =
266  builder->multiline() ? RegExpAssertion::END_OF_LINE
267  : RegExpAssertion::END_OF_INPUT;
268  builder->AddAssertion(
269  new (zone()) RegExpAssertion(assertion_type, builder->flags()));
270  continue;
271  }
272  case '.': {
273  Advance();
274  ZoneList<CharacterRange>* ranges =
275  new (zone()) ZoneList<CharacterRange>(2, zone());
276 
277  if (builder->dotall()) {
278  // Everything.
279  CharacterRange::AddClassEscape('*', ranges, false, zone());
280  } else {
281  // Everything except \x0A, \x0D, \u2028 and \u2029
282  CharacterRange::AddClassEscape('.', ranges, false, zone());
283  }
284 
285  RegExpCharacterClass* cc =
286  new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
287  builder->AddCharacterClass(cc);
288  break;
289  }
290  case '(': {
  // May return the same state (inline flag switch) or a new nested state.
291  state = ParseOpenParenthesis(state CHECK_FAILED);
292  builder = state->builder();
293  continue;
294  }
295  case '[': {
296  RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
297  builder->AddCharacterClass(cc->AsCharacterClass());
298  break;
299  }
300  // Atom ::
301  // \ AtomEscape
302  case '\\':
  // current() is the backslash; Next() peeks at the escaped character, so
  // most cases below consume both with Advance(2).
303  switch (Next()) {
304  case kEndMarker:
305  return ReportError(CStrVector("\\ at end of pattern"));
306  case 'b':
307  Advance(2);
308  builder->AddAssertion(new (zone()) RegExpAssertion(
309  RegExpAssertion::BOUNDARY, builder->flags()));
310  continue;
311  case 'B':
312  Advance(2);
313  builder->AddAssertion(new (zone()) RegExpAssertion(
314  RegExpAssertion::NON_BOUNDARY, builder->flags()));
315  continue;
316  // AtomEscape ::
317  // CharacterClassEscape
318  //
319  // CharacterClassEscape :: one of
320  // d D s S w W
321  case 'd':
322  case 'D':
323  case 's':
324  case 'S':
325  case 'w':
326  case 'W': {
327  uc32 c = Next();
328  Advance(2);
329  ZoneList<CharacterRange>* ranges =
330  new (zone()) ZoneList<CharacterRange>(2, zone());
331  CharacterRange::AddClassEscape(
332  c, ranges, unicode() && builder->ignore_case(), zone());
333  RegExpCharacterClass* cc = new (zone())
334  RegExpCharacterClass(zone(), ranges, builder->flags());
335  builder->AddCharacterClass(cc);
336  break;
337  }
338  case 'p':
339  case 'P': {
340  uc32 p = Next();
341  Advance(2);
342  if (unicode()) {
343  ZoneList<CharacterRange>* ranges =
344  new (zone()) ZoneList<CharacterRange>(2, zone());
345  std::vector<char> name_1, name_2;
346  if (ParsePropertyClassName(&name_1, &name_2)) {
347  if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
348  RegExpCharacterClass* cc = new (zone())
349  RegExpCharacterClass(zone(), ranges, builder->flags());
350  builder->AddCharacterClass(cc);
351  break;
352  }
  // Not a property class; a non-negated, single-name escape may still
  // name a property sequence.
353  if (p == 'p' && name_2.empty()) {
354  RegExpTree* sequence = GetPropertySequence(name_1);
355  if (sequence != nullptr) {
356  builder->AddAtom(sequence);
357  break;
358  }
359  }
360  }
361  return ReportError(CStrVector("Invalid property name"));
362  } else {
  // Without the unicode flag, \p and \P are identity escapes.
363  builder->AddCharacter(p);
364  }
365  break;
366  }
367  case '1':
368  case '2':
369  case '3':
370  case '4':
371  case '5':
372  case '6':
373  case '7':
374  case '8':
375  case '9': {
376  int index = 0;
377  bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
378  if (is_backref) {
379  if (state->IsInsideCaptureGroup(index)) {
380  // The back reference is inside the capture group it refers to.
381  // Nothing can possibly have been captured yet, so we use empty
382  // instead. This ensures that, when checking a back reference,
383  // the capture registers of the referenced capture are either
384  // both set or both cleared.
385  builder->AddEmpty();
386  } else {
387  RegExpCapture* capture = GetCapture(index);
388  RegExpTree* atom =
389  new (zone()) RegExpBackReference(capture, builder->flags());
390  builder->AddAtom(atom);
391  }
392  break;
393  }
394  // With /u, no identity escapes except for syntax characters
395  // are allowed. Otherwise, all identity escapes are allowed.
396  if (unicode()) {
397  return ReportError(CStrVector("Invalid escape"));
398  }
  // Not a back reference: ParseBackReferenceIndex reset the position, so
  // current() is the backslash again and Next() is the first digit.
399  uc32 first_digit = Next();
400  if (first_digit == '8' || first_digit == '9') {
401  builder->AddCharacter(first_digit);
402  Advance(2);
403  break;
404  }
  // Digits 1-7 fall through and are parsed as an octal escape.
405  V8_FALLTHROUGH;
406  }
407  case '0': {
  // Step past the backslash; current() is now the first digit.
408  Advance();
409  if (unicode() && Next() >= '0' && Next() <= '9') {
410  // With /u, decimal escape with leading 0 are not parsed as octal.
411  return ReportError(CStrVector("Invalid decimal escape"));
412  }
413  uc32 octal = ParseOctalLiteral();
414  builder->AddCharacter(octal);
415  break;
416  }
417  // ControlEscape :: one of
418  // f n r t v
419  case 'f':
420  Advance(2);
421  builder->AddCharacter('\f');
422  break;
423  case 'n':
424  Advance(2);
425  builder->AddCharacter('\n');
426  break;
427  case 'r':
428  Advance(2);
429  builder->AddCharacter('\r');
430  break;
431  case 't':
432  Advance(2);
433  builder->AddCharacter('\t');
434  break;
435  case 'v':
436  Advance(2);
437  builder->AddCharacter('\v');
438  break;
439  case 'c': {
  // Step past the backslash; current() is 'c' and Next() is the candidate
  // control letter.
440  Advance();
441  uc32 controlLetter = Next();
442  // Special case if it is an ASCII letter.
443  // Convert lower case letters to uppercase.
444  uc32 letter = controlLetter & ~('a' ^ 'A');
445  if (letter < 'A' || 'Z' < letter) {
446  // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
447  // Read the backslash as a literal character instead of as
448  // starting an escape.
449  // ES#prod-annexB-ExtendedPatternCharacter
450  if (unicode()) {
451  // With /u, invalid escapes are not treated as identity escapes.
452  return ReportError(CStrVector("Invalid unicode escape"));
453  }
  // current() is still 'c'; it is parsed as an ordinary character on the
  // next loop iteration.
454  builder->AddCharacter('\\');
455  } else {
456  Advance(2);
457  builder->AddCharacter(controlLetter & 0x1F);
458  }
459  break;
460  }
461  case 'x': {
462  Advance(2);
463  uc32 value;
464  if (ParseHexEscape(2, &value)) {
465  builder->AddCharacter(value);
466  } else if (!unicode()) {
  // ParseHexEscape reset the position on failure; treat \x as a literal 'x'.
467  builder->AddCharacter('x');
468  } else {
469  // With /u, invalid escapes are not treated as identity escapes.
470  return ReportError(CStrVector("Invalid escape"));
471  }
472  break;
473  }
474  case 'u': {
475  Advance(2);
476  uc32 value;
477  if (ParseUnicodeEscape(&value)) {
478  builder->AddEscapedUnicodeCharacter(value);
479  } else if (!unicode()) {
480  builder->AddCharacter('u');
481  } else {
482  // With /u, invalid escapes are not treated as identity escapes.
483  return ReportError(CStrVector("Invalid Unicode escape"));
484  }
485  break;
486  }
487  case 'k':
488  // Either an identity escape or a named back-reference. The two
489  // interpretations are mutually exclusive: '\k' is interpreted as
490  // an identity escape for non-Unicode patterns without named
491  // capture groups, and as the beginning of a named back-reference
492  // in all other cases.
493  if (unicode() || HasNamedCaptures()) {
494  Advance(2);
495  ParseNamedBackReference(builder, state CHECK_FAILED);
496  break;
497  }
498  V8_FALLTHROUGH;
499  default:
  // Step past the backslash; current() is now the escaped character.
500  Advance();
501  // With /u, no identity escapes except for syntax characters
502  // are allowed. Otherwise, all identity escapes are allowed.
503  if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
504  builder->AddCharacter(current());
505  Advance();
506  } else {
507  return ReportError(CStrVector("Invalid escape"));
508  }
509  break;
510  }
511  break;
512  case '{': {
  // A well-formed interval quantifier here has nothing to repeat. If it
  // does not parse, the position was reset and '{' is handled as a lone
  // bracket below.
513  int dummy;
514  bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
515  if (parsed) return ReportError(CStrVector("Nothing to repeat"));
516  V8_FALLTHROUGH;
517  }
518  case '}':
519  case ']':
520  if (unicode()) {
521  return ReportError(CStrVector("Lone quantifier brackets"));
522  }
523  V8_FALLTHROUGH;
524  default:
525  builder->AddUnicodeCharacter(current());
526  Advance();
527  break;
528  } // end switch(current())
529 
  // Second phase: optional quantifier for the atom added above.
530  int min;
531  int max;
532  switch (current()) {
533  // QuantifierPrefix ::
534  // *
535  // +
536  // ?
537  // {
538  case '*':
539  min = 0;
540  max = RegExpTree::kInfinity;
541  Advance();
542  break;
543  case '+':
544  min = 1;
545  max = RegExpTree::kInfinity;
546  Advance();
547  break;
548  case '?':
549  min = 0;
550  max = 1;
551  Advance();
552  break;
553  case '{':
554  if (ParseIntervalQuantifier(&min, &max)) {
555  if (max < min) {
556  return ReportError(
557  CStrVector("numbers out of order in {} quantifier"));
558  }
559  break;
560  } else if (unicode()) {
561  // With /u, incomplete quantifiers are not allowed.
562  return ReportError(CStrVector("Incomplete quantifier"));
563  }
  // Not a quantifier: '{' is parsed as a term on the next iteration.
564  continue;
565  default:
566  continue;
567  }
  // A trailing '?' makes the quantifier non-greedy.
568  RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
569  if (current() == '?') {
570  quantifier_type = RegExpQuantifier::NON_GREEDY;
571  Advance();
572  } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
573  // FLAG_regexp_possessive_quantifier is a debug-only flag.
574  quantifier_type = RegExpQuantifier::POSSESSIVE;
575  Advance();
576  }
577  if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
578  return ReportError(CStrVector("Invalid quantifier"));
579  }
580  }
581 }
582 
// Parses the opening of a parenthesized construct. The first statement after
// the local declarations is Advance(), which steps past the opening
// parenthesis the caller is positioned on.
//
// Recognizes plain captures "(", non-capturing groups "(?:", lookaheads
// "(?=" / "(?!", lookbehinds "(?<=" / "(?<!", named captures "(?<name>" and,
// when FLAG_regexp_mode_modifiers is set, flag groups such as "(?i)" or
// "(?i-m:".
//
// Returns:
//   - a newly allocated state (whose previous state is |state|) for a group
//     that must be closed by ')';
//   - |state| itself for a bare flag switch like "(?i)", which only changes
//     the flags of the current builder;
//   - nullptr after reporting an error.
583 RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
584  RegExpParserState* state) {
585  RegExpLookaround::Type lookaround_type = state->lookaround_type();
586  bool is_named_capture = false;
587  JSRegExp::Flags switch_on = JSRegExp::kNone;
588  JSRegExp::Flags switch_off = JSRegExp::kNone;
589  const ZoneVector<uc16>* capture_name = nullptr;
590  SubexpressionType subexpr_type = CAPTURE;
591  Advance();
592  if (current() == '?') {
  // current() is '?'; Next() peeks at the character that selects the kind
  // of group.
593  switch (Next()) {
594  case ':':
595  Advance(2);
596  subexpr_type = GROUPING;
597  break;
598  case '=':
599  Advance(2);
600  lookaround_type = RegExpLookaround::LOOKAHEAD;
601  subexpr_type = POSITIVE_LOOKAROUND;
602  break;
603  case '!':
604  Advance(2);
605  lookaround_type = RegExpLookaround::LOOKAHEAD;
606  subexpr_type = NEGATIVE_LOOKAROUND;
607  break;
608  case '-':
609  case 'i':
610  case 's':
611  case 'm': {
612  if (!FLAG_regexp_mode_modifiers) {
613  ReportError(CStrVector("Invalid group"));
614  return nullptr;
615  }
  // Step past '?'; current() is now the first flag character or '-'.
616  Advance();
617  bool flags_sense = true; // Switching on flags.
  // Loop over flag characters until ')' (returns) or ':' (sets GROUPING,
  // which ends the loop).
618  while (subexpr_type != GROUPING) {
619  switch (current()) {
620  case '-':
621  if (!flags_sense) {
622  ReportError(CStrVector("Multiple dashes in flag group"));
623  return nullptr;
624  }
625  flags_sense = false;
626  Advance();
627  continue;
628  case 's':
629  case 'i':
630  case 'm': {
  // The initial value is a placeholder; current() is one of 'i', 'm' or
  // 's' here, so one of the assignments below always overwrites it.
631  JSRegExp::Flags bit = JSRegExp::kUnicode;
632  if (current() == 'i') bit = JSRegExp::kIgnoreCase;
633  if (current() == 'm') bit = JSRegExp::kMultiline;
634  if (current() == 's') bit = JSRegExp::kDotAll;
635  if (((switch_on | switch_off) & bit) != 0) {
636  ReportError(CStrVector("Repeated flag in flag group"));
637  return nullptr;
638  }
639  if (flags_sense) {
640  switch_on |= bit;
641  } else {
642  switch_off |= bit;
643  }
644  Advance();
645  continue;
646  }
647  case ')': {
648  Advance();
649  state->builder()
650  ->FlushText(); // Flush pending text using old flags.
651  // These (?i)-style flag switches don't put us in a subexpression
652  // at all, they just modify the flags in the rest of the current
653  // subexpression.
654  JSRegExp::Flags flags =
655  (state->builder()->flags() | switch_on) & ~switch_off;
656  state->builder()->set_flags(flags);
657  return state;
658  }
659  case ':':
660  Advance();
661  subexpr_type = GROUPING; // Will break us out of the outer loop.
662  continue;
663  default:
664  ReportError(CStrVector("Invalid flag group"));
665  return nullptr;
666  }
667  }
668  break;
669  }
670  case '<':
  // Step past '?'; current() is '<' and Next() is the character after it.
671  Advance();
672  if (Next() == '=') {
673  Advance(2);
674  lookaround_type = RegExpLookaround::LOOKBEHIND;
675  subexpr_type = POSITIVE_LOOKAROUND;
676  break;
677  } else if (Next() == '!') {
678  Advance(2);
679  lookaround_type = RegExpLookaround::LOOKBEHIND;
680  subexpr_type = NEGATIVE_LOOKAROUND;
681  break;
682  }
  // Neither lookbehind form: this is a named capture. Step past '<' so the
  // name can be parsed below.
683  is_named_capture = true;
684  has_named_captures_ = true;
685  Advance();
686  break;
687  default:
688  ReportError(CStrVector("Invalid group"));
689  return nullptr;
690  }
691  }
692  if (subexpr_type == CAPTURE) {
693  if (captures_started_ >= kMaxCaptures) {
694  ReportError(CStrVector("Too many captures"));
695  return nullptr;
696  }
697  captures_started_++;
698 
699  if (is_named_capture) {
700  capture_name = ParseCaptureGroupName(CHECK_FAILED);
701  }
702  }
  // Flags for the new group: the current builder's flags with any "(?flags:"
  // modifiers applied.
703  JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
704  // Store current state and begin new disjunction parsing.
705  return new (zone())
706  RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
707  capture_name, flags, zone());
708 }
709 
#ifdef DEBUG
// Returns true for the character class escape letters d, D, s, S, w and W.
// Currently only used in a DCHECK.
static bool IsSpecialClassEscape(uc32 c) {
  return c == 'd' || c == 'D' ||
         c == 's' || c == 'S' ||
         c == 'w' || c == 'W';
}
#endif
726 
727 
728 // In order to know whether an escape is a backreference or not we have to scan
729 // the entire regexp and find the number of capturing parentheses. However we
730 // don't want to scan the regexp twice unless it is necessary. This mini-parser
731 // is called when needed. It can see the difference between capturing and
732 // noncapturing parentheses and can skip character classes and backslash-escaped
733 // characters.
734 void RegExpParser::ScanForCaptures() {
735  DCHECK(!is_scanned_for_captures_);
736  const int saved_position = position();
737  // Start with captures started previous to current position
738  int capture_count = captures_started();
739  // Add count of captures after this position.
740  int n;
741  while ((n = current()) != kEndMarker) {
742  Advance();
743  switch (n) {
744  case '\\':
745  Advance();
746  break;
747  case '[': {
748  int c;
749  while ((c = current()) != kEndMarker) {
750  Advance();
751  if (c == '\\') {
752  Advance();
753  } else {
754  if (c == ']') break;
755  }
756  }
757  break;
758  }
759  case '(':
760  if (current() == '?') {
761  // At this point we could be in
762  // * a non-capturing group '(:',
763  // * a lookbehind assertion '(?<=' '(?<!'
764  // * or a named capture '(?<'.
765  //
766  // Of these, only named captures are capturing groups.
767 
768  Advance();
769  if (current() != '<') break;
770 
771  Advance();
772  if (current() == '=' || current() == '!') break;
773 
774  // Found a possible named capture. It could turn out to be a syntax
775  // error (e.g. an unterminated or invalid name), but that distinction
776  // does not matter for our purposes.
777  has_named_captures_ = true;
778  }
779  capture_count++;
780  break;
781  }
782  }
783  capture_count_ = capture_count;
784  is_scanned_for_captures_ = true;
785  Reset(saved_position);
786 }
787 
788 
789 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
790  DCHECK_EQ('\\', current());
791  DCHECK('1' <= Next() && Next() <= '9');
792  // Try to parse a decimal literal that is no greater than the total number
793  // of left capturing parentheses in the input.
794  int start = position();
795  int value = Next() - '0';
796  Advance(2);
797  while (true) {
798  uc32 c = current();
799  if (IsDecimalDigit(c)) {
800  value = 10 * value + (c - '0');
801  if (value > kMaxCaptures) {
802  Reset(start);
803  return false;
804  }
805  Advance();
806  } else {
807  break;
808  }
809  }
810  if (value > captures_started()) {
811  if (!is_scanned_for_captures_) ScanForCaptures();
812  if (value > capture_count_) {
813  Reset(start);
814  return false;
815  }
816  }
817  *index_out = value;
818  return true;
819 }
820 
821 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
822  if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
823  v->push_back(code_unit);
824  } else {
825  v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
826  v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
827  }
828 }
829 
830 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
831  ZoneVector<uc16>* name =
832  new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
833 
834  bool at_start = true;
835  while (true) {
836  uc32 c = current();
837  Advance();
838 
839  // Convert unicode escapes.
840  if (c == '\\' && current() == 'u') {
841  Advance();
842  if (!ParseUnicodeEscape(&c)) {
843  ReportError(CStrVector("Invalid Unicode escape sequence"));
844  return nullptr;
845  }
846  }
847 
848  // The backslash char is misclassified as both ID_Start and ID_Continue.
849  if (c == '\\') {
850  ReportError(CStrVector("Invalid capture group name"));
851  return nullptr;
852  }
853 
854  if (at_start) {
855  if (!IsIdentifierStart(c)) {
856  ReportError(CStrVector("Invalid capture group name"));
857  return nullptr;
858  }
859  push_code_unit(name, c);
860  at_start = false;
861  } else {
862  if (c == '>') {
863  break;
864  } else if (IsIdentifierPart(c)) {
865  push_code_unit(name, c);
866  } else {
867  ReportError(CStrVector("Invalid capture group name"));
868  return nullptr;
869  }
870  }
871  }
872 
873  return name;
874 }
875 
876 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
877  int index) {
878  DCHECK(0 < index && index <= captures_started_);
879  DCHECK_NOT_NULL(name);
880 
881  if (named_captures_ == nullptr) {
882  named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
883  } else {
884  // Check for duplicates and bail if we find any.
885  // TODO(jgruber): O(n^2).
886  for (const auto& named_capture : *named_captures_) {
887  if (*named_capture->name() == *name) {
888  ReportError(CStrVector("Duplicate capture group name"));
889  return false;
890  }
891  }
892  }
893 
894  RegExpCapture* capture = GetCapture(index);
895  DCHECK_NULL(capture->name());
896 
897  capture->set_name(name);
898  named_captures_->Add(capture, zone());
899 
900  return true;
901 }
902 
903 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
904  RegExpParserState* state) {
905  // The parser is assumed to be on the '<' in \k<name>.
906  if (current() != '<') {
907  ReportError(CStrVector("Invalid named reference"));
908  return false;
909  }
910 
911  Advance();
912  const ZoneVector<uc16>* name = ParseCaptureGroupName();
913  if (name == nullptr) {
914  return false;
915  }
916 
917  if (state->IsInsideCaptureGroup(name)) {
918  builder->AddEmpty();
919  } else {
920  RegExpBackReference* atom =
921  new (zone()) RegExpBackReference(builder->flags());
922  atom->set_name(name);
923 
924  builder->AddAtom(atom);
925 
926  if (named_back_references_ == nullptr) {
927  named_back_references_ =
928  new (zone()) ZoneList<RegExpBackReference*>(1, zone());
929  }
930  named_back_references_->Add(atom, zone());
931  }
932 
933  return true;
934 }
935 
936 void RegExpParser::PatchNamedBackReferences() {
937  if (named_back_references_ == nullptr) return;
938 
939  if (named_captures_ == nullptr) {
940  ReportError(CStrVector("Invalid named capture referenced"));
941  return;
942  }
943 
944  // Look up and patch the actual capture for each named back reference.
945  // TODO(jgruber): O(n^2), optimize if necessary.
946 
947  for (int i = 0; i < named_back_references_->length(); i++) {
948  RegExpBackReference* ref = named_back_references_->at(i);
949 
950  int index = -1;
951  for (const auto& capture : *named_captures_) {
952  if (*capture->name() == *ref->name()) {
953  index = capture->index();
954  break;
955  }
956  }
957 
958  if (index == -1) {
959  ReportError(CStrVector("Invalid named capture referenced"));
960  return;
961  }
962 
963  ref->set_capture(GetCapture(index));
964  }
965 }
966 
967 RegExpCapture* RegExpParser::GetCapture(int index) {
968  // The index for the capture groups are one-based. Its index in the list is
969  // zero-based.
970  int know_captures =
971  is_scanned_for_captures_ ? capture_count_ : captures_started_;
972  DCHECK(index <= know_captures);
973  if (captures_ == nullptr) {
974  captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
975  }
976  while (captures_->length() < know_captures) {
977  captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
978  }
979  return captures_->at(index - 1);
980 }
981 
982 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
983  if (named_captures_ == nullptr || named_captures_->is_empty())
984  return Handle<FixedArray>();
985 
986  Factory* factory = isolate()->factory();
987 
988  int len = named_captures_->length() * 2;
989  Handle<FixedArray> array = factory->NewFixedArray(len);
990 
991  for (int i = 0; i < named_captures_->length(); i++) {
992  RegExpCapture* capture = named_captures_->at(i);
993  MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
994  array->set(i * 2, *name.ToHandleChecked());
995  array->set(i * 2 + 1, Smi::FromInt(capture->index()));
996  }
997 
998  return array;
999 }
1000 
1001 bool RegExpParser::HasNamedCaptures() {
1002  if (has_named_captures_ || is_scanned_for_captures_) {
1003  return has_named_captures_;
1004  }
1005 
1006  ScanForCaptures();
1007  DCHECK(is_scanned_for_captures_);
1008  return has_named_captures_;
1009 }
1010 
1011 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
1012  for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1013  if (s->group_type() != CAPTURE) continue;
1014  // Return true if we found the matching capture index.
1015  if (index == s->capture_index()) return true;
1016  // Abort if index is larger than what has been parsed up till this state.
1017  if (index > s->capture_index()) return false;
1018  }
1019  return false;
1020 }
1021 
1022 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1023  const ZoneVector<uc16>* name) {
1024  DCHECK_NOT_NULL(name);
1025  for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1026  if (s->capture_name() == nullptr) continue;
1027  if (*s->capture_name() == *name) return true;
1028  }
1029  return false;
1030 }
1031 
1032 // QuantifierPrefix ::
1033 // { DecimalDigits }
1034 // { DecimalDigits , }
1035 // { DecimalDigits , DecimalDigits }
1036 //
1037 // Returns true if parsing succeeds, and set the min_out and max_out
1038 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
1039 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
1040  DCHECK_EQ(current(), '{');
1041  int start = position();
1042  Advance();
1043  int min = 0;
1044  if (!IsDecimalDigit(current())) {
1045  Reset(start);
1046  return false;
1047  }
1048  while (IsDecimalDigit(current())) {
1049  int next = current() - '0';
1050  if (min > (RegExpTree::kInfinity - next) / 10) {
1051  // Overflow. Skip past remaining decimal digits and return -1.
1052  do {
1053  Advance();
1054  } while (IsDecimalDigit(current()));
1055  min = RegExpTree::kInfinity;
1056  break;
1057  }
1058  min = 10 * min + next;
1059  Advance();
1060  }
1061  int max = 0;
1062  if (current() == '}') {
1063  max = min;
1064  Advance();
1065  } else if (current() == ',') {
1066  Advance();
1067  if (current() == '}') {
1068  max = RegExpTree::kInfinity;
1069  Advance();
1070  } else {
1071  while (IsDecimalDigit(current())) {
1072  int next = current() - '0';
1073  if (max > (RegExpTree::kInfinity - next) / 10) {
1074  do {
1075  Advance();
1076  } while (IsDecimalDigit(current()));
1077  max = RegExpTree::kInfinity;
1078  break;
1079  }
1080  max = 10 * max + next;
1081  Advance();
1082  }
1083  if (current() != '}') {
1084  Reset(start);
1085  return false;
1086  }
1087  Advance();
1088  }
1089  } else {
1090  Reset(start);
1091  return false;
1092  }
1093  *min_out = min;
1094  *max_out = max;
1095  return true;
1096 }
1097 
1098 
1099 uc32 RegExpParser::ParseOctalLiteral() {
1100  DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1101  // For compatibility with some other browsers (not all), we parse
1102  // up to three octal digits with a value below 256.
1103  // ES#prod-annexB-LegacyOctalEscapeSequence
1104  uc32 value = current() - '0';
1105  Advance();
1106  if ('0' <= current() && current() <= '7') {
1107  value = value * 8 + current() - '0';
1108  Advance();
1109  if (value < 32 && '0' <= current() && current() <= '7') {
1110  value = value * 8 + current() - '0';
1111  Advance();
1112  }
1113  }
1114  return value;
1115 }
1116 
1117 
1118 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1119  int start = position();
1120  uc32 val = 0;
1121  for (int i = 0; i < length; ++i) {
1122  uc32 c = current();
1123  int d = HexValue(c);
1124  if (d < 0) {
1125  Reset(start);
1126  return false;
1127  }
1128  val = val * 16 + d;
1129  Advance();
1130  }
1131  *value = val;
1132  return true;
1133 }
1134 
1135 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1136 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1137  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1138  // allowed). In the latter case, the number of hex digits between { } is
1139  // arbitrary. \ and u have already been read.
1140  if (current() == '{' && unicode()) {
1141  int start = position();
1142  Advance();
1143  if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1144  if (current() == '}') {
1145  Advance();
1146  return true;
1147  }
1148  }
1149  Reset(start);
1150  return false;
1151  }
1152  // \u but no {, or \u{...} escapes not allowed.
1153  bool result = ParseHexEscape(4, value);
1154  if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1155  current() == '\\') {
1156  // Attempt to read trail surrogate.
1157  int start = position();
1158  if (Next() == 'u') {
1159  Advance(2);
1160  uc32 trail;
1161  if (ParseHexEscape(4, &trail) &&
1162  unibrow::Utf16::IsTrailSurrogate(trail)) {
1163  *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1164  static_cast<uc16>(trail));
1165  return true;
1166  }
1167  }
1168  Reset(start);
1169  }
1170  return result;
1171 }
1172 
1173 #ifdef V8_INTL_SUPPORT
1174 
1175 namespace {
1176 
1177 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1178  const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1179  if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1180  return true;
1181  for (int i = 0;; i++) {
1182  const char* long_name = u_getPropertyName(
1183  property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1184  if (long_name == nullptr) break;
1185  if (strcmp(property_name, long_name) == 0) return true;
1186  }
1187  return false;
1188 }
1189 
1190 bool IsExactPropertyValueAlias(const char* property_value_name,
1191  UProperty property, int32_t property_value) {
1192  const char* short_name =
1193  u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1194  if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1195  return true;
1196  }
1197  for (int i = 0;; i++) {
1198  const char* long_name = u_getPropertyValueName(
1199  property, property_value,
1200  static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1201  if (long_name == nullptr) break;
1202  if (strcmp(property_value_name, long_name) == 0) return true;
1203  }
1204  return false;
1205 }
1206 
1207 bool LookupPropertyValueName(UProperty property,
1208  const char* property_value_name, bool negate,
1209  ZoneList<CharacterRange>* result, Zone* zone) {
1210  UProperty property_for_lookup = property;
1211  if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1212  // For the property Script_Extensions, we have to do the property value
1213  // name lookup as if the property is Script.
1214  property_for_lookup = UCHAR_SCRIPT;
1215  }
1216  int32_t property_value =
1217  u_getPropertyValueEnum(property_for_lookup, property_value_name);
1218  if (property_value == UCHAR_INVALID_CODE) return false;
1219 
1220  // We require the property name to match exactly to one of the property value
1221  // aliases. However, u_getPropertyValueEnum uses loose matching.
1222  if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1223  property_value)) {
1224  return false;
1225  }
1226 
1227  UErrorCode ec = U_ZERO_ERROR;
1228  icu::UnicodeSet set;
1229  set.applyIntPropertyValue(property, property_value, ec);
1230  bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1231 
1232  if (success) {
1233  set.removeAllStrings();
1234  if (negate) set.complement();
1235  for (int i = 0; i < set.getRangeCount(); i++) {
1236  result->Add(
1237  CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1238  zone);
1239  }
1240  }
1241  return success;
1242 }
1243 
// Returns true if the NUL-terminated string |name| is exactly equal to the
// string literal |literal|.
//
// N is the size of the literal's array and therefore already includes the
// terminating NUL, so comparing N characters covers the whole literal plus its
// terminator: a |name| that merely starts with |literal| does not compare
// equal. The bound is kept within the literal's array extent.
template <size_t N>
inline bool NameEquals(const char* name, const char (&literal)[N]) {
  return strncmp(name, literal, N) == 0;
}
1248 
1249 bool LookupSpecialPropertyValueName(const char* name,
1250  ZoneList<CharacterRange>* result,
1251  bool negate, Zone* zone) {
1252  if (NameEquals(name, "Any")) {
1253  if (negate) {
1254  // Leave the list of character ranges empty, since the negation of 'Any'
1255  // is the empty set.
1256  } else {
1257  result->Add(CharacterRange::Everything(), zone);
1258  }
1259  } else if (NameEquals(name, "ASCII")) {
1260  result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1261  : CharacterRange::Range(0x0, 0x7F),
1262  zone);
1263  } else if (NameEquals(name, "Assigned")) {
1264  return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1265  !negate, result, zone);
1266  } else {
1267  return false;
1268  }
1269  return true;
1270 }
1271 
1272 // Explicitly whitelist supported binary properties. The spec forbids supporting
1273 // properties outside of this set to ensure interoperability.
1274 bool IsSupportedBinaryProperty(UProperty property) {
1275  switch (property) {
1276  case UCHAR_ALPHABETIC:
1277  // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1278  // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1279  case UCHAR_ASCII_HEX_DIGIT:
1280  // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1281  case UCHAR_BIDI_CONTROL:
1282  case UCHAR_BIDI_MIRRORED:
1283  case UCHAR_CASE_IGNORABLE:
1284  case UCHAR_CASED:
1285  case UCHAR_CHANGES_WHEN_CASEFOLDED:
1286  case UCHAR_CHANGES_WHEN_CASEMAPPED:
1287  case UCHAR_CHANGES_WHEN_LOWERCASED:
1288  case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1289  case UCHAR_CHANGES_WHEN_TITLECASED:
1290  case UCHAR_CHANGES_WHEN_UPPERCASED:
1291  case UCHAR_DASH:
1292  case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1293  case UCHAR_DEPRECATED:
1294  case UCHAR_DIACRITIC:
1295  case UCHAR_EMOJI:
1296  case UCHAR_EMOJI_COMPONENT:
1297  case UCHAR_EMOJI_MODIFIER_BASE:
1298  case UCHAR_EMOJI_MODIFIER:
1299  case UCHAR_EMOJI_PRESENTATION:
1300  case UCHAR_EXTENDED_PICTOGRAPHIC:
1301  case UCHAR_EXTENDER:
1302  case UCHAR_GRAPHEME_BASE:
1303  case UCHAR_GRAPHEME_EXTEND:
1304  case UCHAR_HEX_DIGIT:
1305  case UCHAR_ID_CONTINUE:
1306  case UCHAR_ID_START:
1307  case UCHAR_IDEOGRAPHIC:
1308  case UCHAR_IDS_BINARY_OPERATOR:
1309  case UCHAR_IDS_TRINARY_OPERATOR:
1310  case UCHAR_JOIN_CONTROL:
1311  case UCHAR_LOGICAL_ORDER_EXCEPTION:
1312  case UCHAR_LOWERCASE:
1313  case UCHAR_MATH:
1314  case UCHAR_NONCHARACTER_CODE_POINT:
1315  case UCHAR_PATTERN_SYNTAX:
1316  case UCHAR_PATTERN_WHITE_SPACE:
1317  case UCHAR_QUOTATION_MARK:
1318  case UCHAR_RADICAL:
1319  case UCHAR_REGIONAL_INDICATOR:
1320  case UCHAR_S_TERM:
1321  case UCHAR_SOFT_DOTTED:
1322  case UCHAR_TERMINAL_PUNCTUATION:
1323  case UCHAR_UNIFIED_IDEOGRAPH:
1324  case UCHAR_UPPERCASE:
1325  case UCHAR_VARIATION_SELECTOR:
1326  case UCHAR_WHITE_SPACE:
1327  case UCHAR_XID_CONTINUE:
1328  case UCHAR_XID_START:
1329  return true;
1330  default:
1331  break;
1332  }
1333  return false;
1334 }
1335 
// Returns true if |c| may appear in a property name or property value name:
// an ASCII letter, an ASCII digit or an underscore.
// https://tc39.github.io/proposal-regexp-unicode-property-escapes/
//
// Note that using this to validate each parsed char is quite conservative.
// A possible alternative solution would be to only ensure the parsed
// property name/value candidate string does not contain '\0' characters and
// let ICU lookups trigger the final failure.
bool IsUnicodePropertyValueCharacter(char c) {
  const bool is_lower = 'a' <= c && c <= 'z';
  const bool is_upper = 'A' <= c && c <= 'Z';
  const bool is_digit = '0' <= c && c <= '9';
  return is_lower || is_upper || is_digit || c == '_';
}
1348 
1349 } // anonymous namespace
1350 
1351 bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
1352  std::vector<char>* name_2) {
1353  DCHECK(name_1->empty());
1354  DCHECK(name_2->empty());
1355  // Parse the property class as follows:
1356  // - In \p{name}, 'name' is interpreted
1357  // - either as a general category property value name.
1358  // - or as a binary property name.
1359  // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1360  // and 'value' is interpreted as one of the available property value names.
1361  // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1362  // - Loose matching is not applied.
1363  if (current() == '{') {
1364  // Parse \p{[PropertyName=]PropertyNameValue}
1365  for (Advance(); current() != '}' && current() != '='; Advance()) {
1366  if (!IsUnicodePropertyValueCharacter(current())) return false;
1367  if (!has_next()) return false;
1368  name_1->push_back(static_cast<char>(current()));
1369  }
1370  if (current() == '=') {
1371  for (Advance(); current() != '}'; Advance()) {
1372  if (!IsUnicodePropertyValueCharacter(current())) return false;
1373  if (!has_next()) return false;
1374  name_2->push_back(static_cast<char>(current()));
1375  }
1376  name_2->push_back(0); // null-terminate string.
1377  }
1378  } else {
1379  return false;
1380  }
1381  Advance();
1382  name_1->push_back(0); // null-terminate string.
1383 
1384  DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1385  DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1386  return true;
1387 }
1388 
1389 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1390  bool negate,
1391  const std::vector<char>& name_1,
1392  const std::vector<char>& name_2) {
1393  if (name_2.empty()) {
1394  // First attempt to interpret as general category property value name.
1395  const char* name = name_1.data();
1396  if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1397  add_to, zone())) {
1398  return true;
1399  }
1400  // Interpret "Any", "ASCII", and "Assigned".
1401  if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1402  return true;
1403  }
1404  // Then attempt to interpret as binary property name with value name 'Y'.
1405  UProperty property = u_getPropertyEnum(name);
1406  if (!IsSupportedBinaryProperty(property)) return false;
1407  if (!IsExactPropertyAlias(name, property)) return false;
1408  return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1409  zone());
1410  } else {
1411  // Both property name and value name are specified. Attempt to interpret
1412  // the property name as enumerated property.
1413  const char* property_name = name_1.data();
1414  const char* value_name = name_2.data();
1415  UProperty property = u_getPropertyEnum(property_name);
1416  if (!IsExactPropertyAlias(property_name, property)) return false;
1417  if (property == UCHAR_GENERAL_CATEGORY) {
1418  // We want to allow aggregate value names such as "Letter".
1419  property = UCHAR_GENERAL_CATEGORY_MASK;
1420  } else if (property != UCHAR_SCRIPT &&
1421  property != UCHAR_SCRIPT_EXTENSIONS) {
1422  return false;
1423  }
1424  return LookupPropertyValueName(property, value_name, negate, add_to,
1425  zone());
1426  }
1427 }
1428 
1429 RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
1430  if (!FLAG_harmony_regexp_sequence) return nullptr;
1431  const char* name = name_1.data();
1432  const uc32* sequence_list = nullptr;
1433  JSRegExp::Flags flags = JSRegExp::kUnicode;
1434  if (NameEquals(name, "Emoji_Flag_Sequence")) {
1435  sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
1436  } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
1437  sequence_list = UnicodePropertySequences::kEmojiTagSequences;
1438  } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
1439  sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
1440  }
1441  if (sequence_list != nullptr) {
1442  // TODO(yangguo): this creates huge regexp code. Alternative to this is
1443  // to create a new operator that checks for these sequences at runtime.
1444  RegExpBuilder builder(zone(), flags);
1445  while (true) { // Iterate through list of sequences.
1446  while (*sequence_list != 0) { // Iterate through sequence.
1447  builder.AddUnicodeCharacter(*sequence_list);
1448  sequence_list++;
1449  }
1450  sequence_list++;
1451  if (*sequence_list == 0) break;
1452  builder.NewAlternative();
1453  }
1454  return builder.ToRegExp();
1455  }
1456 
1457  if (NameEquals(name, "Emoji_Keycap_Sequence")) {
1458  // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
1459  // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
1460  RegExpBuilder builder(zone(), flags);
1461  ZoneList<CharacterRange>* prefix_ranges =
1462  new (zone()) ZoneList<CharacterRange>(2, zone());
1463  prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
1464  prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
1465  prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
1466  builder.AddCharacterClass(
1467  new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
1468  builder.AddCharacter(0xFE0F);
1469  builder.AddCharacter(0x20E3);
1470  return builder.ToRegExp();
1471  } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
1472  // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
1473  // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
1474  RegExpBuilder builder(zone(), flags);
1475  ZoneList<CharacterRange>* modifier_base_ranges =
1476  new (zone()) ZoneList<CharacterRange>(2, zone());
1477  LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
1478  modifier_base_ranges, zone());
1479  builder.AddCharacterClass(
1480  new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
1481  ZoneList<CharacterRange>* modifier_ranges =
1482  new (zone()) ZoneList<CharacterRange>(2, zone());
1483  LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
1484  zone());
1485  builder.AddCharacterClass(
1486  new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
1487  return builder.ToRegExp();
1488  }
1489 
1490  return nullptr;
1491 }
1492 
1493 #else // V8_INTL_SUPPORT
1494 
1495 bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
1496  std::vector<char>* name_2) {
1497  return false;
1498 }
1499 
1500 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1501  bool negate,
1502  const std::vector<char>& name_1,
1503  const std::vector<char>& name_2) {
1504  return false;
1505 }
1506 
1507 RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
1508  return nullptr;
1509 }
1510 
1511 #endif // V8_INTL_SUPPORT
1512 
1513 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1514  uc32 x = 0;
1515  int d = HexValue(current());
1516  if (d < 0) {
1517  return false;
1518  }
1519  while (d >= 0) {
1520  x = x * 16 + d;
1521  if (x > max_value) {
1522  return false;
1523  }
1524  Advance();
1525  d = HexValue(current());
1526  }
1527  *value = x;
1528  return true;
1529 }
1530 
1531 
1532 uc32 RegExpParser::ParseClassCharacterEscape() {
1533  DCHECK_EQ('\\', current());
1534  DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1535  Advance();
1536  switch (current()) {
1537  case 'b':
1538  Advance();
1539  return '\b';
1540  // ControlEscape :: one of
1541  // f n r t v
1542  case 'f':
1543  Advance();
1544  return '\f';
1545  case 'n':
1546  Advance();
1547  return '\n';
1548  case 'r':
1549  Advance();
1550  return '\r';
1551  case 't':
1552  Advance();
1553  return '\t';
1554  case 'v':
1555  Advance();
1556  return '\v';
1557  case 'c': {
1558  uc32 controlLetter = Next();
1559  uc32 letter = controlLetter & ~('A' ^ 'a');
1560  // Inside a character class, we also accept digits and underscore as
1561  // control characters, unless with /u. See Annex B:
1562  // ES#prod-annexB-ClassControlLetter
1563  if (letter >= 'A' && letter <= 'Z') {
1564  Advance(2);
1565  // Control letters mapped to ASCII control characters in the range
1566  // 0x00-0x1F.
1567  return controlLetter & 0x1F;
1568  }
1569  if (unicode()) {
1570  // With /u, invalid escapes are not treated as identity escapes.
1571  ReportError(CStrVector("Invalid class escape"));
1572  return 0;
1573  }
1574  if ((controlLetter >= '0' && controlLetter <= '9') ||
1575  controlLetter == '_') {
1576  Advance(2);
1577  return controlLetter & 0x1F;
1578  }
1579  // We match JSC in reading the backslash as a literal
1580  // character instead of as starting an escape.
1581  // TODO(v8:6201): Not yet covered by the spec.
1582  return '\\';
1583  }
1584  case '0':
1585  // With /u, \0 is interpreted as NUL if not followed by another digit.
1586  if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1587  Advance();
1588  return 0;
1589  }
1590  V8_FALLTHROUGH;
1591  case '1':
1592  case '2':
1593  case '3':
1594  case '4':
1595  case '5':
1596  case '6':
1597  case '7':
1598  // For compatibility, we interpret a decimal escape that isn't
1599  // a back reference (and therefore either \0 or not valid according
1600  // to the specification) as a 1..3 digit octal character code.
1601  // ES#prod-annexB-LegacyOctalEscapeSequence
1602  if (unicode()) {
1603  // With /u, decimal escape is not interpreted as octal character code.
1604  ReportError(CStrVector("Invalid class escape"));
1605  return 0;
1606  }
1607  return ParseOctalLiteral();
1608  case 'x': {
1609  Advance();
1610  uc32 value;
1611  if (ParseHexEscape(2, &value)) return value;
1612  if (unicode()) {
1613  // With /u, invalid escapes are not treated as identity escapes.
1614  ReportError(CStrVector("Invalid escape"));
1615  return 0;
1616  }
1617  // If \x is not followed by a two-digit hexadecimal, treat it
1618  // as an identity escape.
1619  return 'x';
1620  }
1621  case 'u': {
1622  Advance();
1623  uc32 value;
1624  if (ParseUnicodeEscape(&value)) return value;
1625  if (unicode()) {
1626  // With /u, invalid escapes are not treated as identity escapes.
1627  ReportError(CStrVector("Invalid unicode escape"));
1628  return 0;
1629  }
1630  // If \u is not followed by a two-digit hexadecimal, treat it
1631  // as an identity escape.
1632  return 'u';
1633  }
1634  default: {
1635  uc32 result = current();
1636  // With /u, no identity escapes except for syntax characters and '-' are
1637  // allowed. Otherwise, all identity escapes are allowed.
1638  if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1639  Advance();
1640  return result;
1641  }
1642  ReportError(CStrVector("Invalid escape"));
1643  return 0;
1644  }
1645  }
1646  return 0;
1647 }
1648 
1649 void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1650  Zone* zone,
1651  bool add_unicode_case_equivalents,
1652  uc32* char_out, bool* is_class_escape) {
1653  uc32 current_char = current();
1654  if (current_char == '\\') {
1655  switch (Next()) {
1656  case 'w':
1657  case 'W':
1658  case 'd':
1659  case 'D':
1660  case 's':
1661  case 'S': {
1662  CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1663  add_unicode_case_equivalents, zone);
1664  Advance(2);
1665  *is_class_escape = true;
1666  return;
1667  }
1668  case kEndMarker:
1669  ReportError(CStrVector("\\ at end of pattern"));
1670  return;
1671  case 'p':
1672  case 'P':
1673  if (unicode()) {
1674  bool negate = Next() == 'P';
1675  Advance(2);
1676  std::vector<char> name_1, name_2;
1677  if (!ParsePropertyClassName(&name_1, &name_2) ||
1678  !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1679  ReportError(CStrVector("Invalid property name in character class"));
1680  }
1681  *is_class_escape = true;
1682  return;
1683  }
1684  break;
1685  default:
1686  break;
1687  }
1688  *char_out = ParseClassCharacterEscape();
1689  *is_class_escape = false;
1690  } else {
1691  Advance();
1692  *char_out = current_char;
1693  *is_class_escape = false;
1694  }
1695 }
1696 
1697 RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1698  static const char* kUnterminated = "Unterminated character class";
1699  static const char* kRangeInvalid = "Invalid character class";
1700  static const char* kRangeOutOfOrder = "Range out of order in character class";
1701 
1702  DCHECK_EQ(current(), '[');
1703  Advance();
1704  bool is_negated = false;
1705  if (current() == '^') {
1706  is_negated = true;
1707  Advance();
1708  }
1709  ZoneList<CharacterRange>* ranges =
1710  new (zone()) ZoneList<CharacterRange>(2, zone());
1711  bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1712  while (has_more() && current() != ']') {
1713  uc32 char_1, char_2;
1714  bool is_class_1, is_class_2;
1715  ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1716  &is_class_1 CHECK_FAILED);
1717  if (current() == '-') {
1718  Advance();
1719  if (current() == kEndMarker) {
1720  // If we reach the end we break out of the loop and let the
1721  // following code report an error.
1722  break;
1723  } else if (current() == ']') {
1724  if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1725  ranges->Add(CharacterRange::Singleton('-'), zone());
1726  break;
1727  }
1728  ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1729  &is_class_2 CHECK_FAILED);
1730  if (is_class_1 || is_class_2) {
1731  // Either end is an escaped character class. Treat the '-' verbatim.
1732  if (unicode()) {
1733  // ES2015 21.2.2.15.1 step 1.
1734  return ReportError(CStrVector(kRangeInvalid));
1735  }
1736  if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1737  ranges->Add(CharacterRange::Singleton('-'), zone());
1738  if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1739  continue;
1740  }
1741  // ES2015 21.2.2.15.1 step 6.
1742  if (char_1 > char_2) {
1743  return ReportError(CStrVector(kRangeOutOfOrder));
1744  }
1745  ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1746  } else {
1747  if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1748  }
1749  }
1750  if (!has_more()) {
1751  return ReportError(CStrVector(kUnterminated));
1752  }
1753  Advance();
1754  RegExpCharacterClass::CharacterClassFlags character_class_flags;
1755  if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1756  return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
1757  character_class_flags);
1758 }
1759 
1760 
1761 #undef CHECK_FAILED
1762 
1763 
1764 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1765  FlatStringReader* input, JSRegExp::Flags flags,
1766  RegExpCompileData* result) {
1767  DCHECK(result != nullptr);
1768  RegExpParser parser(input, &result->error, flags, isolate, zone);
1769  RegExpTree* tree = parser.ParsePattern();
1770  if (parser.failed()) {
1771  DCHECK(tree == nullptr);
1772  DCHECK(!result->error.is_null());
1773  } else {
1774  DCHECK(tree != nullptr);
1775  DCHECK(result->error.is_null());
1776  if (FLAG_trace_regexp_parser) {
1777  StdoutStream os;
1778  tree->Print(os, zone);
1779  os << "\n";
1780  }
1781  result->tree = tree;
1782  int capture_count = parser.captures_started();
1783  result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1784  result->contains_anchor = parser.contains_anchor();
1785  result->capture_name_map = parser.CreateCaptureNameMap();
1786  result->capture_count = capture_count;
1787  }
1788  return !parser.failed();
1789 }
1790 
1791 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1792  : zone_(zone),
1793  pending_empty_(false),
1794  flags_(flags),
1795  characters_(nullptr),
1796  pending_surrogate_(kNoPendingSurrogate),
1797  terms_(),
1798  alternatives_()
1799 #ifdef DEBUG
1800  ,
1801  last_added_(ADD_NONE)
1802 #endif
1803 {
1804 }
1805 
1806 
1807 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1808  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1809  FlushPendingSurrogate();
1810  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1811  pending_surrogate_ = lead_surrogate;
1812 }
1813 
1814 
1815 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1816  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1817  if (pending_surrogate_ != kNoPendingSurrogate) {
1818  uc16 lead_surrogate = pending_surrogate_;
1819  pending_surrogate_ = kNoPendingSurrogate;
1820  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1821  uc32 combined =
1822  unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1823  if (NeedsDesugaringForIgnoreCase(combined)) {
1824  AddCharacterClassForDesugaring(combined);
1825  } else {
1826  ZoneList<uc16> surrogate_pair(2, zone());
1827  surrogate_pair.Add(lead_surrogate, zone());
1828  surrogate_pair.Add(trail_surrogate, zone());
1829  RegExpAtom* atom =
1830  new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
1831  AddAtom(atom);
1832  }
1833  } else {
1834  pending_surrogate_ = trail_surrogate;
1835  FlushPendingSurrogate();
1836  }
1837 }
1838 
1839 
1840 void RegExpBuilder::FlushPendingSurrogate() {
1841  if (pending_surrogate_ != kNoPendingSurrogate) {
1842  DCHECK(unicode());
1843  uc32 c = pending_surrogate_;
1844  pending_surrogate_ = kNoPendingSurrogate;
1845  AddCharacterClassForDesugaring(c);
1846  }
1847 }
1848 
1849 
1850 void RegExpBuilder::FlushCharacters() {
1851  FlushPendingSurrogate();
1852  pending_empty_ = false;
1853  if (characters_ != nullptr) {
1854  RegExpTree* atom =
1855  new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
1856  characters_ = nullptr;
1857  text_.Add(atom, zone());
1858  LAST(ADD_ATOM);
1859  }
1860 }
1861 
1862 
1863 void RegExpBuilder::FlushText() {
1864  FlushCharacters();
1865  int num_text = text_.length();
1866  if (num_text == 0) {
1867  return;
1868  } else if (num_text == 1) {
1869  terms_.Add(text_.last(), zone());
1870  } else {
1871  RegExpText* text = new (zone()) RegExpText(zone());
1872  for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1873  terms_.Add(text, zone());
1874  }
1875  text_.Clear();
1876 }
1877 
1878 
1879 void RegExpBuilder::AddCharacter(uc16 c) {
1880  FlushPendingSurrogate();
1881  pending_empty_ = false;
1882  if (NeedsDesugaringForIgnoreCase(c)) {
1883  AddCharacterClassForDesugaring(c);
1884  } else {
1885  if (characters_ == nullptr) {
1886  characters_ = new (zone()) ZoneList<uc16>(4, zone());
1887  }
1888  characters_->Add(c, zone());
1889  LAST(ADD_CHAR);
1890  }
1891 }
1892 
1893 
1894 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1895  if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1896  DCHECK(unicode());
1897  AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1898  AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1899  } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1900  AddLeadSurrogate(c);
1901  } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1902  AddTrailSurrogate(c);
1903  } else {
1904  AddCharacter(static_cast<uc16>(c));
1905  }
1906 }
1907 
1908 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1909  // A lead or trail surrogate parsed via escape sequence will not
1910  // pair up with any preceding lead or following trail surrogate.
1911  FlushPendingSurrogate();
1912  AddUnicodeCharacter(character);
1913  FlushPendingSurrogate();
1914 }
1915 
1916 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1917 
1918 
1919 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1920  if (NeedsDesugaringForUnicode(cc)) {
1921  // With /u, character class needs to be desugared, so it
1922  // must be a standalone term instead of being part of a RegExpText.
1923  AddTerm(cc);
1924  } else {
1925  AddAtom(cc);
1926  }
1927 }
1928 
1929 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1930  AddTerm(new (zone()) RegExpCharacterClass(
1931  zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
1932  flags_));
1933 }
1934 
1935 
1936 void RegExpBuilder::AddAtom(RegExpTree* term) {
1937  if (term->IsEmpty()) {
1938  AddEmpty();
1939  return;
1940  }
1941  if (term->IsTextElement()) {
1942  FlushCharacters();
1943  text_.Add(term, zone());
1944  } else {
1945  FlushText();
1946  terms_.Add(term, zone());
1947  }
1948  LAST(ADD_ATOM);
1949 }
1950 
1951 
1952 void RegExpBuilder::AddTerm(RegExpTree* term) {
1953  FlushText();
1954  terms_.Add(term, zone());
1955  LAST(ADD_ATOM);
1956 }
1957 
1958 
1959 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1960  FlushText();
1961  if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
1962  // Omit repeated assertions of the same type.
1963  RegExpAssertion* last = terms_.last()->AsAssertion();
1964  RegExpAssertion* next = assert->AsAssertion();
1965  if (last->assertion_type() == next->assertion_type()) return;
1966  }
1967  terms_.Add(assert, zone());
1968  LAST(ADD_ASSERT);
1969 }
1970 
1971 
1972 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1973 
1974 
1975 void RegExpBuilder::FlushTerms() {
1976  FlushText();
1977  int num_terms = terms_.length();
1978  RegExpTree* alternative;
1979  if (num_terms == 0) {
1980  alternative = new (zone()) RegExpEmpty();
1981  } else if (num_terms == 1) {
1982  alternative = terms_.last();
1983  } else {
1984  alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1985  }
1986  alternatives_.Add(alternative, zone());
1987  terms_.Clear();
1988  LAST(ADD_NONE);
1989 }
1990 
1991 
1992 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1993  if (!unicode()) return false;
1994  // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1995  // necessarily mean that we need to desugar. It's probably nicer to have a
1996  // separate pass to figure out unicode desugarings.
1997  if (ignore_case()) return true;
1998  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1999  CharacterRange::Canonicalize(ranges);
2000  for (int i = ranges->length() - 1; i >= 0; i--) {
2001  uc32 from = ranges->at(i).from();
2002  uc32 to = ranges->at(i).to();
2003  // Check for non-BMP characters.
2004  if (to >= kNonBmpStart) return true;
2005  // Check for lone surrogates.
2006  if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2007  }
2008  return false;
2009 }
2010 
2011 
2012 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
2013 #ifdef V8_INTL_SUPPORT
2014  if (unicode() && ignore_case()) {
2015  icu::UnicodeSet set(c, c);
2016  set.closeOver(USET_CASE_INSENSITIVE);
2017  set.removeAllStrings();
2018  return set.size() > 1;
2019  }
2020  // In the case where ICU is not included, we act as if the unicode flag is
2021  // not set, and do not desugar.
2022 #endif // V8_INTL_SUPPORT
2023  return false;
2024 }
2025 
2026 
2027 RegExpTree* RegExpBuilder::ToRegExp() {
2028  FlushTerms();
2029  int num_alternatives = alternatives_.length();
2030  if (num_alternatives == 0) return new (zone()) RegExpEmpty();
2031  if (num_alternatives == 1) return alternatives_.last();
2032  return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
2033 }
2034 
2035 bool RegExpBuilder::AddQuantifierToAtom(
2036  int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2037  FlushPendingSurrogate();
2038  if (pending_empty_) {
2039  pending_empty_ = false;
2040  return true;
2041  }
2042  RegExpTree* atom;
2043  if (characters_ != nullptr) {
2044  DCHECK(last_added_ == ADD_CHAR);
2045  // Last atom was character.
2046  Vector<const uc16> char_vector = characters_->ToConstVector();
2047  int num_chars = char_vector.length();
2048  if (num_chars > 1) {
2049  Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
2050  text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
2051  char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2052  }
2053  characters_ = nullptr;
2054  atom = new (zone()) RegExpAtom(char_vector, flags_);
2055  FlushText();
2056  } else if (text_.length() > 0) {
2057  DCHECK(last_added_ == ADD_ATOM);
2058  atom = text_.RemoveLast();
2059  FlushText();
2060  } else if (terms_.length() > 0) {
2061  DCHECK(last_added_ == ADD_ATOM);
2062  atom = terms_.RemoveLast();
2063  if (atom->IsLookaround()) {
2064  // With /u, lookarounds are not quantifiable.
2065  if (unicode()) return false;
2066  // Lookbehinds are not quantifiable.
2067  if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2068  return false;
2069  }
2070  }
2071  if (atom->max_match() == 0) {
2072  // Guaranteed to only match an empty string.
2073  LAST(ADD_TERM);
2074  if (min == 0) {
2075  return true;
2076  }
2077  terms_.Add(atom, zone());
2078  return true;
2079  }
2080  } else {
2081  // Only call immediately after adding an atom or character!
2082  UNREACHABLE();
2083  }
2084  terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
2085  zone());
2086  LAST(ADD_TERM);
2087  return true;
2088 }
2089 
2090 } // namespace internal
2091 } // namespace v8
Definition: libplatform.h:13