5 #include "src/regexp/regexp-parser.h" 9 #include "src/char-predicates-inl.h" 10 #include "src/heap/factory.h" 11 #include "src/isolate.h" 12 #include "src/objects-inl.h" 13 #include "src/ostreams.h" 14 #include "src/regexp/jsregexp.h" 15 #include "src/regexp/property-sequences.h" 16 #include "src/utils.h" 18 #ifdef V8_INTL_SUPPORT 19 #include "unicode/uniset.h" 20 #endif // V8_INTL_SUPPORT 25 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
26 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
31 named_captures_(nullptr),
32 named_back_references_(nullptr),
35 top_level_flags_(flags),
41 contains_anchor_(false),
42 is_scanned_for_captures_(false),
43 has_named_captures_(false),
48 template <
bool update_position>
49 inline uc32 RegExpParser::ReadNext() {
50 int position = next_pos_;
51 uc32 c0 = in()->Get(position);
54 if (unicode() && position < in()->length() &&
55 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
56 uc16 c1 = in()->Get(position);
57 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
58 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
62 if (update_position) next_pos_ = position;
67 uc32 RegExpParser::Next() {
69 return ReadNext<false>();
75 void RegExpParser::Advance() {
77 StackLimitCheck check(isolate());
78 if (check.HasOverflowed()) {
79 if (FLAG_abort_on_stack_or_string_length_overflow) {
80 FATAL(
"Aborting on stack overflow");
82 ReportError(CStrVector(
83 MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
84 }
else if (zone()->excess_allocation()) {
85 ReportError(CStrVector(
"Regular expression too large"));
87 current_ = ReadNext<true>();
90 current_ = kEndMarker;
93 next_pos_ = in()->length() + 1;
99 void RegExpParser::Reset(
int pos) {
101 has_more_ = (pos < in()->length());
105 void RegExpParser::Advance(
int dist) {
106 next_pos_ += dist - 1;
// Accessor: returns the current value of the parser's simple_ member.
// NOTE(review): the code that assigns simple_ is not visible in this
// excerpt — confirm its exact meaning at the site that sets it.
111 bool RegExpParser::simple() {
return simple_; }
113 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
138 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
139 if (failed_)
return nullptr;
143 ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
146 current_ = kEndMarker;
147 next_pos_ = in()->length();
151 #define CHECK_FAILED ); \ 152 if (failed_) return nullptr; \ 157 RegExpTree* RegExpParser::ParsePattern() {
158 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
159 PatchNamedBackReferences(CHECK_FAILED);
163 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
180 RegExpTree* RegExpParser::ParseDisjunction() {
182 RegExpParserState initial_state(
nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
183 0,
nullptr, top_level_flags_, zone());
184 RegExpParserState* state = &initial_state;
186 RegExpBuilder* builder = initial_state.builder();
190 if (state->IsSubexpression()) {
192 return ReportError(CStrVector(
"Unterminated group"));
194 DCHECK_EQ(INITIAL, state->group_type());
196 return builder->ToRegExp();
198 if (!state->IsSubexpression()) {
199 return ReportError(CStrVector(
"Unmatched ')'"));
201 DCHECK_NE(INITIAL, state->group_type());
206 RegExpTree* body = builder->ToRegExp();
208 int end_capture_index = captures_started();
210 int capture_index = state->capture_index();
211 SubexpressionType group_type = state->group_type();
214 if (group_type == CAPTURE) {
215 if (state->IsNamedCapture()) {
216 CreateNamedCaptureAtIndex(state->capture_name(),
217 capture_index CHECK_FAILED);
219 RegExpCapture* capture = GetCapture(capture_index);
220 capture->set_body(body);
222 }
else if (group_type == GROUPING) {
223 body =
new (zone()) RegExpGroup(body);
225 DCHECK(group_type == POSITIVE_LOOKAROUND ||
226 group_type == NEGATIVE_LOOKAROUND);
227 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
228 body =
new (zone()) RegExpLookaround(
229 body, is_positive, end_capture_index - capture_index,
230 capture_index, state->lookaround_type());
234 state = state->previous_state();
235 builder = state->builder();
237 builder->AddAtom(body);
244 builder->NewAlternative();
250 return ReportError(CStrVector(
"Nothing to repeat"));
253 if (builder->multiline()) {
254 builder->AddAssertion(
new (zone()) RegExpAssertion(
255 RegExpAssertion::START_OF_LINE, builder->flags()));
257 builder->AddAssertion(
new (zone()) RegExpAssertion(
258 RegExpAssertion::START_OF_INPUT, builder->flags()));
259 set_contains_anchor();
265 RegExpAssertion::AssertionType assertion_type =
266 builder->multiline() ? RegExpAssertion::END_OF_LINE
267 : RegExpAssertion::END_OF_INPUT;
268 builder->AddAssertion(
269 new (zone()) RegExpAssertion(assertion_type, builder->flags()));
274 ZoneList<CharacterRange>* ranges =
275 new (zone()) ZoneList<CharacterRange>(2, zone());
277 if (builder->dotall()) {
279 CharacterRange::AddClassEscape(
'*', ranges,
false, zone());
282 CharacterRange::AddClassEscape(
'.', ranges,
false, zone());
285 RegExpCharacterClass* cc =
286 new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
287 builder->AddCharacterClass(cc);
291 state = ParseOpenParenthesis(state CHECK_FAILED);
292 builder = state->builder();
296 RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
297 builder->AddCharacterClass(cc->AsCharacterClass());
305 return ReportError(CStrVector(
"\\ at end of pattern"));
308 builder->AddAssertion(
new (zone()) RegExpAssertion(
309 RegExpAssertion::BOUNDARY, builder->flags()));
313 builder->AddAssertion(
new (zone()) RegExpAssertion(
314 RegExpAssertion::NON_BOUNDARY, builder->flags()));
329 ZoneList<CharacterRange>* ranges =
330 new (zone()) ZoneList<CharacterRange>(2, zone());
331 CharacterRange::AddClassEscape(
332 c, ranges, unicode() && builder->ignore_case(), zone());
333 RegExpCharacterClass* cc =
new (zone())
334 RegExpCharacterClass(zone(), ranges, builder->flags());
335 builder->AddCharacterClass(cc);
343 ZoneList<CharacterRange>* ranges =
344 new (zone()) ZoneList<CharacterRange>(2, zone());
345 std::vector<char> name_1, name_2;
346 if (ParsePropertyClassName(&name_1, &name_2)) {
347 if (AddPropertyClassRange(ranges, p ==
'P', name_1, name_2)) {
348 RegExpCharacterClass* cc =
new (zone())
349 RegExpCharacterClass(zone(), ranges, builder->flags());
350 builder->AddCharacterClass(cc);
353 if (p ==
'p' && name_2.empty()) {
354 RegExpTree* sequence = GetPropertySequence(name_1);
355 if (sequence !=
nullptr) {
356 builder->AddAtom(sequence);
361 return ReportError(CStrVector(
"Invalid property name"));
363 builder->AddCharacter(p);
377 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
379 if (state->IsInsideCaptureGroup(index)) {
387 RegExpCapture* capture = GetCapture(index);
389 new (zone()) RegExpBackReference(capture, builder->flags());
390 builder->AddAtom(atom);
397 return ReportError(CStrVector(
"Invalid escape"));
399 uc32 first_digit = Next();
400 if (first_digit ==
'8' || first_digit ==
'9') {
401 builder->AddCharacter(first_digit);
409 if (unicode() && Next() >=
'0' && Next() <=
'9') {
411 return ReportError(CStrVector(
"Invalid decimal escape"));
413 uc32 octal = ParseOctalLiteral();
414 builder->AddCharacter(octal);
421 builder->AddCharacter(
'\f');
425 builder->AddCharacter(
'\n');
429 builder->AddCharacter(
'\r');
433 builder->AddCharacter(
'\t');
437 builder->AddCharacter(
'\v');
441 uc32 controlLetter = Next();
444 uc32 letter = controlLetter & ~(
'a' ^
'A');
445 if (letter <
'A' ||
'Z' < letter) {
452 return ReportError(CStrVector(
"Invalid unicode escape"));
454 builder->AddCharacter(
'\\');
457 builder->AddCharacter(controlLetter & 0x1F);
464 if (ParseHexEscape(2, &value)) {
465 builder->AddCharacter(value);
466 }
else if (!unicode()) {
467 builder->AddCharacter(
'x');
470 return ReportError(CStrVector(
"Invalid escape"));
477 if (ParseUnicodeEscape(&value)) {
478 builder->AddEscapedUnicodeCharacter(value);
479 }
else if (!unicode()) {
480 builder->AddCharacter(
'u');
483 return ReportError(CStrVector(
"Invalid Unicode escape"));
493 if (unicode() || HasNamedCaptures()) {
495 ParseNamedBackReference(builder, state CHECK_FAILED);
503 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
504 builder->AddCharacter(current());
507 return ReportError(CStrVector(
"Invalid escape"));
514 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
515 if (parsed)
return ReportError(CStrVector(
"Nothing to repeat"));
521 return ReportError(CStrVector(
"Lone quantifier brackets"));
525 builder->AddUnicodeCharacter(current());
540 max = RegExpTree::kInfinity;
545 max = RegExpTree::kInfinity;
554 if (ParseIntervalQuantifier(&min, &max)) {
557 CStrVector(
"numbers out of order in {} quantifier"));
560 }
else if (unicode()) {
562 return ReportError(CStrVector(
"Incomplete quantifier"));
568 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
569 if (current() ==
'?') {
570 quantifier_type = RegExpQuantifier::NON_GREEDY;
572 }
else if (FLAG_regexp_possessive_quantifier && current() ==
'+') {
574 quantifier_type = RegExpQuantifier::POSSESSIVE;
577 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
578 return ReportError(CStrVector(
"Invalid quantifier"));
583 RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
584 RegExpParserState* state) {
585 RegExpLookaround::Type lookaround_type = state->lookaround_type();
586 bool is_named_capture =
false;
587 JSRegExp::Flags switch_on = JSRegExp::kNone;
588 JSRegExp::Flags switch_off = JSRegExp::kNone;
589 const ZoneVector<uc16>* capture_name =
nullptr;
590 SubexpressionType subexpr_type = CAPTURE;
592 if (current() ==
'?') {
596 subexpr_type = GROUPING;
600 lookaround_type = RegExpLookaround::LOOKAHEAD;
601 subexpr_type = POSITIVE_LOOKAROUND;
605 lookaround_type = RegExpLookaround::LOOKAHEAD;
606 subexpr_type = NEGATIVE_LOOKAROUND;
612 if (!FLAG_regexp_mode_modifiers) {
613 ReportError(CStrVector(
"Invalid group"));
617 bool flags_sense =
true;
618 while (subexpr_type != GROUPING) {
622 ReportError(CStrVector(
"Multiple dashes in flag group"));
631 JSRegExp::Flags bit = JSRegExp::kUnicode;
632 if (current() ==
'i') bit = JSRegExp::kIgnoreCase;
633 if (current() ==
'm') bit = JSRegExp::kMultiline;
634 if (current() ==
's') bit = JSRegExp::kDotAll;
635 if (((switch_on | switch_off) & bit) != 0) {
636 ReportError(CStrVector(
"Repeated flag in flag group"));
654 JSRegExp::Flags flags =
655 (state->builder()->flags() | switch_on) & ~switch_off;
656 state->builder()->set_flags(flags);
661 subexpr_type = GROUPING;
664 ReportError(CStrVector(
"Invalid flag group"));
674 lookaround_type = RegExpLookaround::LOOKBEHIND;
675 subexpr_type = POSITIVE_LOOKAROUND;
677 }
else if (Next() ==
'!') {
679 lookaround_type = RegExpLookaround::LOOKBEHIND;
680 subexpr_type = NEGATIVE_LOOKAROUND;
683 is_named_capture =
true;
684 has_named_captures_ =
true;
688 ReportError(CStrVector(
"Invalid group"));
692 if (subexpr_type == CAPTURE) {
693 if (captures_started_ >= kMaxCaptures) {
694 ReportError(CStrVector(
"Too many captures"));
699 if (is_named_capture) {
700 capture_name = ParseCaptureGroupName(CHECK_FAILED);
703 JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
706 RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
707 capture_name, flags, zone());
712 static bool IsSpecialClassEscape(uc32 c) {
734 void RegExpParser::ScanForCaptures() {
735 DCHECK(!is_scanned_for_captures_);
736 const int saved_position = position();
738 int capture_count = captures_started();
741 while ((n = current()) != kEndMarker) {
749 while ((c = current()) != kEndMarker) {
760 if (current() ==
'?') {
769 if (current() !=
'<')
break;
772 if (current() ==
'=' || current() ==
'!')
break;
777 has_named_captures_ =
true;
783 capture_count_ = capture_count;
784 is_scanned_for_captures_ =
true;
785 Reset(saved_position);
789 bool RegExpParser::ParseBackReferenceIndex(
int* index_out) {
790 DCHECK_EQ(
'\\', current());
791 DCHECK(
'1' <= Next() && Next() <=
'9');
794 int start = position();
795 int value = Next() -
'0';
799 if (IsDecimalDigit(c)) {
800 value = 10 * value + (c -
'0');
801 if (value > kMaxCaptures) {
810 if (value > captures_started()) {
811 if (!is_scanned_for_captures_) ScanForCaptures();
812 if (value > capture_count_) {
821 static void push_code_unit(ZoneVector<uc16>* v,
uint32_t code_unit) {
822 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
823 v->push_back(code_unit);
825 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
826 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
830 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
831 ZoneVector<uc16>* name =
832 new (zone()->New(
sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
834 bool at_start =
true;
840 if (c ==
'\\' && current() ==
'u') {
842 if (!ParseUnicodeEscape(&c)) {
843 ReportError(CStrVector(
"Invalid Unicode escape sequence"));
850 ReportError(CStrVector(
"Invalid capture group name"));
855 if (!IsIdentifierStart(c)) {
856 ReportError(CStrVector(
"Invalid capture group name"));
859 push_code_unit(name, c);
864 }
else if (IsIdentifierPart(c)) {
865 push_code_unit(name, c);
867 ReportError(CStrVector(
"Invalid capture group name"));
876 bool RegExpParser::CreateNamedCaptureAtIndex(
const ZoneVector<uc16>* name,
878 DCHECK(0 < index && index <= captures_started_);
879 DCHECK_NOT_NULL(name);
881 if (named_captures_ ==
nullptr) {
882 named_captures_ =
new (zone()) ZoneList<RegExpCapture*>(1, zone());
886 for (
const auto& named_capture : *named_captures_) {
887 if (*named_capture->name() == *name) {
888 ReportError(CStrVector(
"Duplicate capture group name"));
894 RegExpCapture* capture = GetCapture(index);
895 DCHECK_NULL(capture->name());
897 capture->set_name(name);
898 named_captures_->Add(capture, zone());
903 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
904 RegExpParserState* state) {
906 if (current() !=
'<') {
907 ReportError(CStrVector(
"Invalid named reference"));
912 const ZoneVector<uc16>* name = ParseCaptureGroupName();
913 if (name ==
nullptr) {
917 if (state->IsInsideCaptureGroup(name)) {
920 RegExpBackReference* atom =
921 new (zone()) RegExpBackReference(builder->flags());
922 atom->set_name(name);
924 builder->AddAtom(atom);
926 if (named_back_references_ ==
nullptr) {
927 named_back_references_ =
928 new (zone()) ZoneList<RegExpBackReference*>(1, zone());
930 named_back_references_->Add(atom, zone());
936 void RegExpParser::PatchNamedBackReferences() {
937 if (named_back_references_ ==
nullptr)
return;
939 if (named_captures_ ==
nullptr) {
940 ReportError(CStrVector(
"Invalid named capture referenced"));
947 for (
int i = 0;
i < named_back_references_->length();
i++) {
948 RegExpBackReference* ref = named_back_references_->at(
i);
951 for (
const auto& capture : *named_captures_) {
952 if (*capture->name() == *ref->name()) {
953 index = capture->index();
959 ReportError(CStrVector(
"Invalid named capture referenced"));
963 ref->set_capture(GetCapture(index));
967 RegExpCapture* RegExpParser::GetCapture(
int index) {
971 is_scanned_for_captures_ ? capture_count_ : captures_started_;
972 DCHECK(index <= know_captures);
973 if (captures_ ==
nullptr) {
974 captures_ =
new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
976 while (captures_->length() < know_captures) {
977 captures_->Add(
new (zone()) RegExpCapture(captures_->length() + 1), zone());
979 return captures_->at(index - 1);
982 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
983 if (named_captures_ ==
nullptr || named_captures_->is_empty())
984 return Handle<FixedArray>();
986 Factory* factory = isolate()->factory();
988 int len = named_captures_->length() * 2;
989 Handle<FixedArray> array = factory->NewFixedArray(len);
991 for (
int i = 0;
i < named_captures_->length();
i++) {
992 RegExpCapture* capture = named_captures_->at(
i);
993 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
994 array->set(
i * 2, *name.ToHandleChecked());
995 array->set(
i * 2 + 1, Smi::FromInt(capture->index()));
1001 bool RegExpParser::HasNamedCaptures() {
1002 if (has_named_captures_ || is_scanned_for_captures_) {
1003 return has_named_captures_;
1007 DCHECK(is_scanned_for_captures_);
1008 return has_named_captures_;
1011 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
int index) {
1012 for (RegExpParserState* s =
this; s !=
nullptr; s = s->previous_state()) {
1013 if (s->group_type() != CAPTURE)
continue;
1015 if (index == s->capture_index())
return true;
1017 if (index > s->capture_index())
return false;
1022 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1023 const ZoneVector<uc16>* name) {
1024 DCHECK_NOT_NULL(name);
1025 for (RegExpParserState* s =
this; s !=
nullptr; s = s->previous_state()) {
1026 if (s->capture_name() ==
nullptr)
continue;
1027 if (*s->capture_name() == *name)
return true;
1039 bool RegExpParser::ParseIntervalQuantifier(
int* min_out,
int* max_out) {
1040 DCHECK_EQ(current(),
'{');
1041 int start = position();
1044 if (!IsDecimalDigit(current())) {
1048 while (IsDecimalDigit(current())) {
1049 int next = current() -
'0';
1050 if (min > (RegExpTree::kInfinity - next) / 10) {
1054 }
while (IsDecimalDigit(current()));
1055 min = RegExpTree::kInfinity;
1058 min = 10 * min + next;
1062 if (current() ==
'}') {
1065 }
else if (current() ==
',') {
1067 if (current() ==
'}') {
1068 max = RegExpTree::kInfinity;
1071 while (IsDecimalDigit(current())) {
1072 int next = current() -
'0';
1073 if (max > (RegExpTree::kInfinity - next) / 10) {
1076 }
while (IsDecimalDigit(current()));
1077 max = RegExpTree::kInfinity;
1080 max = 10 * max + next;
1083 if (current() !=
'}') {
1099 uc32 RegExpParser::ParseOctalLiteral() {
1100 DCHECK((
'0' <= current() && current() <=
'7') || current() == kEndMarker);
1104 uc32 value = current() -
'0';
1106 if (
'0' <= current() && current() <=
'7') {
1107 value = value * 8 + current() -
'0';
1109 if (value < 32 &&
'0' <= current() && current() <=
'7') {
1110 value = value * 8 + current() -
'0';
1118 bool RegExpParser::ParseHexEscape(
int length, uc32* value) {
1119 int start = position();
1121 for (
int i = 0;
i < length; ++
i) {
1123 int d = HexValue(c);
1136 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1140 if (current() ==
'{' && unicode()) {
1141 int start = position();
1143 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1144 if (current() ==
'}') {
1153 bool result = ParseHexEscape(4, value);
1154 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1155 current() ==
'\\') {
1157 int start = position();
1158 if (Next() ==
'u') {
1161 if (ParseHexEscape(4, &trail) &&
1162 unibrow::Utf16::IsTrailSurrogate(trail)) {
1163 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1164 static_cast<uc16>(trail));
1173 #ifdef V8_INTL_SUPPORT 1177 bool IsExactPropertyAlias(
const char* property_name, UProperty property) {
1178 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1179 if (short_name !=
nullptr && strcmp(property_name, short_name) == 0)
1181 for (
int i = 0;;
i++) {
1182 const char* long_name = u_getPropertyName(
1183 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME +
i));
1184 if (long_name ==
nullptr)
break;
1185 if (strcmp(property_name, long_name) == 0)
return true;
1190 bool IsExactPropertyValueAlias(
const char* property_value_name,
1191 UProperty property, int32_t property_value) {
1192 const char* short_name =
1193 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1194 if (short_name !=
nullptr && strcmp(property_value_name, short_name) == 0) {
1197 for (
int i = 0;;
i++) {
1198 const char* long_name = u_getPropertyValueName(
1199 property, property_value,
1200 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME +
i));
1201 if (long_name ==
nullptr)
break;
1202 if (strcmp(property_value_name, long_name) == 0)
return true;
1207 bool LookupPropertyValueName(UProperty property,
1208 const char* property_value_name,
bool negate,
1209 ZoneList<CharacterRange>* result, Zone* zone) {
1210 UProperty property_for_lookup = property;
1211 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1214 property_for_lookup = UCHAR_SCRIPT;
1216 int32_t property_value =
1217 u_getPropertyValueEnum(property_for_lookup, property_value_name);
1218 if (property_value == UCHAR_INVALID_CODE)
return false;
1222 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1227 UErrorCode ec = U_ZERO_ERROR;
1228 icu::UnicodeSet
set;
1229 set.applyIntPropertyValue(property, property_value, ec);
1230 bool success = ec == U_ZERO_ERROR && !
set.isEmpty();
1233 set.removeAllStrings();
1234 if (negate)
set.complement();
1235 for (
int i = 0;
i <
set.getRangeCount();
i++) {
1237 CharacterRange::Range(
set.getRangeStart(
i),
set.getRangeEnd(
i)),
1245 inline bool NameEquals(
const char* name,
const char (&literal)[N]) {
1246 return strncmp(name, literal, N + 1) == 0;
1249 bool LookupSpecialPropertyValueName(
const char* name,
1250 ZoneList<CharacterRange>* result,
1251 bool negate, Zone* zone) {
1252 if (NameEquals(name,
"Any")) {
1257 result->Add(CharacterRange::Everything(), zone);
1259 }
else if (NameEquals(name,
"ASCII")) {
1260 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1261 : CharacterRange::Range(0x0, 0x7F),
1263 }
else if (NameEquals(name,
"Assigned")) {
1264 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY,
"Unassigned",
1265 !negate, result, zone);
1274 bool IsSupportedBinaryProperty(UProperty property) {
1276 case UCHAR_ALPHABETIC:
1279 case UCHAR_ASCII_HEX_DIGIT:
1281 case UCHAR_BIDI_CONTROL:
1282 case UCHAR_BIDI_MIRRORED:
1283 case UCHAR_CASE_IGNORABLE:
1285 case UCHAR_CHANGES_WHEN_CASEFOLDED:
1286 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1287 case UCHAR_CHANGES_WHEN_LOWERCASED:
1288 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1289 case UCHAR_CHANGES_WHEN_TITLECASED:
1290 case UCHAR_CHANGES_WHEN_UPPERCASED:
1292 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1293 case UCHAR_DEPRECATED:
1294 case UCHAR_DIACRITIC:
1296 case UCHAR_EMOJI_COMPONENT:
1297 case UCHAR_EMOJI_MODIFIER_BASE:
1298 case UCHAR_EMOJI_MODIFIER:
1299 case UCHAR_EMOJI_PRESENTATION:
1300 case UCHAR_EXTENDED_PICTOGRAPHIC:
1301 case UCHAR_EXTENDER:
1302 case UCHAR_GRAPHEME_BASE:
1303 case UCHAR_GRAPHEME_EXTEND:
1304 case UCHAR_HEX_DIGIT:
1305 case UCHAR_ID_CONTINUE:
1306 case UCHAR_ID_START:
1307 case UCHAR_IDEOGRAPHIC:
1308 case UCHAR_IDS_BINARY_OPERATOR:
1309 case UCHAR_IDS_TRINARY_OPERATOR:
1310 case UCHAR_JOIN_CONTROL:
1311 case UCHAR_LOGICAL_ORDER_EXCEPTION:
1312 case UCHAR_LOWERCASE:
1314 case UCHAR_NONCHARACTER_CODE_POINT:
1315 case UCHAR_PATTERN_SYNTAX:
1316 case UCHAR_PATTERN_WHITE_SPACE:
1317 case UCHAR_QUOTATION_MARK:
1319 case UCHAR_REGIONAL_INDICATOR:
1321 case UCHAR_SOFT_DOTTED:
1322 case UCHAR_TERMINAL_PUNCTUATION:
1323 case UCHAR_UNIFIED_IDEOGRAPH:
1324 case UCHAR_UPPERCASE:
1325 case UCHAR_VARIATION_SELECTOR:
1326 case UCHAR_WHITE_SPACE:
1327 case UCHAR_XID_CONTINUE:
1328 case UCHAR_XID_START:
1336 bool IsUnicodePropertyValueCharacter(
char c) {
1343 if (
'a' <= c && c <=
'z')
return true;
1344 if (
'A' <= c && c <=
'Z')
return true;
1345 if (
'0' <= c && c <=
'9')
return true;
1351 bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
1352 std::vector<char>* name_2) {
1353 DCHECK(name_1->empty());
1354 DCHECK(name_2->empty());
1363 if (current() ==
'{') {
1365 for (Advance(); current() !=
'}' && current() !=
'='; Advance()) {
1366 if (!IsUnicodePropertyValueCharacter(current()))
return false;
1367 if (!has_next())
return false;
1368 name_1->push_back(static_cast<char>(current()));
1370 if (current() ==
'=') {
1371 for (Advance(); current() !=
'}'; Advance()) {
1372 if (!IsUnicodePropertyValueCharacter(current()))
return false;
1373 if (!has_next())
return false;
1374 name_2->push_back(static_cast<char>(current()));
1376 name_2->push_back(0);
1382 name_1->push_back(0);
1384 DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1385 DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1389 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1391 const std::vector<char>& name_1,
1392 const std::vector<char>& name_2) {
1393 if (name_2.empty()) {
1395 const char* name = name_1.data();
1396 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1401 if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1405 UProperty
property = u_getPropertyEnum(name);
1406 if (!IsSupportedBinaryProperty(property))
return false;
1407 if (!IsExactPropertyAlias(name, property))
return false;
1408 return LookupPropertyValueName(property, negate ?
"N" :
"Y",
false, add_to,
1413 const char* property_name = name_1.data();
1414 const char* value_name = name_2.data();
1415 UProperty
property = u_getPropertyEnum(property_name);
1416 if (!IsExactPropertyAlias(property_name, property))
return false;
1417 if (property == UCHAR_GENERAL_CATEGORY) {
1419 property = UCHAR_GENERAL_CATEGORY_MASK;
1420 }
else if (property != UCHAR_SCRIPT &&
1421 property != UCHAR_SCRIPT_EXTENSIONS) {
1424 return LookupPropertyValueName(property, value_name, negate, add_to,
1429 RegExpTree* RegExpParser::GetPropertySequence(
const std::vector<char>& name_1) {
1430 if (!FLAG_harmony_regexp_sequence)
return nullptr;
1431 const char* name = name_1.data();
1432 const uc32* sequence_list =
nullptr;
1433 JSRegExp::Flags flags = JSRegExp::kUnicode;
1434 if (NameEquals(name,
"Emoji_Flag_Sequence")) {
1435 sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
1436 }
else if (NameEquals(name,
"Emoji_Tag_Sequence")) {
1437 sequence_list = UnicodePropertySequences::kEmojiTagSequences;
1438 }
else if (NameEquals(name,
"Emoji_ZWJ_Sequence")) {
1439 sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
1441 if (sequence_list !=
nullptr) {
1444 RegExpBuilder builder(zone(), flags);
1446 while (*sequence_list != 0) {
1447 builder.AddUnicodeCharacter(*sequence_list);
1451 if (*sequence_list == 0)
break;
1452 builder.NewAlternative();
1454 return builder.ToRegExp();
1457 if (NameEquals(name,
"Emoji_Keycap_Sequence")) {
1460 RegExpBuilder builder(zone(), flags);
1461 ZoneList<CharacterRange>* prefix_ranges =
1462 new (zone()) ZoneList<CharacterRange>(2, zone());
1463 prefix_ranges->Add(CharacterRange::Range(
'0',
'9'), zone());
1464 prefix_ranges->Add(CharacterRange::Singleton(
'#'), zone());
1465 prefix_ranges->Add(CharacterRange::Singleton(
'*'), zone());
1466 builder.AddCharacterClass(
1467 new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
1468 builder.AddCharacter(0xFE0F);
1469 builder.AddCharacter(0x20E3);
1470 return builder.ToRegExp();
1471 }
else if (NameEquals(name,
"Emoji_Modifier_Sequence")) {
1474 RegExpBuilder builder(zone(), flags);
1475 ZoneList<CharacterRange>* modifier_base_ranges =
1476 new (zone()) ZoneList<CharacterRange>(2, zone());
1477 LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE,
"Y",
false,
1478 modifier_base_ranges, zone());
1479 builder.AddCharacterClass(
1480 new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
1481 ZoneList<CharacterRange>* modifier_ranges =
1482 new (zone()) ZoneList<CharacterRange>(2, zone());
1483 LookupPropertyValueName(UCHAR_EMOJI_MODIFIER,
"Y",
false, modifier_ranges,
1485 builder.AddCharacterClass(
1486 new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
1487 return builder.ToRegExp();
1493 #else // V8_INTL_SUPPORT 1495 bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
1496 std::vector<char>* name_2) {
1500 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1502 const std::vector<char>& name_1,
1503 const std::vector<char>& name_2) {
1507 RegExpTree* RegExpParser::GetPropertySequence(
const std::vector<char>& name) {
1511 #endif // V8_INTL_SUPPORT 1513 bool RegExpParser::ParseUnlimitedLengthHexNumber(
int max_value, uc32* value) {
1515 int d = HexValue(current());
1521 if (x > max_value) {
1525 d = HexValue(current());
1532 uc32 RegExpParser::ParseClassCharacterEscape() {
1533 DCHECK_EQ(
'\\', current());
1534 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1536 switch (current()) {
1558 uc32 controlLetter = Next();
1559 uc32 letter = controlLetter & ~(
'A' ^
'a');
1563 if (letter >=
'A' && letter <=
'Z') {
1567 return controlLetter & 0x1F;
1571 ReportError(CStrVector(
"Invalid class escape"));
1574 if ((controlLetter >=
'0' && controlLetter <=
'9') ||
1575 controlLetter ==
'_') {
1577 return controlLetter & 0x1F;
1586 if (unicode() && !(Next() >=
'0' && Next() <=
'9')) {
1604 ReportError(CStrVector(
"Invalid class escape"));
1607 return ParseOctalLiteral();
1611 if (ParseHexEscape(2, &value))
return value;
1614 ReportError(CStrVector(
"Invalid escape"));
1624 if (ParseUnicodeEscape(&value))
return value;
1627 ReportError(CStrVector(
"Invalid unicode escape"));
1635 uc32 result = current();
1638 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result ==
'-') {
1642 ReportError(CStrVector(
"Invalid escape"));
1649 void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1651 bool add_unicode_case_equivalents,
1652 uc32* char_out,
bool* is_class_escape) {
1653 uc32 current_char = current();
1654 if (current_char ==
'\\') {
1662 CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1663 add_unicode_case_equivalents, zone);
1665 *is_class_escape =
true;
1669 ReportError(CStrVector(
"\\ at end of pattern"));
1674 bool negate = Next() ==
'P';
1676 std::vector<char> name_1, name_2;
1677 if (!ParsePropertyClassName(&name_1, &name_2) ||
1678 !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1679 ReportError(CStrVector(
"Invalid property name in character class"));
1681 *is_class_escape =
true;
1688 *char_out = ParseClassCharacterEscape();
1689 *is_class_escape =
false;
1692 *char_out = current_char;
1693 *is_class_escape =
false;
// Parses a bracketed character class. current() must be '[' on entry
// (DCHECKed). Elements are read with ParseClassEscape() and collected into a
// zone-allocated range list; the function returns a RegExpCharacterClass
// built from that list and builder->flags(), marked NEGATED when is_negated
// is set. Failures are returned via ReportError() with one of the three
// messages declared below.
1697 RegExpTree* RegExpParser::ParseCharacterClass(
const RegExpBuilder* builder) {
1698 static const char* kUnterminated =
"Unterminated character class";
1699 static const char* kRangeInvalid =
"Invalid character class";
1700 static const char* kRangeOutOfOrder =
"Range out of order in character class";
1702 DCHECK_EQ(current(),
'[');
1704 bool is_negated =
false;
1705 if (current() ==
'^') {
1709 ZoneList<CharacterRange>* ranges =
1710 new (zone()) ZoneList<CharacterRange>(2, zone());
// Case equivalents are only requested when both the unicode mode of the
// parser and the ignore-case flag of the builder are in effect.
1711 bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1712 while (has_more() && current() !=
']') {
1713 uc32 char_1, char_2;
1714 bool is_class_1, is_class_2;
// First element. When it is a class escape its ranges are already added to
// |ranges| by ParseClassEscape and char_1 is not used.
1715 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1716 &is_class_1 CHECK_FAILED);
1717 if (current() ==
'-') {
1719 if (current() == kEndMarker) {
1723 }
else if (current() ==
']') {
// ']' follows: char_1 (if it was a plain character) and a literal '-' are
// each added as singletons.
1724 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1725 ranges->Add(CharacterRange::Singleton(
'-'), zone());
// Second element, the candidate upper end of a range.
1728 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1729 &is_class_2 CHECK_FAILED);
// Reached when either endpoint was a class escape rather than a character.
1730 if (is_class_1 || is_class_2) {
1734 return ReportError(CStrVector(kRangeInvalid));
1736 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1737 ranges->Add(CharacterRange::Singleton(
'-'), zone());
1738 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
// A range whose start is greater than its end is rejected.
1742 if (char_1 > char_2) {
1743 return ReportError(CStrVector(kRangeOutOfOrder));
1745 ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1747 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1751 return ReportError(CStrVector(kUnterminated));
// character_class_flags is only assigned when the class is negated;
// otherwise it keeps its default-constructed value.
1754 RegExpCharacterClass::CharacterClassFlags character_class_flags;
1755 if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1756 return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
1757 character_class_flags);
// Parses |input| under |flags| and records the outcome in |result|, which
// must be non-null (DCHECKed). A RegExpParser is constructed with
// &result->error as its error slot. Returns true when the parser did not
// fail.
//
// The DCHECKs state the expected invariants: a failed parse has a null tree
// and a non-null result->error; the other assertions expect a non-null tree
// and a null error (presumably the success path).
1764 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1765 FlatStringReader* input, JSRegExp::Flags flags,
1766 RegExpCompileData* result) {
1767 DCHECK(result !=
nullptr);
1768 RegExpParser parser(input, &result->error, flags, isolate, zone);
1769 RegExpTree* tree = parser.ParsePattern();
1770 if (parser.failed()) {
1771 DCHECK(tree ==
nullptr);
1772 DCHECK(!result->error.is_null());
1774 DCHECK(tree !=
nullptr);
1775 DCHECK(result->error.is_null());
// Optional debug output of the parsed tree.
1776 if (FLAG_trace_regexp_parser) {
1778 tree->Print(os, zone);
1781 result->tree = tree;
1782 int capture_count = parser.captures_started();
// "simple" requires a tree that is a single atom, a parser that still
// considers the pattern simple, and no capture groups.
1783 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1784 result->contains_anchor = parser.contains_anchor();
1785 result->capture_name_map = parser.CreateCaptureNameMap();
1786 result->capture_count = capture_count;
1788 return !parser.failed();
// Constructs a builder from a zone and regexp flags. The initializers visible
// here start it with no pending empty term, no buffered characters, no
// pending surrogate, and last_added_ set to ADD_NONE.
1791 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1793 pending_empty_(false),
1795 characters_(nullptr),
1796 pending_surrogate_(kNoPendingSurrogate),
1801 last_added_(ADD_NONE)
// Records |lead_surrogate| (must be a UTF-16 lead surrogate, DCHECKed) as the
// pending surrogate. Any previously pending surrogate is flushed first. The
// value stays pending so that a following AddTrailSurrogate() call can
// combine with it.
1807 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1808 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1809 FlushPendingSurrogate();
1811 pending_surrogate_ = lead_surrogate;
// Adds |trail_surrogate| (must be a UTF-16 trail surrogate, DCHECKed).
//
// When a surrogate is pending it is taken as the lead, the pending slot is
// cleared, and the pair is combined into one code point. A combined code
// point that needs ignore-case desugaring is added as a character class;
// the remaining visible lines build a two-code-unit RegExpAtom from the pair.
// The final two statements store the trail as pending and flush it at once
// (presumably the path taken when no lead was pending).
1815 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1816 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1817 if (pending_surrogate_ != kNoPendingSurrogate) {
1818 uc16 lead_surrogate = pending_surrogate_;
1819 pending_surrogate_ = kNoPendingSurrogate;
1820 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1822 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1823 if (NeedsDesugaringForIgnoreCase(combined)) {
1824 AddCharacterClassForDesugaring(combined);
// Keep the pair as two UTF-16 code units in a single atom.
1826 ZoneList<uc16> surrogate_pair(2, zone());
1827 surrogate_pair.Add(lead_surrogate, zone());
1828 surrogate_pair.Add(trail_surrogate, zone());
1830 new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
1834 pending_surrogate_ = trail_surrogate;
1835 FlushPendingSurrogate();
// If a surrogate is pending, clears the pending slot and emits that lone
// surrogate as a single-character class via AddCharacterClassForDesugaring().
// The slot is cleared before the call, so the pending state is already reset
// when the class is added.
1840 void RegExpBuilder::FlushPendingSurrogate() {
1841 if (pending_surrogate_ != kNoPendingSurrogate) {
1843 uc32 c = pending_surrogate_;
1844 pending_surrogate_ = kNoPendingSurrogate;
1845 AddCharacterClassForDesugaring(c);
// Flushes any pending surrogate, clears pending_empty_, and, when characters
// are buffered, wraps them in a RegExpAtom that is appended to text_. The
// character buffer is reset to nullptr afterwards.
1850 void RegExpBuilder::FlushCharacters() {
1851 FlushPendingSurrogate();
1852 pending_empty_ =
false;
1853 if (characters_ !=
nullptr) {
1855 new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
1856 characters_ =
nullptr;
1857 text_.Add(atom, zone());
// Moves the accumulated text elements from text_ into terms_. A single
// element is added to terms_ as is; several elements are merged into one new
// RegExpText, which is then added.
1863 void RegExpBuilder::FlushText() {
1865 int num_text = text_.length();
1866 if (num_text == 0) {
1868 }
else if (num_text == 1) {
1869 terms_.Add(text_.last(), zone());
1871 RegExpText* text =
new (zone()) RegExpText(zone());
// Append every text element, in order, to the combined RegExpText.
1872 for (
int i = 0;
i < num_text;
i++) text_.Get(
i)->AppendToText(text, zone());
1873 terms_.Add(text, zone());
// Adds one UTF-16 code unit. Any pending surrogate is flushed and
// pending_empty_ is cleared first. A character that needs ignore-case
// desugaring is added as a character class; the remaining lines append |c| to
// the characters_ buffer, creating it on first use with an initial capacity
// of 4.
1879 void RegExpBuilder::AddCharacter(uc16 c) {
1880 FlushPendingSurrogate();
1881 pending_empty_ =
false;
1882 if (NeedsDesugaringForIgnoreCase(c)) {
1883 AddCharacterClassForDesugaring(c);
1885 if (characters_ ==
nullptr) {
1886 characters_ =
new (zone()) ZoneList<uc16>(4, zone());
1888 characters_->Add(c, zone());
// Adds a code point, dispatching on its value:
//  - above kMaxNonSurrogateCharCode: split into a lead and a trail surrogate
//    and add both;
//  - in unicode mode, a lead or trail surrogate value: routed to
//    AddLeadSurrogate() / AddTrailSurrogate();
//  - the last visible statement adds |c| as a plain 16-bit character.
1894 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1895 if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1897 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1898 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1899 }
else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1900 AddLeadSurrogate(c);
1901 }
else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1902 AddTrailSurrogate(c);
1904 AddCharacter(static_cast<uc16>(c));
// Adds |character| with a pending-surrogate flush both before and after.
// The first flush emits any earlier pending surrogate on its own; the second
// emits a surrogate left pending by this call, so it cannot combine with
// whatever is added next.
1908 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1911 FlushPendingSurrogate();
1912 AddUnicodeCharacter(character);
1913 FlushPendingSurrogate();
1916 void RegExpBuilder::AddEmpty() { pending_empty_ =
true; }
// Adds the character class |cc|. The visible code first asks
// NeedsDesugaringForUnicode(cc) and branches on the answer.
1919 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1920 if (NeedsDesugaringForUnicode(cc)) {
// Adds, through AddTerm(), a zone-allocated RegExpCharacterClass whose range
// list is the singleton range containing only |c|.
1929 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1930 AddTerm(
new (zone()) RegExpCharacterClass(
1931 zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
// Adds an atom. The term is first tested for emptiness; a text element is
// appended to text_, and the last visible statement appends the term to
// terms_.
1936 void RegExpBuilder::AddAtom(RegExpTree* term) {
1937 if (term->IsEmpty()) {
1941 if (term->IsTextElement()) {
1943 text_.Add(term, zone());
1946 terms_.Add(term, zone());
// Appends |term| to terms_.
1952 void RegExpBuilder::AddTerm(RegExpTree* term) {
1954 terms_.Add(term, zone());
// Appends an assertion to terms_, unless the last term is already an
// assertion of the same assertion_type(), in which case the new one is
// dropped as redundant.
1959 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1961 if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
1963 RegExpAssertion* last = terms_.last()->AsAssertion();
1964 RegExpAssertion* next = assert->AsAssertion();
// Same type as the previous assertion: nothing to add.
1965 if (last->assertion_type() == next->assertion_type())
return;
1967 terms_.Add(assert, zone());
1972 void RegExpBuilder::NewAlternative() { FlushTerms(); }
// Collapses the collected terms into one alternative and appends it to
// alternatives_: a RegExpEmpty for no terms, the term itself for exactly one,
// and a RegExpAlternative over the term list in the remaining assignment.
1975 void RegExpBuilder::FlushTerms() {
1977 int num_terms = terms_.length();
1978 RegExpTree* alternative;
1979 if (num_terms == 0) {
1980 alternative =
new (zone()) RegExpEmpty();
1981 }
else if (num_terms == 1) {
1982 alternative = terms_.last();
1984 alternative =
new (zone()) RegExpAlternative(terms_.GetList(zone()));
1986 alternatives_.Add(alternative, zone());
// Decides whether the character class |cc| must be desugared.
// Returns false outside unicode mode and true whenever ignore-case is also
// set. Otherwise the ranges are canonicalized and scanned; the answer is true
// as soon as a range reaches kNonBmpStart or intersects the span from
// kLeadSurrogateStart to kTrailSurrogateEnd.
1992 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1993 if (!unicode())
return false;
1997 if (ignore_case())
return true;
1998 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1999 CharacterRange::Canonicalize(ranges);
// Walk the ranges from last to first.
2000 for (
int i = ranges->length() - 1;
i >= 0;
i--) {
2001 uc32 from = ranges->at(
i).from();
2002 uc32 to = ranges->at(
i).to();
// The range extends to kNonBmpStart or beyond.
2004 if (to >= kNonBmpStart)
return true;
// The range overlaps the surrogate span.
2006 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart)
return true;
// NeedsDesugaringForIgnoreCase: with V8_INTL_SUPPORT, in unicode mode with
// ignore-case set, builds an ICU set holding only |c|, closes it over
// case-insensitive equivalence, drops string members, and reports whether
// more than one element remains.
2012 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
2013 #ifdef V8_INTL_SUPPORT 2014 if (unicode() && ignore_case()) {
2015 icu::UnicodeSet
set(c, c);
2016 set.closeOver(USET_CASE_INSENSITIVE);
2017 set.removeAllStrings();
2018 return set.size() > 1;
2022 #endif // V8_INTL_SUPPORT 2027 RegExpTree* RegExpBuilder::ToRegExp() {
// ToRegExp: returns a RegExpEmpty when there are no alternatives, the
// alternative itself when there is exactly one, and otherwise a
// RegExpDisjunction over all of them.
2029 int num_alternatives = alternatives_.length();
2030 if (num_alternatives == 0)
return new (zone()) RegExpEmpty();
2031 if (num_alternatives == 1)
return alternatives_.last();
2032 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
2035 bool RegExpBuilder::AddQuantifierToAtom(
2036 int min,
int max, RegExpQuantifier::QuantifierType quantifier_type) {
2037 FlushPendingSurrogate();
2038 if (pending_empty_) {
2039 pending_empty_ =
false;
2043 if (characters_ !=
nullptr) {
2044 DCHECK(last_added_ == ADD_CHAR);
2046 Vector<const uc16> char_vector = characters_->ToConstVector();
2047 int num_chars = char_vector.length();
2048 if (num_chars > 1) {
2049 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
2050 text_.Add(
new (zone()) RegExpAtom(prefix, flags_), zone());
2051 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2053 characters_ =
nullptr;
2054 atom =
new (zone()) RegExpAtom(char_vector, flags_);
2056 }
else if (text_.length() > 0) {
2057 DCHECK(last_added_ == ADD_ATOM);
2058 atom = text_.RemoveLast();
2060 }
else if (terms_.length() > 0) {
2061 DCHECK(last_added_ == ADD_ATOM);
2062 atom = terms_.RemoveLast();
2063 if (atom->IsLookaround()) {
2065 if (unicode())
return false;
2067 if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2071 if (atom->max_match() == 0) {
2077 terms_.Add(atom, zone());
2084 terms_.Add(
new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),