V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
regexp-parser.h
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_PARSER_H_
6 #define V8_REGEXP_REGEXP_PARSER_H_
7 
8 #include "src/objects.h"
9 #include "src/objects/js-regexp.h"
10 #include "src/regexp/regexp-ast.h"
11 #include "src/zone/zone.h"
12 
13 namespace v8 {
14 namespace internal {
15 
16 struct RegExpCompileData;
17 
18 // A BufferedZoneList is an automatically growing list, just like (and backed
19 // by) a ZoneList, that is optimized for the case of adding and removing
20 // a single element. The last element added is stored outside the backing list,
21 // and if no more than one element is ever added, the ZoneList isn't even
22 // allocated.
23 // Elements must not be nullptr pointers.
24 template <typename T, int initial_size>
26  public:
27  BufferedZoneList() : list_(nullptr), last_(nullptr) {}
28 
29  // Adds element at end of list. This element is buffered and can
30  // be read using last() or removed using RemoveLast until a new Add or until
31  // RemoveLast or GetList has been called.
32  void Add(T* value, Zone* zone) {
33  if (last_ != nullptr) {
34  if (list_ == nullptr) {
35  list_ = new (zone) ZoneList<T*>(initial_size, zone);
36  }
37  list_->Add(last_, zone);
38  }
39  last_ = value;
40  }
41 
42  T* last() {
43  DCHECK(last_ != nullptr);
44  return last_;
45  }
46 
47  T* RemoveLast() {
48  DCHECK(last_ != nullptr);
49  T* result = last_;
50  if ((list_ != nullptr) && (list_->length() > 0))
51  last_ = list_->RemoveLast();
52  else
53  last_ = nullptr;
54  return result;
55  }
56 
57  T* Get(int i) {
58  DCHECK((0 <= i) && (i < length()));
59  if (list_ == nullptr) {
60  DCHECK_EQ(0, i);
61  return last_;
62  } else {
63  if (i == list_->length()) {
64  DCHECK(last_ != nullptr);
65  return last_;
66  } else {
67  return list_->at(i);
68  }
69  }
70  }
71 
72  void Clear() {
73  list_ = nullptr;
74  last_ = nullptr;
75  }
76 
77  int length() {
78  int length = (list_ == nullptr) ? 0 : list_->length();
79  return length + ((last_ == nullptr) ? 0 : 1);
80  }
81 
82  ZoneList<T*>* GetList(Zone* zone) {
83  if (list_ == nullptr) {
84  list_ = new (zone) ZoneList<T*>(initial_size, zone);
85  }
86  if (last_ != nullptr) {
87  list_->Add(last_, zone);
88  last_ = nullptr;
89  }
90  return list_;
91  }
92 
93  private:
94  ZoneList<T*>* list_;
95  T* last_;
96 };
97 
98 
99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100 class RegExpBuilder : public ZoneObject {
101  public:
102  RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
103  void AddCharacter(uc16 character);
104  void AddUnicodeCharacter(uc32 character);
105  void AddEscapedUnicodeCharacter(uc32 character);
106  // "Adds" an empty expression. Does nothing except consume a
107  // following quantifier
108  void AddEmpty();
109  void AddCharacterClass(RegExpCharacterClass* cc);
110  void AddCharacterClassForDesugaring(uc32 c);
111  void AddAtom(RegExpTree* tree);
112  void AddTerm(RegExpTree* tree);
113  void AddAssertion(RegExpTree* tree);
114  void NewAlternative(); // '|'
115  bool AddQuantifierToAtom(int min, int max,
116  RegExpQuantifier::QuantifierType type);
117  void FlushText();
118  RegExpTree* ToRegExp();
119  JSRegExp::Flags flags() const { return flags_; }
120  void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
121 
122  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
123  bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
124  bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
125 
126  private:
127  static const uc16 kNoPendingSurrogate = 0;
128  void AddLeadSurrogate(uc16 lead_surrogate);
129  void AddTrailSurrogate(uc16 trail_surrogate);
130  void FlushPendingSurrogate();
131  void FlushCharacters();
132  void FlushTerms();
133  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
134  bool NeedsDesugaringForIgnoreCase(uc32 c);
135  Zone* zone() const { return zone_; }
136  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
137 
138  Zone* zone_;
139  bool pending_empty_;
140  JSRegExp::Flags flags_;
141  ZoneList<uc16>* characters_;
142  uc16 pending_surrogate_;
145  BufferedZoneList<RegExpTree, 2> alternatives_;
146 #ifdef DEBUG
147  enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
148 #define LAST(x) last_added_ = x;
149 #else
150 #define LAST(x)
151 #endif
152 };
153 
155  public:
157  JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
158 
159  static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
160  JSRegExp::Flags flags, RegExpCompileData* result);
161 
162  RegExpTree* ParsePattern();
163  RegExpTree* ParseDisjunction();
164  RegExpTree* ParseGroup();
165 
166  // Parses a {...,...} quantifier and stores the range in the given
167  // out parameters.
168  bool ParseIntervalQuantifier(int* min_out, int* max_out);
169 
170  // Parses and returns a single escaped character. The character
171  // must not be 'b' or 'B' since they are usually handle specially.
172  uc32 ParseClassCharacterEscape();
173 
174  // Checks whether the following is a length-digit hexadecimal number,
175  // and sets the value if it is.
176  bool ParseHexEscape(int length, uc32* value);
177  bool ParseUnicodeEscape(uc32* value);
178  bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
179 
180  bool ParsePropertyClassName(std::vector<char>* name_1,
181  std::vector<char>* name_2);
182  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
183  const std::vector<char>& name_1,
184  const std::vector<char>& name_2);
185 
186  RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
187  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
188 
189  uc32 ParseOctalLiteral();
190 
191  // Tries to parse the input as a back reference. If successful it
192  // stores the result in the output parameter and returns true. If
193  // it fails it will push back the characters read so the same characters
194  // can be reparsed.
195  bool ParseBackReferenceIndex(int* index_out);
196 
197  // Parse inside a class. Either add escaped class to the range, or return
198  // false and pass parsed single character through |char_out|.
199  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
200  bool add_unicode_case_equivalents, uc32* char_out,
201  bool* is_class_escape);
202 
203  char ParseClassEscape();
204 
205  RegExpTree* ReportError(Vector<const char> message);
206  void Advance();
207  void Advance(int dist);
208  void Reset(int pos);
209 
210  // Reports whether the pattern might be used as a literal search string.
211  // Only use if the result of the parse is a single atom node.
212  bool simple();
213  bool contains_anchor() { return contains_anchor_; }
214  void set_contains_anchor() { contains_anchor_ = true; }
215  int captures_started() { return captures_started_; }
216  int position() { return next_pos_ - 1; }
217  bool failed() { return failed_; }
218  // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
219  // just read the initial flag value here.
220  bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
221 
222  static bool IsSyntaxCharacterOrSlash(uc32 c);
223 
224  static const int kMaxCaptures = 1 << 16;
225  static const uc32 kEndMarker = (1 << 21);
226 
227  private:
228  enum SubexpressionType {
229  INITIAL,
230  CAPTURE, // All positive values represent captures.
231  POSITIVE_LOOKAROUND,
232  NEGATIVE_LOOKAROUND,
233  GROUPING
234  };
235 
236  class RegExpParserState : public ZoneObject {
237  public:
238  // Push a state on the stack.
239  RegExpParserState(RegExpParserState* previous_state,
240  SubexpressionType group_type,
241  RegExpLookaround::Type lookaround_type,
242  int disjunction_capture_index,
243  const ZoneVector<uc16>* capture_name,
244  JSRegExp::Flags flags, Zone* zone)
245  : previous_state_(previous_state),
246  builder_(new (zone) RegExpBuilder(zone, flags)),
247  group_type_(group_type),
248  lookaround_type_(lookaround_type),
249  disjunction_capture_index_(disjunction_capture_index),
250  capture_name_(capture_name) {}
251  // Parser state of containing expression, if any.
252  RegExpParserState* previous_state() const { return previous_state_; }
253  bool IsSubexpression() { return previous_state_ != nullptr; }
254  // RegExpBuilder building this regexp's AST.
255  RegExpBuilder* builder() const { return builder_; }
256  // Type of regexp being parsed (parenthesized group or entire regexp).
257  SubexpressionType group_type() const { return group_type_; }
258  // Lookahead or Lookbehind.
259  RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
260  // Index in captures array of first capture in this sub-expression, if any.
261  // Also the capture index of this sub-expression itself, if group_type
262  // is CAPTURE.
263  int capture_index() const { return disjunction_capture_index_; }
264  // The name of the current sub-expression, if group_type is CAPTURE. Only
265  // used for named captures.
266  const ZoneVector<uc16>* capture_name() const { return capture_name_; }
267 
268  bool IsNamedCapture() const { return capture_name_ != nullptr; }
269 
270  // Check whether the parser is inside a capture group with the given index.
271  bool IsInsideCaptureGroup(int index);
272  // Check whether the parser is inside a capture group with the given name.
273  bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
274 
275  private:
276  // Linked list implementation of stack of states.
277  RegExpParserState* const previous_state_;
278  // Builder for the stored disjunction.
279  RegExpBuilder* const builder_;
280  // Stored disjunction type (capture, look-ahead or grouping), if any.
281  const SubexpressionType group_type_;
282  // Stored read direction.
283  const RegExpLookaround::Type lookaround_type_;
284  // Stored disjunction's capture index (if any).
285  const int disjunction_capture_index_;
286  // Stored capture name (if any).
287  const ZoneVector<uc16>* const capture_name_;
288  };
289 
290  // Return the 1-indexed RegExpCapture object, allocate if necessary.
291  RegExpCapture* GetCapture(int index);
292 
293  // Creates a new named capture at the specified index. Must be called exactly
294  // once for each named capture. Fails if a capture with the same name is
295  // encountered.
296  bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
297 
298  // Parses the name of a capture group (?<name>pattern). The name must adhere
299  // to IdentifierName in the ECMAScript standard.
300  const ZoneVector<uc16>* ParseCaptureGroupName();
301 
302  bool ParseNamedBackReference(RegExpBuilder* builder,
303  RegExpParserState* state);
304  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
305 
306  // After the initial parsing pass, patch corresponding RegExpCapture objects
307  // into all RegExpBackReferences. This is done after initial parsing in order
308  // to avoid complicating cases in which references comes before the capture.
309  void PatchNamedBackReferences();
310 
311  Handle<FixedArray> CreateCaptureNameMap();
312 
313  // Returns true iff the pattern contains named captures. May call
314  // ScanForCaptures to look ahead at the remaining pattern.
315  bool HasNamedCaptures();
316 
317  Isolate* isolate() { return isolate_; }
318  Zone* zone() const { return zone_; }
319 
320  uc32 current() { return current_; }
321  bool has_more() { return has_more_; }
322  bool has_next() { return next_pos_ < in()->length(); }
323  uc32 Next();
324  template <bool update_position>
325  uc32 ReadNext();
326  FlatStringReader* in() { return in_; }
327  void ScanForCaptures();
328 
329  Isolate* isolate_;
330  Zone* zone_;
331  Handle<String>* error_;
332  ZoneList<RegExpCapture*>* captures_;
333  ZoneList<RegExpCapture*>* named_captures_;
334  ZoneList<RegExpBackReference*>* named_back_references_;
335  FlatStringReader* in_;
336  uc32 current_;
337  // These are the flags specified outside the regexp syntax ie after the
338  // terminating '/' or in the second argument to the constructor. The current
339  // flags are stored on the RegExpBuilder.
340  JSRegExp::Flags top_level_flags_;
341  int next_pos_;
342  int captures_started_;
343  int capture_count_; // Only valid after we have scanned for captures.
344  bool has_more_;
345  bool simple_;
346  bool contains_anchor_;
347  bool is_scanned_for_captures_;
348  bool has_named_captures_; // Only valid after we have scanned for captures.
349  bool failed_;
350 };
351 
352 } // namespace internal
353 } // namespace v8
354 
355 #endif // V8_REGEXP_REGEXP_PARSER_H_
Definition: libplatform.h:13