V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
regexp-ast.h
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_AST_H_
6 #define V8_REGEXP_REGEXP_AST_H_
7 
8 #include "src/objects.h"
9 #include "src/objects/js-regexp.h"
10 #include "src/objects/string.h"
11 #include "src/utils.h"
12 #include "src/zone/zone-containers.h"
13 #include "src/zone/zone.h"
14 
15 namespace v8 {
16 namespace internal {
17 
18 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
19  VISIT(Disjunction) \
20  VISIT(Alternative) \
21  VISIT(Assertion) \
22  VISIT(CharacterClass) \
23  VISIT(Atom) \
24  VISIT(Quantifier) \
25  VISIT(Capture) \
26  VISIT(Group) \
27  VISIT(Lookaround) \
28  VISIT(BackReference) \
29  VISIT(Empty) \
30  VISIT(Text)
31 
32 #define FORWARD_DECLARE(Name) class RegExp##Name;
33 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
34 #undef FORWARD_DECLARE
35 
36 class RegExpCompiler;
37 class RegExpNode;
38 class RegExpTree;
39 
41  public:
42  virtual ~RegExpVisitor() = default;
43 #define MAKE_CASE(Name) \
44  virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
45  FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
46 #undef MAKE_CASE
47 };
48 
49 
50 // A simple closed interval.
51 class Interval {
52  public:
53  Interval() : from_(kNone), to_(kNone) {}
54  Interval(int from, int to) : from_(from), to_(to) {}
55  Interval Union(Interval that) {
56  if (that.from_ == kNone)
57  return *this;
58  else if (from_ == kNone)
59  return that;
60  else
61  return Interval(Min(from_, that.from_), Max(to_, that.to_));
62  }
63  bool Contains(int value) { return (from_ <= value) && (value <= to_); }
64  bool is_empty() { return from_ == kNone; }
65  int from() const { return from_; }
66  int to() const { return to_; }
67  static Interval Empty() { return Interval(); }
68  static const int kNone = -1;
69 
70  private:
71  int from_;
72  int to_;
73 };
74 
75 
76 // Represents code units in the range from from_ to to_, both ends are
77 // inclusive.
79  public:
80  CharacterRange() : from_(0), to_(0) {}
81  // For compatibility with the CHECK_OK macro
82  CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
83  static void AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
84  Zone* zone);
85  // Add class escapes. Add case equivalent closure for \w and \W if necessary.
86  static void AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
87  bool add_unicode_case_equivalents, Zone* zone);
88  static Vector<const int> GetWordBounds();
89  static inline CharacterRange Singleton(uc32 value) {
90  return CharacterRange(value, value);
91  }
92  static inline CharacterRange Range(uc32 from, uc32 to) {
93  DCHECK(0 <= from && to <= String::kMaxCodePoint);
94  DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
95  return CharacterRange(from, to);
96  }
97  static inline CharacterRange Everything() {
98  return CharacterRange(0, String::kMaxCodePoint);
99  }
100  static inline ZoneList<CharacterRange>* List(Zone* zone,
101  CharacterRange range) {
103  new (zone) ZoneList<CharacterRange>(1, zone);
104  list->Add(range, zone);
105  return list;
106  }
107  bool Contains(uc32 i) { return from_ <= i && i <= to_; }
108  uc32 from() const { return from_; }
109  void set_from(uc32 value) { from_ = value; }
110  uc32 to() const { return to_; }
111  void set_to(uc32 value) { to_ = value; }
112  bool is_valid() { return from_ <= to_; }
113  bool IsEverything(uc32 max) { return from_ == 0 && to_ >= max; }
114  bool IsSingleton() { return (from_ == to_); }
115  static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
116  ZoneList<CharacterRange>* ranges,
117  bool is_one_byte);
118  // Whether a range list is in canonical form: Ranges ordered by from value,
119  // and ranges non-overlapping and non-adjacent.
120  static bool IsCanonical(ZoneList<CharacterRange>* ranges);
121  // Convert range list to canonical form. The characters covered by the ranges
122  // will still be the same, but no character is in more than one range, and
123  // adjacent ranges are merged. The resulting list may be shorter than the
124  // original, but cannot be longer.
125  static void Canonicalize(ZoneList<CharacterRange>* ranges);
126  // Negate the contents of a character range in canonical form.
127  static void Negate(ZoneList<CharacterRange>* src,
128  ZoneList<CharacterRange>* dst, Zone* zone);
129  static const int kStartMarker = (1 << 24);
130  static const int kPayloadMask = (1 << 24) - 1;
131 
132  private:
133  CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
134 
135  uc32 from_;
136  uc32 to_;
137 };
138 
139 class CharacterSet final {
140  public:
141  explicit CharacterSet(uc16 standard_set_type)
142  : ranges_(nullptr), standard_set_type_(standard_set_type) {}
143  explicit CharacterSet(ZoneList<CharacterRange>* ranges)
144  : ranges_(ranges), standard_set_type_(0) {}
145  ZoneList<CharacterRange>* ranges(Zone* zone);
146  uc16 standard_set_type() const { return standard_set_type_; }
147  void set_standard_set_type(uc16 special_set_type) {
148  standard_set_type_ = special_set_type;
149  }
150  bool is_standard() { return standard_set_type_ != 0; }
151  void Canonicalize();
152 
153  private:
154  ZoneList<CharacterRange>* ranges_;
155  // If non-zero, the value represents a standard set (e.g., all whitespace
156  // characters) without having to expand the ranges.
157  uc16 standard_set_type_;
158 };
159 
160 class TextElement final {
161  public:
162  enum TextType { ATOM, CHAR_CLASS };
163 
164  static TextElement Atom(RegExpAtom* atom);
165  static TextElement CharClass(RegExpCharacterClass* char_class);
166 
167  int cp_offset() const { return cp_offset_; }
168  void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
169  int length() const;
170 
171  TextType text_type() const { return text_type_; }
172 
173  RegExpTree* tree() const { return tree_; }
174 
175  RegExpAtom* atom() const {
176  DCHECK(text_type() == ATOM);
177  return reinterpret_cast<RegExpAtom*>(tree());
178  }
179 
180  RegExpCharacterClass* char_class() const {
181  DCHECK(text_type() == CHAR_CLASS);
182  return reinterpret_cast<RegExpCharacterClass*>(tree());
183  }
184 
185  private:
186  TextElement(TextType text_type, RegExpTree* tree)
187  : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
188 
189  int cp_offset_;
190  TextType text_type_;
191  RegExpTree* tree_;
192 };
193 
194 
195 class RegExpTree : public ZoneObject {
196  public:
197  static const int kInfinity = kMaxInt;
198  virtual ~RegExpTree() = default;
199  virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
200  virtual RegExpNode* ToNode(RegExpCompiler* compiler,
201  RegExpNode* on_success) = 0;
202  virtual bool IsTextElement() { return false; }
203  virtual bool IsAnchoredAtStart() { return false; }
204  virtual bool IsAnchoredAtEnd() { return false; }
205  virtual int min_match() = 0;
206  virtual int max_match() = 0;
207  // Returns the interval of registers used for captures within this
208  // expression.
209  virtual Interval CaptureRegisters() { return Interval::Empty(); }
210  virtual void AppendToText(RegExpText* text, Zone* zone);
211  std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT
212 #define MAKE_ASTYPE(Name) \
213  virtual RegExp##Name* As##Name(); \
214  virtual bool Is##Name();
215  FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
216 #undef MAKE_ASTYPE
217 };
218 
219 
220 class RegExpDisjunction final : public RegExpTree {
221  public:
222  explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
223  void* Accept(RegExpVisitor* visitor, void* data) override;
224  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
225  RegExpDisjunction* AsDisjunction() override;
226  Interval CaptureRegisters() override;
227  bool IsDisjunction() override;
228  bool IsAnchoredAtStart() override;
229  bool IsAnchoredAtEnd() override;
230  int min_match() override { return min_match_; }
231  int max_match() override { return max_match_; }
232  ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
233 
234  private:
235  bool SortConsecutiveAtoms(RegExpCompiler* compiler);
236  void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
237  void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
238  ZoneList<RegExpTree*>* alternatives_;
239  int min_match_;
240  int max_match_;
241 };
242 
243 
244 class RegExpAlternative final : public RegExpTree {
245  public:
246  explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
247  void* Accept(RegExpVisitor* visitor, void* data) override;
248  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
249  RegExpAlternative* AsAlternative() override;
250  Interval CaptureRegisters() override;
251  bool IsAlternative() override;
252  bool IsAnchoredAtStart() override;
253  bool IsAnchoredAtEnd() override;
254  int min_match() override { return min_match_; }
255  int max_match() override { return max_match_; }
256  ZoneList<RegExpTree*>* nodes() { return nodes_; }
257 
258  private:
259  ZoneList<RegExpTree*>* nodes_;
260  int min_match_;
261  int max_match_;
262 };
263 
264 
265 class RegExpAssertion final : public RegExpTree {
266  public:
267  enum AssertionType {
268  START_OF_LINE,
269  START_OF_INPUT,
270  END_OF_LINE,
271  END_OF_INPUT,
272  BOUNDARY,
273  NON_BOUNDARY
274  };
275  RegExpAssertion(AssertionType type, JSRegExp::Flags flags)
276  : assertion_type_(type), flags_(flags) {}
277  void* Accept(RegExpVisitor* visitor, void* data) override;
278  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
279  RegExpAssertion* AsAssertion() override;
280  bool IsAssertion() override;
281  bool IsAnchoredAtStart() override;
282  bool IsAnchoredAtEnd() override;
283  int min_match() override { return 0; }
284  int max_match() override { return 0; }
285  AssertionType assertion_type() { return assertion_type_; }
286 
287  private:
288  const AssertionType assertion_type_;
289  const JSRegExp::Flags flags_;
290 };
291 
292 
293 class RegExpCharacterClass final : public RegExpTree {
294  public:
295  // NEGATED: The character class is negated and should match everything but
296  // the specified ranges.
297  // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
298  // surrogate and should not be unicode-desugared (crbug.com/641091).
299  enum Flag {
300  NEGATED = 1 << 0,
301  CONTAINS_SPLIT_SURROGATE = 1 << 1,
302  };
304 
306  Zone* zone, ZoneList<CharacterRange>* ranges, JSRegExp::Flags flags,
307  CharacterClassFlags character_class_flags = CharacterClassFlags())
308  : set_(ranges),
309  flags_(flags),
310  character_class_flags_(character_class_flags) {
311  // Convert the empty set of ranges to the negated Everything() range.
312  if (ranges->is_empty()) {
313  ranges->Add(CharacterRange::Everything(), zone);
314  character_class_flags_ ^= NEGATED;
315  }
316  }
318  : set_(type),
319  flags_(flags),
320  character_class_flags_(CharacterClassFlags()) {}
321  void* Accept(RegExpVisitor* visitor, void* data) override;
322  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
323  RegExpCharacterClass* AsCharacterClass() override;
324  bool IsCharacterClass() override;
325  bool IsTextElement() override { return true; }
326  int min_match() override { return 1; }
327  // The character class may match two code units for unicode regexps.
328  // TODO(yangguo): we should split this class for usage in TextElement, and
329  // make max_match() dependent on the character class content.
330  int max_match() override { return 2; }
331  void AppendToText(RegExpText* text, Zone* zone) override;
332  CharacterSet character_set() { return set_; }
333  // TODO(lrn): Remove need for complex version if is_standard that
334  // recognizes a mangled standard set and just do { return set_.is_special(); }
335  bool is_standard(Zone* zone);
336  // Returns a value representing the standard character set if is_standard()
337  // returns true.
338  // Currently used values are:
339  // s : unicode whitespace
340  // S : unicode non-whitespace
341  // w : ASCII word character (digit, letter, underscore)
342  // W : non-ASCII word character
343  // d : ASCII digit
344  // D : non-ASCII digit
345  // . : non-newline
346  // * : All characters, for advancing unanchored regexp
347  uc16 standard_type() const { return set_.standard_set_type(); }
348  ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
349  bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
350  JSRegExp::Flags flags() const { return flags_; }
351  bool contains_split_surrogate() const {
352  return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
353  }
354 
355  private:
356  CharacterSet set_;
357  const JSRegExp::Flags flags_;
358  CharacterClassFlags character_class_flags_;
359 };
360 
361 
362 class RegExpAtom final : public RegExpTree {
363  public:
364  explicit RegExpAtom(Vector<const uc16> data, JSRegExp::Flags flags)
365  : data_(data), flags_(flags) {}
366  void* Accept(RegExpVisitor* visitor, void* data) override;
367  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
368  RegExpAtom* AsAtom() override;
369  bool IsAtom() override;
370  bool IsTextElement() override { return true; }
371  int min_match() override { return data_.length(); }
372  int max_match() override { return data_.length(); }
373  void AppendToText(RegExpText* text, Zone* zone) override;
374  Vector<const uc16> data() { return data_; }
375  int length() { return data_.length(); }
376  JSRegExp::Flags flags() const { return flags_; }
377  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
378 
379  private:
380  Vector<const uc16> data_;
381  const JSRegExp::Flags flags_;
382 };
383 
384 
385 class RegExpText final : public RegExpTree {
386  public:
387  explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
388  void* Accept(RegExpVisitor* visitor, void* data) override;
389  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
390  RegExpText* AsText() override;
391  bool IsText() override;
392  bool IsTextElement() override { return true; }
393  int min_match() override { return length_; }
394  int max_match() override { return length_; }
395  void AppendToText(RegExpText* text, Zone* zone) override;
396  void AddElement(TextElement elm, Zone* zone) {
397  elements_.Add(elm, zone);
398  length_ += elm.length();
399  }
400  ZoneList<TextElement>* elements() { return &elements_; }
401 
402  private:
403  ZoneList<TextElement> elements_;
404  int length_;
405 };
406 
407 
408 class RegExpQuantifier final : public RegExpTree {
409  public:
410  enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
411  RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
412  : body_(body),
413  min_(min),
414  max_(max),
415  min_match_(min * body->min_match()),
416  quantifier_type_(type) {
417  if (max > 0 && body->max_match() > kInfinity / max) {
418  max_match_ = kInfinity;
419  } else {
420  max_match_ = max * body->max_match();
421  }
422  }
423  void* Accept(RegExpVisitor* visitor, void* data) override;
424  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
425  static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
426  RegExpCompiler* compiler, RegExpNode* on_success,
427  bool not_at_start = false);
428  RegExpQuantifier* AsQuantifier() override;
429  Interval CaptureRegisters() override;
430  bool IsQuantifier() override;
431  int min_match() override { return min_match_; }
432  int max_match() override { return max_match_; }
433  int min() { return min_; }
434  int max() { return max_; }
435  bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
436  bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
437  bool is_greedy() { return quantifier_type_ == GREEDY; }
438  RegExpTree* body() { return body_; }
439 
440  private:
441  RegExpTree* body_;
442  int min_;
443  int max_;
444  int min_match_;
445  int max_match_;
446  QuantifierType quantifier_type_;
447 };
448 
449 
450 class RegExpCapture final : public RegExpTree {
451  public:
452  explicit RegExpCapture(int index)
453  : body_(nullptr), index_(index), name_(nullptr) {}
454  void* Accept(RegExpVisitor* visitor, void* data) override;
455  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
456  static RegExpNode* ToNode(RegExpTree* body, int index,
457  RegExpCompiler* compiler, RegExpNode* on_success);
458  RegExpCapture* AsCapture() override;
459  bool IsAnchoredAtStart() override;
460  bool IsAnchoredAtEnd() override;
461  Interval CaptureRegisters() override;
462  bool IsCapture() override;
463  int min_match() override { return body_->min_match(); }
464  int max_match() override { return body_->max_match(); }
465  RegExpTree* body() { return body_; }
466  void set_body(RegExpTree* body) { body_ = body; }
467  int index() { return index_; }
468  const ZoneVector<uc16>* name() const { return name_; }
469  void set_name(const ZoneVector<uc16>* name) { name_ = name; }
470  static int StartRegister(int index) { return index * 2; }
471  static int EndRegister(int index) { return index * 2 + 1; }
472 
473  private:
474  RegExpTree* body_;
475  int index_;
476  const ZoneVector<uc16>* name_;
477 };
478 
479 class RegExpGroup final : public RegExpTree {
480  public:
481  explicit RegExpGroup(RegExpTree* body) : body_(body) {}
482  void* Accept(RegExpVisitor* visitor, void* data) override;
483  RegExpNode* ToNode(RegExpCompiler* compiler,
484  RegExpNode* on_success) override {
485  return body_->ToNode(compiler, on_success);
486  }
487  RegExpGroup* AsGroup() override;
488  bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
489  bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
490  bool IsGroup() override;
491  int min_match() override { return body_->min_match(); }
492  int max_match() override { return body_->max_match(); }
493  Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
494  RegExpTree* body() { return body_; }
495 
496  private:
497  RegExpTree* body_;
498 };
499 
500 class RegExpLookaround final : public RegExpTree {
501  public:
502  enum Type { LOOKAHEAD, LOOKBEHIND };
503 
504  RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
505  int capture_from, Type type)
506  : body_(body),
507  is_positive_(is_positive),
508  capture_count_(capture_count),
509  capture_from_(capture_from),
510  type_(type) {}
511 
512  void* Accept(RegExpVisitor* visitor, void* data) override;
513  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
514  RegExpLookaround* AsLookaround() override;
515  Interval CaptureRegisters() override;
516  bool IsLookaround() override;
517  bool IsAnchoredAtStart() override;
518  int min_match() override { return 0; }
519  int max_match() override { return 0; }
520  RegExpTree* body() { return body_; }
521  bool is_positive() { return is_positive_; }
522  int capture_count() { return capture_count_; }
523  int capture_from() { return capture_from_; }
524  Type type() { return type_; }
525 
526  class Builder {
527  public:
528  Builder(bool is_positive, RegExpNode* on_success,
529  int stack_pointer_register, int position_register,
530  int capture_register_count = 0, int capture_register_start = 0);
531  RegExpNode* on_match_success() { return on_match_success_; }
532  RegExpNode* ForMatch(RegExpNode* match);
533 
534  private:
535  bool is_positive_;
536  RegExpNode* on_match_success_;
537  RegExpNode* on_success_;
538  int stack_pointer_register_;
539  int position_register_;
540  };
541 
542  private:
543  RegExpTree* body_;
544  bool is_positive_;
545  int capture_count_;
546  int capture_from_;
547  Type type_;
548 };
549 
550 
551 class RegExpBackReference final : public RegExpTree {
552  public:
553  explicit RegExpBackReference(JSRegExp::Flags flags)
554  : capture_(nullptr), name_(nullptr), flags_(flags) {}
556  : capture_(capture), name_(nullptr), flags_(flags) {}
557  void* Accept(RegExpVisitor* visitor, void* data) override;
558  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
559  RegExpBackReference* AsBackReference() override;
560  bool IsBackReference() override;
561  int min_match() override { return 0; }
562  // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
563  // recursion, we give up. Ignorance is bliss.
564  int max_match() override { return kInfinity; }
565  int index() { return capture_->index(); }
566  RegExpCapture* capture() { return capture_; }
567  void set_capture(RegExpCapture* capture) { capture_ = capture; }
568  const ZoneVector<uc16>* name() const { return name_; }
569  void set_name(const ZoneVector<uc16>* name) { name_ = name; }
570 
571  private:
572  RegExpCapture* capture_;
573  const ZoneVector<uc16>* name_;
574  const JSRegExp::Flags flags_;
575 };
576 
577 
578 class RegExpEmpty final : public RegExpTree {
579  public:
580  RegExpEmpty() = default;
581  void* Accept(RegExpVisitor* visitor, void* data) override;
582  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
583  RegExpEmpty* AsEmpty() override;
584  bool IsEmpty() override;
585  int min_match() override { return 0; }
586  int max_match() override { return 0; }
587 };
588 
589 } // namespace internal
590 } // namespace v8
591 
592 #endif // V8_REGEXP_REGEXP_AST_H_
Definition: libplatform.h:13