V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
jsregexp.h
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_JSREGEXP_H_
6 #define V8_REGEXP_JSREGEXP_H_
7 
8 #include "src/allocation.h"
9 #include "src/isolate.h"
10 #include "src/objects/js-regexp.h"
11 #include "src/regexp/regexp-ast.h"
12 #include "src/regexp/regexp-macro-assembler.h"
13 
14 namespace v8 {
15 namespace internal {
16 
17 class NodeVisitor;
18 class RegExpCompiler;
19 class RegExpMacroAssembler;
20 class RegExpNode;
21 class RegExpTree;
22 class BoyerMooreLookahead;
23 
24 inline bool IgnoreCase(JSRegExp::Flags flags) {
25  return (flags & JSRegExp::kIgnoreCase) != 0;
26 }
27 
28 inline bool IsUnicode(JSRegExp::Flags flags) {
29  return (flags & JSRegExp::kUnicode) != 0;
30 }
31 
32 inline bool IsSticky(JSRegExp::Flags flags) {
33  return (flags & JSRegExp::kSticky) != 0;
34 }
35 
36 inline bool IsGlobal(JSRegExp::Flags flags) {
37  return (flags & JSRegExp::kGlobal) != 0;
38 }
39 
40 inline bool DotAll(JSRegExp::Flags flags) {
41  return (flags & JSRegExp::kDotAll) != 0;
42 }
43 
44 inline bool Multiline(JSRegExp::Flags flags) {
45  return (flags & JSRegExp::kMultiline) != 0;
46 }
47 
48 inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
49  // Both unicode and ignore_case flags are set. We need to use ICU to find
50  // the closure over case equivalents.
51  return IsUnicode(flags) && IgnoreCase(flags);
52 }
53 
54 class RegExpImpl {
55  public:
56  // Whether V8 is compiled with native regexp support or not.
57  static bool UsesNativeRegExp() {
58 #ifdef V8_INTERPRETED_REGEXP
59  return false;
60 #else
61  return true;
62 #endif
63  }
64 
65  // Returns a string representation of a regular expression.
66  // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
67  // This function calls the garbage collector if necessary.
68  static Handle<String> ToString(Handle<Object> value);
69 
70  // Parses the RegExp pattern and prepares the JSRegExp object with
71  // generic data and choice of implementation - as well as what
72  // the implementation wants to store in the data field.
73  // Returns false if compilation fails.
74  V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
75  Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
76  JSRegExp::Flags flags);
77 
78  // See ECMA-262 section 15.10.6.2.
79  // This function calls the garbage collector if necessary.
80  V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
81  Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
82  int index, Handle<RegExpMatchInfo> last_match_info);
83 
84  // Prepares a JSRegExp object with Irregexp-specific data.
85  static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
86  Handle<String> pattern, JSRegExp::Flags flags,
87  int capture_register_count);
88 
89  static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
90  Handle<String> pattern, JSRegExp::Flags flags,
91  Handle<String> match_pattern);
92 
93  static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
94  Handle<String> subject, int index, int32_t* output,
95  int output_size);
96 
97  static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
98  Handle<String> subject, int index,
99  Handle<RegExpMatchInfo> last_match_info);
100 
101  enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
102 
103  // Prepare a RegExp for being executed one or more times (using
104  // IrregexpExecOnce) on the subject.
105  // This ensures that the regexp is compiled for the subject, and that
106  // the subject is flat.
107  // Returns the number of integer spaces required by IrregexpExecOnce
108  // as its "registers" argument. If the regexp cannot be compiled,
109  // an exception is set as pending, and this function returns negative.
110  static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
111  Handle<String> subject);
112 
113  // Execute a regular expression on the subject, starting from index.
114  // If matching succeeds, return the number of matches. This can be larger
115  // than one in the case of global regular expressions.
116  // The captures and subcaptures are stored into the registers vector.
117  // If matching fails, returns RE_FAILURE.
118  // If execution fails, sets a pending exception and returns RE_EXCEPTION.
119  static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
120  Handle<String> subject, int index, int32_t* output,
121  int output_size);
122 
123  // Execute an Irregexp bytecode pattern.
124  // On a successful match, the result is a JSArray containing
125  // captured positions. On a failure, the result is the null value.
126  // Returns an empty handle in case of an exception.
127  V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
128  Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
129  int index, Handle<RegExpMatchInfo> last_match_info);
130 
131  // Set last match info. If match is nullptr, then setting captures is
132  // omitted.
133  static Handle<RegExpMatchInfo> SetLastMatchInfo(
134  Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
135  Handle<String> subject, int capture_count, int32_t* match);
136 
137  class GlobalCache {
138  public:
140  Handle<String> subject,
141  Isolate* isolate);
142 
143  V8_INLINE ~GlobalCache();
144 
145  // Fetch the next entry in the cache for global regexp match results.
146  // This does not set the last match info. Upon failure, nullptr is
147  // returned. The cause can be checked with Result(). The previous result is
148  // still in available in memory when a failure happens.
149  V8_INLINE int32_t* FetchNext();
150 
151  V8_INLINE int32_t* LastSuccessfulMatch();
152 
153  V8_INLINE bool HasException() { return num_matches_ < 0; }
154 
155  private:
156  int AdvanceZeroLength(int last_index);
157 
158  int num_matches_;
159  int max_matches_;
160  int current_match_index_;
161  int registers_per_match_;
162  // Pointer to the last set of captures.
163  int32_t* register_array_;
164  int register_array_size_;
165  Handle<JSRegExp> regexp_;
166  Handle<String> subject_;
167  Isolate* isolate_;
168  };
169 
170  // For acting on the JSRegExp data FixedArray.
171  static int IrregexpMaxRegisterCount(FixedArray re);
172  static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
173  static void SetIrregexpCaptureNameMap(FixedArray re,
174  Handle<FixedArray> value);
175  static int IrregexpNumberOfCaptures(FixedArray re);
176  static int IrregexpNumberOfRegisters(FixedArray re);
177  static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
178  static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
179 
180  // Limit the space regexps take up on the heap. In order to limit this we
181  // would like to keep track of the amount of regexp code on the heap. This
182  // is not tracked, however. As a conservative approximation we track the
183  // total regexp code compiled including code that has subsequently been freed
184  // and the total executable memory at any point.
185  static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
186  static const size_t kRegExpCompiledLimit = 1 * MB;
187  static const int kRegExpTooLargeToOptimize = 20 * KB;
188 
189  private:
190  static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
191  Handle<String> sample_subject, bool is_one_byte);
192  static inline bool EnsureCompiledIrregexp(Isolate* isolate,
193  Handle<JSRegExp> re,
194  Handle<String> sample_subject,
195  bool is_one_byte);
196 };
197 
198 
199 // Represents the location of one element relative to the intersection of
200 // two sets. Corresponds to the four areas of a Venn diagram.
201 enum ElementInSetsRelation {
202  kInsideNone = 0,
203  kInsideFirst = 1,
204  kInsideSecond = 2,
205  kInsideBoth = 3
206 };
207 
208 
209 // A set of unsigned integers that behaves especially well on small
210 // integers (< 32). May do zone-allocation.
211 class OutSet: public ZoneObject {
212  public:
213  OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
214  OutSet* Extend(unsigned value, Zone* zone);
215  bool Get(unsigned value) const;
216  static const unsigned kFirstLimit = 32;
217 
218  private:
219  // Destructively set a value in this set. In most cases you want
220  // to use Extend instead to ensure that only one instance exists
221  // that contains the same values.
222  void Set(unsigned value, Zone* zone);
223 
224  // The successors are a list of sets that contain the same values
225  // as this set and the one more value that is not present in this
226  // set.
227  ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
228 
229  OutSet(uint32_t first, ZoneList<unsigned>* remaining)
230  : first_(first), remaining_(remaining), successors_(nullptr) {}
231  uint32_t first_;
232  ZoneList<unsigned>* remaining_;
233  ZoneList<OutSet*>* successors_;
234  friend class Trace;
235 };
236 
237 
238 // A mapping from integers, specified as ranges, to a set of integers.
239 // Used for mapping character ranges to choices.
240 class DispatchTable : public ZoneObject {
241  public:
242  explicit DispatchTable(Zone* zone) : tree_(zone) { }
243 
244  class Entry {
245  public:
246  Entry() : from_(0), to_(0), out_set_(nullptr) {}
247  Entry(uc32 from, uc32 to, OutSet* out_set)
248  : from_(from), to_(to), out_set_(out_set) {
249  DCHECK(from <= to);
250  }
251  uc32 from() { return from_; }
252  uc32 to() { return to_; }
253  void set_to(uc32 value) { to_ = value; }
254  void AddValue(int value, Zone* zone) {
255  out_set_ = out_set_->Extend(value, zone);
256  }
257  OutSet* out_set() { return out_set_; }
258  private:
259  uc32 from_;
260  uc32 to_;
261  OutSet* out_set_;
262  };
263 
264  class Config {
265  public:
266  typedef uc32 Key;
267  typedef Entry Value;
268  static const uc32 kNoKey;
269  static const Entry NoValue() { return Value(); }
270  static inline int Compare(uc32 a, uc32 b) {
271  if (a == b)
272  return 0;
273  else if (a < b)
274  return -1;
275  else
276  return 1;
277  }
278  };
279 
280  void AddRange(CharacterRange range, int value, Zone* zone);
281  OutSet* Get(uc32 value);
282  void Dump();
283 
284  template <typename Callback>
285  void ForEach(Callback* callback) {
286  return tree()->ForEach(callback);
287  }
288 
289  private:
290  // There can't be a static empty set since it allocates its
291  // successors in a zone and caches them.
292  OutSet* empty() { return &empty_; }
293  OutSet empty_;
294  ZoneSplayTree<Config>* tree() { return &tree_; }
295  ZoneSplayTree<Config> tree_;
296 };
297 
298 
299 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
301  public:
303  void Call(uc32 from, DispatchTable::Entry entry);
304 
305  ZoneList<CharacterRange>* bmp() { return bmp_; }
306  ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
307  ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
308  ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
309 
310  private:
311  static const int kBase = 0;
312  // Separate ranges into
313  static const int kBmpCodePoints = 1;
314  static const int kLeadSurrogates = 2;
315  static const int kTrailSurrogates = 3;
316  static const int kNonBmpCodePoints = 4;
317 
318  Zone* zone_;
319  DispatchTable table_;
321  ZoneList<CharacterRange>* lead_surrogates_;
322  ZoneList<CharacterRange>* trail_surrogates_;
323  ZoneList<CharacterRange>* non_bmp_;
324 };
325 
326 
327 #define FOR_EACH_NODE_TYPE(VISIT) \
328  VISIT(End) \
329  VISIT(Action) \
330  VISIT(Choice) \
331  VISIT(BackReference) \
332  VISIT(Assertion) \
333  VISIT(Text)
334 
335 
336 class Trace;
337 struct PreloadState;
338 class GreedyLoopState;
340 
341 struct NodeInfo {
342  NodeInfo()
343  : being_analyzed(false),
344  been_analyzed(false),
345  follows_word_interest(false),
346  follows_newline_interest(false),
347  follows_start_interest(false),
348  at_end(false),
349  visited(false),
350  replacement_calculated(false) { }
351 
352  // Returns true if the interests and assumptions of this node
353  // matches the given one.
354  bool Matches(NodeInfo* that) {
355  return (at_end == that->at_end) &&
356  (follows_word_interest == that->follows_word_interest) &&
357  (follows_newline_interest == that->follows_newline_interest) &&
358  (follows_start_interest == that->follows_start_interest);
359  }
360 
361  // Updates the interests of this node given the interests of the
362  // node preceding it.
363  void AddFromPreceding(NodeInfo* that) {
364  at_end |= that->at_end;
365  follows_word_interest |= that->follows_word_interest;
366  follows_newline_interest |= that->follows_newline_interest;
367  follows_start_interest |= that->follows_start_interest;
368  }
369 
370  bool HasLookbehind() {
371  return follows_word_interest ||
372  follows_newline_interest ||
373  follows_start_interest;
374  }
375 
376  // Sets the interests of this node to include the interests of the
377  // following node.
378  void AddFromFollowing(NodeInfo* that) {
379  follows_word_interest |= that->follows_word_interest;
380  follows_newline_interest |= that->follows_newline_interest;
381  follows_start_interest |= that->follows_start_interest;
382  }
383 
384  void ResetCompilationState() {
385  being_analyzed = false;
386  been_analyzed = false;
387  }
388 
389  bool being_analyzed: 1;
390  bool been_analyzed: 1;
391 
392  // These bits are set of this node has to know what the preceding
393  // character was.
394  bool follows_word_interest: 1;
395  bool follows_newline_interest: 1;
396  bool follows_start_interest: 1;
397 
398  bool at_end: 1;
399  bool visited: 1;
400  bool replacement_calculated: 1;
401 };
402 
403 
404 // Details of a quick mask-compare check that can look ahead in the
405 // input stream.
407  public:
409  : characters_(0),
410  mask_(0),
411  value_(0),
412  cannot_match_(false) { }
413  explicit QuickCheckDetails(int characters)
414  : characters_(characters),
415  mask_(0),
416  value_(0),
417  cannot_match_(false) { }
418  bool Rationalize(bool one_byte);
419  // Merge in the information from another branch of an alternation.
420  void Merge(QuickCheckDetails* other, int from_index);
421  // Advance the current position by some amount.
422  void Advance(int by, bool one_byte);
423  void Clear();
424  bool cannot_match() { return cannot_match_; }
425  void set_cannot_match() { cannot_match_ = true; }
426  struct Position {
427  Position() : mask(0), value(0), determines_perfectly(false) { }
428  uc16 mask;
429  uc16 value;
430  bool determines_perfectly;
431  };
432  int characters() { return characters_; }
433  void set_characters(int characters) { characters_ = characters; }
434  Position* positions(int index) {
435  DCHECK_LE(0, index);
436  DCHECK_GT(characters_, index);
437  return positions_ + index;
438  }
439  uint32_t mask() { return mask_; }
440  uint32_t value() { return value_; }
441 
442  private:
443  // How many characters do we have quick check information from. This is
444  // the same for all branches of a choice node.
445  int characters_;
446  Position positions_[4];
447  // These values are the condensate of the above array after Rationalize().
448  uint32_t mask_;
449  uint32_t value_;
450  // If set to true, there is no way this quick check can match at all.
451  // E.g., if it requires to be at the start of the input, and isn't.
452  bool cannot_match_;
453 };
454 
455 
456 extern int kUninitializedRegExpNodePlaceHolder;
457 
458 
459 class RegExpNode: public ZoneObject {
460  public:
461  explicit RegExpNode(Zone* zone)
462  : replacement_(nullptr),
463  on_work_list_(false),
464  trace_count_(0),
465  zone_(zone) {
466  bm_info_[0] = bm_info_[1] = nullptr;
467  }
468  virtual ~RegExpNode();
469  virtual void Accept(NodeVisitor* visitor) = 0;
470  // Generates a goto to this node or actually generates the code at this point.
471  virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
472  // How many characters must this node consume at a minimum in order to
473  // succeed. If we have found at least 'still_to_find' characters that
474  // must be consumed there is no need to ask any following nodes whether
475  // they are sure to eat any more characters. The not_at_start argument is
476  // used to indicate that we know we are not at the start of the input. In
477  // this case anchored branches will always fail and can be ignored when
478  // determining how many characters are consumed on success.
479  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
480  // Emits some quick code that checks whether the preloaded characters match.
481  // Falls through on certain failure, jumps to the label on possible success.
482  // If the node cannot make a quick check it does nothing and returns false.
483  bool EmitQuickCheck(RegExpCompiler* compiler,
484  Trace* bounds_check_trace,
485  Trace* trace,
486  bool preload_has_checked_bounds,
487  Label* on_possible_success,
488  QuickCheckDetails* details_return,
489  bool fall_through_on_failure);
490  // For a given number of characters this returns a mask and a value. The
491  // next n characters are anded with the mask and compared with the value.
492  // A comparison failure indicates the node cannot match the next n characters.
493  // A comparison success indicates the node may match.
494  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
495  RegExpCompiler* compiler,
496  int characters_filled_in,
497  bool not_at_start) = 0;
498  static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
499  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
500  // Only returns the successor for a text node of length 1 that matches any
501  // character and that has no guards on it.
502  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
503  RegExpCompiler* compiler) {
504  return nullptr;
505  }
506 
507  // Collects information on the possible code units (mod 128) that can match if
508  // we look forward. This is used for a Boyer-Moore-like string searching
509  // implementation. TODO(erikcorry): This should share more code with
510  // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
511  // the number of nodes we are willing to look at in order to create this data.
512  static const int kRecursionBudget = 200;
513  bool KeepRecursing(RegExpCompiler* compiler);
514  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
515  BoyerMooreLookahead* bm, bool not_at_start) {
516  UNREACHABLE();
517  }
518 
519  // If we know that the input is one-byte then there are some nodes that can
520  // never match. This method returns a node that can be substituted for
521  // itself, or nullptr if the node can never match.
522  virtual RegExpNode* FilterOneByte(int depth) { return this; }
523  // Helper for FilterOneByte.
524  RegExpNode* replacement() {
525  DCHECK(info()->replacement_calculated);
526  return replacement_;
527  }
528  RegExpNode* set_replacement(RegExpNode* replacement) {
529  info()->replacement_calculated = true;
530  replacement_ = replacement;
531  return replacement; // For convenience.
532  }
533 
534  // We want to avoid recalculating the lookahead info, so we store it on the
535  // node. Only info that is for this node is stored. We can tell that the
536  // info is for this node when offset == 0, so the information is calculated
537  // relative to this node.
538  void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
539  if (offset == 0) set_bm_info(not_at_start, bm);
540  }
541 
542  Label* label() { return &label_; }
543  // If non-generic code is generated for a node (i.e. the node is not at the
544  // start of the trace) then it cannot be reused. This variable sets a limit
545  // on how often we allow that to happen before we insist on starting a new
546  // trace and generating generic code for a node that can be reused by flushing
547  // the deferred actions in the current trace and generating a goto.
548  static const int kMaxCopiesCodeGenerated = 10;
549 
550  bool on_work_list() { return on_work_list_; }
551  void set_on_work_list(bool value) { on_work_list_ = value; }
552 
553  NodeInfo* info() { return &info_; }
554 
555  BoyerMooreLookahead* bm_info(bool not_at_start) {
556  return bm_info_[not_at_start ? 1 : 0];
557  }
558 
559  Zone* zone() const { return zone_; }
560 
561  protected:
562  enum LimitResult { DONE, CONTINUE };
563  RegExpNode* replacement_;
564 
565  LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
566 
567  void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
568  bm_info_[not_at_start ? 1 : 0] = bm;
569  }
570 
571  private:
572  static const int kFirstCharBudget = 10;
573  Label label_;
574  bool on_work_list_;
575  NodeInfo info_;
576  // This variable keeps track of how many times code has been generated for
577  // this node (in different traces). We don't keep track of where the
578  // generated code is located unless the code is generated at the start of
579  // a trace, in which case it is generic and can be reused by flushing the
580  // deferred operations in the current trace and generating a goto.
581  int trace_count_;
582  BoyerMooreLookahead* bm_info_[2];
583 
584  Zone* zone_;
585 };
586 
587 
588 class SeqRegExpNode: public RegExpNode {
589  public:
590  explicit SeqRegExpNode(RegExpNode* on_success)
591  : RegExpNode(on_success->zone()), on_success_(on_success) { }
592  RegExpNode* on_success() { return on_success_; }
593  void set_on_success(RegExpNode* node) { on_success_ = node; }
594  RegExpNode* FilterOneByte(int depth) override;
595  void FillInBMInfo(Isolate* isolate, int offset, int budget,
596  BoyerMooreLookahead* bm, bool not_at_start) override {
597  on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
598  if (offset == 0) set_bm_info(not_at_start, bm);
599  }
600 
601  protected:
602  RegExpNode* FilterSuccessor(int depth);
603 
604  private:
605  RegExpNode* on_success_;
606 };
607 
608 
609 class ActionNode: public SeqRegExpNode {
610  public:
611  enum ActionType {
612  SET_REGISTER,
613  INCREMENT_REGISTER,
614  STORE_POSITION,
615  BEGIN_SUBMATCH,
616  POSITIVE_SUBMATCH_SUCCESS,
617  EMPTY_MATCH_CHECK,
618  CLEAR_CAPTURES
619  };
620  static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
621  static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
622  static ActionNode* StorePosition(int reg,
623  bool is_capture,
624  RegExpNode* on_success);
625  static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
626  static ActionNode* BeginSubmatch(int stack_pointer_reg,
627  int position_reg,
628  RegExpNode* on_success);
629  static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
630  int restore_reg,
631  int clear_capture_count,
632  int clear_capture_from,
633  RegExpNode* on_success);
634  static ActionNode* EmptyMatchCheck(int start_register,
635  int repetition_register,
636  int repetition_limit,
637  RegExpNode* on_success);
638  void Accept(NodeVisitor* visitor) override;
639  void Emit(RegExpCompiler* compiler, Trace* trace) override;
640  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
641  void GetQuickCheckDetails(QuickCheckDetails* details,
642  RegExpCompiler* compiler, int filled_in,
643  bool not_at_start) override {
644  return on_success()->GetQuickCheckDetails(
645  details, compiler, filled_in, not_at_start);
646  }
647  void FillInBMInfo(Isolate* isolate, int offset, int budget,
648  BoyerMooreLookahead* bm, bool not_at_start) override;
649  ActionType action_type() { return action_type_; }
650  // TODO(erikcorry): We should allow some action nodes in greedy loops.
651  int GreedyLoopTextLength() override {
652  return kNodeIsTooComplexForGreedyLoops;
653  }
654 
655  private:
656  union {
657  struct {
658  int reg;
659  int value;
660  } u_store_register;
661  struct {
662  int reg;
663  } u_increment_register;
664  struct {
665  int reg;
666  bool is_capture;
667  } u_position_register;
668  struct {
669  int stack_pointer_register;
670  int current_position_register;
671  int clear_register_count;
672  int clear_register_from;
673  } u_submatch;
674  struct {
675  int start_register;
676  int repetition_register;
677  int repetition_limit;
678  } u_empty_match_check;
679  struct {
680  int range_from;
681  int range_to;
682  } u_clear_captures;
683  } data_;
684  ActionNode(ActionType action_type, RegExpNode* on_success)
685  : SeqRegExpNode(on_success),
686  action_type_(action_type) { }
687  ActionType action_type_;
688  friend class DotPrinter;
689 };
690 
691 
692 class TextNode: public SeqRegExpNode {
693  public:
694  TextNode(ZoneList<TextElement>* elms, bool read_backward,
695  RegExpNode* on_success)
696  : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
697  TextNode(RegExpCharacterClass* that, bool read_backward,
698  RegExpNode* on_success)
699  : SeqRegExpNode(on_success),
700  elms_(new (zone()) ZoneList<TextElement>(1, zone())),
701  read_backward_(read_backward) {
702  elms_->Add(TextElement::CharClass(that), zone());
703  }
704  // Create TextNode for a single character class for the given ranges.
705  static TextNode* CreateForCharacterRanges(Zone* zone,
706  ZoneList<CharacterRange>* ranges,
707  bool read_backward,
708  RegExpNode* on_success,
709  JSRegExp::Flags flags);
710  // Create TextNode for a surrogate pair with a range given for the
711  // lead and the trail surrogate each.
712  static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
713  CharacterRange trail,
714  bool read_backward,
715  RegExpNode* on_success,
716  JSRegExp::Flags flags);
717  void Accept(NodeVisitor* visitor) override;
718  void Emit(RegExpCompiler* compiler, Trace* trace) override;
719  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
720  void GetQuickCheckDetails(QuickCheckDetails* details,
721  RegExpCompiler* compiler, int characters_filled_in,
722  bool not_at_start) override;
723  ZoneList<TextElement>* elements() { return elms_; }
724  bool read_backward() { return read_backward_; }
725  void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
726  int GreedyLoopTextLength() override;
727  RegExpNode* GetSuccessorOfOmnivorousTextNode(
728  RegExpCompiler* compiler) override;
729  void FillInBMInfo(Isolate* isolate, int offset, int budget,
730  BoyerMooreLookahead* bm, bool not_at_start) override;
731  void CalculateOffsets();
732  RegExpNode* FilterOneByte(int depth) override;
733 
734  private:
735  enum TextEmitPassType {
736  NON_LATIN1_MATCH, // Check for characters that can't match.
737  SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
738  NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
739  CASE_CHARACTER_MATCH, // Case-independent single character check.
740  CHARACTER_CLASS_MATCH // Character class.
741  };
742  static bool SkipPass(TextEmitPassType pass, bool ignore_case);
743  static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
744  static const int kLastPass = CHARACTER_CLASS_MATCH;
745  void TextEmitPass(RegExpCompiler* compiler,
746  TextEmitPassType pass,
747  bool preloaded,
748  Trace* trace,
749  bool first_element_checked,
750  int* checked_up_to);
751  int Length();
752  ZoneList<TextElement>* elms_;
753  bool read_backward_;
754 };
755 
756 
758  public:
759  enum AssertionType {
760  AT_END,
761  AT_START,
762  AT_BOUNDARY,
763  AT_NON_BOUNDARY,
764  AFTER_NEWLINE
765  };
766  static AssertionNode* AtEnd(RegExpNode* on_success) {
767  return new(on_success->zone()) AssertionNode(AT_END, on_success);
768  }
769  static AssertionNode* AtStart(RegExpNode* on_success) {
770  return new(on_success->zone()) AssertionNode(AT_START, on_success);
771  }
772  static AssertionNode* AtBoundary(RegExpNode* on_success) {
773  return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
774  }
775  static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
776  return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
777  }
778  static AssertionNode* AfterNewline(RegExpNode* on_success) {
779  return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
780  }
781  void Accept(NodeVisitor* visitor) override;
782  void Emit(RegExpCompiler* compiler, Trace* trace) override;
783  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
784  void GetQuickCheckDetails(QuickCheckDetails* details,
785  RegExpCompiler* compiler, int filled_in,
786  bool not_at_start) override;
787  void FillInBMInfo(Isolate* isolate, int offset, int budget,
788  BoyerMooreLookahead* bm, bool not_at_start) override;
789  AssertionType assertion_type() { return assertion_type_; }
790 
791  private:
792  void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
793  enum IfPrevious { kIsNonWord, kIsWord };
794  void BacktrackIfPrevious(RegExpCompiler* compiler,
795  Trace* trace,
796  IfPrevious backtrack_if_previous);
797  AssertionNode(AssertionType t, RegExpNode* on_success)
798  : SeqRegExpNode(on_success), assertion_type_(t) { }
799  AssertionType assertion_type_;
800 };
801 
802 
804  public:
805  BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
806  bool read_backward, RegExpNode* on_success)
807  : SeqRegExpNode(on_success),
808  start_reg_(start_reg),
809  end_reg_(end_reg),
810  flags_(flags),
811  read_backward_(read_backward) {}
812  void Accept(NodeVisitor* visitor) override;
813  int start_register() { return start_reg_; }
814  int end_register() { return end_reg_; }
815  bool read_backward() { return read_backward_; }
816  void Emit(RegExpCompiler* compiler, Trace* trace) override;
817  int EatsAtLeast(int still_to_find, int recursion_depth,
818  bool not_at_start) override;
819  void GetQuickCheckDetails(QuickCheckDetails* details,
820  RegExpCompiler* compiler, int characters_filled_in,
821  bool not_at_start) override {
822  return;
823  }
824  void FillInBMInfo(Isolate* isolate, int offset, int budget,
825  BoyerMooreLookahead* bm, bool not_at_start) override;
826 
827  private:
828  int start_reg_;
829  int end_reg_;
830  JSRegExp::Flags flags_;
831  bool read_backward_;
832 };
833 
834 
835 class EndNode: public RegExpNode {
836  public:
837  enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
838  EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
839  void Accept(NodeVisitor* visitor) override;
840  void Emit(RegExpCompiler* compiler, Trace* trace) override;
841  int EatsAtLeast(int still_to_find, int recursion_depth,
842  bool not_at_start) override {
843  return 0;
844  }
845  void GetQuickCheckDetails(QuickCheckDetails* details,
846  RegExpCompiler* compiler, int characters_filled_in,
847  bool not_at_start) override {
848  // Returning 0 from EatsAtLeast should ensure we never get here.
849  UNREACHABLE();
850  }
851  void FillInBMInfo(Isolate* isolate, int offset, int budget,
852  BoyerMooreLookahead* bm, bool not_at_start) override {
853  // Returning 0 from EatsAtLeast should ensure we never get here.
854  UNREACHABLE();
855  }
856 
857  private:
858  Action action_;
859 };
860 
861 
863  public:
864  NegativeSubmatchSuccess(int stack_pointer_reg,
865  int position_reg,
866  int clear_capture_count,
867  int clear_capture_start,
868  Zone* zone)
869  : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
870  stack_pointer_register_(stack_pointer_reg),
871  current_position_register_(position_reg),
872  clear_capture_count_(clear_capture_count),
873  clear_capture_start_(clear_capture_start) { }
874  void Emit(RegExpCompiler* compiler, Trace* trace) override;
875 
876  private:
877  int stack_pointer_register_;
878  int current_position_register_;
879  int clear_capture_count_;
880  int clear_capture_start_;
881 };
882 
883 
884 class Guard: public ZoneObject {
885  public:
886  enum Relation { LT, GEQ };
887  Guard(int reg, Relation op, int value)
888  : reg_(reg),
889  op_(op),
890  value_(value) { }
891  int reg() { return reg_; }
892  Relation op() { return op_; }
893  int value() { return value_; }
894 
895  private:
896  int reg_;
897  Relation op_;
898  int value_;
899 };
900 
901 
903  public:
904  explicit GuardedAlternative(RegExpNode* node)
905  : node_(node), guards_(nullptr) {}
906  void AddGuard(Guard* guard, Zone* zone);
907  RegExpNode* node() { return node_; }
908  void set_node(RegExpNode* node) { node_ = node; }
909  ZoneList<Guard*>* guards() { return guards_; }
910 
911  private:
912  RegExpNode* node_;
913  ZoneList<Guard*>* guards_;
914 };
915 
916 
918 
919 
920 class ChoiceNode: public RegExpNode {
921  public:
922  explicit ChoiceNode(int expected_size, Zone* zone)
923  : RegExpNode(zone),
924  alternatives_(new (zone)
925  ZoneList<GuardedAlternative>(expected_size, zone)),
926  table_(nullptr),
927  not_at_start_(false),
928  being_calculated_(false) {}
929  void Accept(NodeVisitor* visitor) override;
930  void AddAlternative(GuardedAlternative node) {
931  alternatives()->Add(node, zone());
932  }
933  ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
934  DispatchTable* GetTable(bool ignore_case);
935  void Emit(RegExpCompiler* compiler, Trace* trace) override;
936  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
937  int EatsAtLeastHelper(int still_to_find,
938  int budget,
939  RegExpNode* ignore_this_node,
940  bool not_at_start);
941  void GetQuickCheckDetails(QuickCheckDetails* details,
942  RegExpCompiler* compiler, int characters_filled_in,
943  bool not_at_start) override;
944  void FillInBMInfo(Isolate* isolate, int offset, int budget,
945  BoyerMooreLookahead* bm, bool not_at_start) override;
946 
947  bool being_calculated() { return being_calculated_; }
948  bool not_at_start() { return not_at_start_; }
949  void set_not_at_start() { not_at_start_ = true; }
950  void set_being_calculated(bool b) { being_calculated_ = b; }
951  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
952  return true;
953  }
954  RegExpNode* FilterOneByte(int depth) override;
955  virtual bool read_backward() { return false; }
956 
957  protected:
958  int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
959  ZoneList<GuardedAlternative>* alternatives_;
960 
961  private:
962  friend class DispatchTableConstructor;
963  friend class Analysis;
964  void GenerateGuard(RegExpMacroAssembler* macro_assembler,
965  Guard* guard,
966  Trace* trace);
967  int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
968  void EmitOutOfLineContinuation(RegExpCompiler* compiler,
969  Trace* trace,
970  GuardedAlternative alternative,
971  AlternativeGeneration* alt_gen,
972  int preload_characters,
973  bool next_expects_preload);
974  void SetUpPreLoad(RegExpCompiler* compiler,
975  Trace* current_trace,
976  PreloadState* preloads);
977  void AssertGuardsMentionRegisters(Trace* trace);
978  int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
979  Trace* EmitGreedyLoop(RegExpCompiler* compiler,
980  Trace* trace,
981  AlternativeGenerationList* alt_gens,
982  PreloadState* preloads,
983  GreedyLoopState* greedy_loop_state,
984  int text_length);
985  void EmitChoices(RegExpCompiler* compiler,
986  AlternativeGenerationList* alt_gens,
987  int first_choice,
988  Trace* trace,
989  PreloadState* preloads);
990  DispatchTable* table_;
991  // If true, this node is never checked at the start of the input.
992  // Allows a new trace to start with at_start() set to false.
993  bool not_at_start_;
994  bool being_calculated_;
995 };
996 
997 
999  public:
1000  explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
1001  GuardedAlternative then_do_this,
1002  Zone* zone)
1003  : ChoiceNode(2, zone) {
1004  AddAlternative(this_must_fail);
1005  AddAlternative(then_do_this);
1006  }
1007  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
1008  void GetQuickCheckDetails(QuickCheckDetails* details,
1009  RegExpCompiler* compiler, int characters_filled_in,
1010  bool not_at_start) override;
1011  void FillInBMInfo(Isolate* isolate, int offset, int budget,
1012  BoyerMooreLookahead* bm, bool not_at_start) override {
1013  alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
1014  not_at_start);
1015  if (offset == 0) set_bm_info(not_at_start, bm);
1016  }
1017  // For a negative lookahead we don't emit the quick check for the
1018  // alternative that is expected to fail. This is because quick check code
1019  // starts by loading enough characters for the alternative that takes fewest
1020  // characters, but on a negative lookahead the negative branch did not take
1021  // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1022  bool try_to_emit_quick_check_for_alternative(bool is_first) override {
1023  return !is_first;
1024  }
1025  RegExpNode* FilterOneByte(int depth) override;
1026 };
1027 
1028 
1030  public:
1031  LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1032  : ChoiceNode(2, zone),
1033  loop_node_(nullptr),
1034  continue_node_(nullptr),
1035  body_can_be_zero_length_(body_can_be_zero_length),
1036  read_backward_(read_backward) {}
1037  void AddLoopAlternative(GuardedAlternative alt);
1038  void AddContinueAlternative(GuardedAlternative alt);
1039  void Emit(RegExpCompiler* compiler, Trace* trace) override;
1040  int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
1041  void GetQuickCheckDetails(QuickCheckDetails* details,
1042  RegExpCompiler* compiler, int characters_filled_in,
1043  bool not_at_start) override;
1044  void FillInBMInfo(Isolate* isolate, int offset, int budget,
1045  BoyerMooreLookahead* bm, bool not_at_start) override;
1046  RegExpNode* loop_node() { return loop_node_; }
1047  RegExpNode* continue_node() { return continue_node_; }
1048  bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1049  bool read_backward() override { return read_backward_; }
1050  void Accept(NodeVisitor* visitor) override;
1051  RegExpNode* FilterOneByte(int depth) override;
1052 
1053  private:
1054  // AddAlternative is made private for loop nodes because alternatives
1055  // should not be added freely, we need to keep track of which node
1056  // goes back to the node itself.
1057  void AddAlternative(GuardedAlternative node) {
1058  ChoiceNode::AddAlternative(node);
1059  }
1060 
1061  RegExpNode* loop_node_;
1062  RegExpNode* continue_node_;
1063  bool body_can_be_zero_length_;
1064  bool read_backward_;
1065 };
1066 
1067 
1068 // Improve the speed that we scan for an initial point where a non-anchored
1069 // regexp can match by using a Boyer-Moore-like table. This is done by
1070 // identifying non-greedy non-capturing loops in the nodes that eat any
1071 // character one at a time. For example in the middle of the regexp
1072 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1073 // inserted at the start of any non-anchored regexp.
1074 //
1075 // When we have found such a loop we look ahead in the nodes to find the set of
1076 // characters that can come at given distances. For example for the regexp
1077 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1078 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1079 // the lookahead info where the set of characters is reasonably constrained. In
1080 // our example this is from index 1 to 2 (0 is not constrained). We can now
1081 // look 3 characters ahead and if we don't find one of [f, o] (the union of
1082 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1083 //
1084 // For Unicode input strings we do the same, but modulo 128.
1085 //
1086 // We also look at the first string fed to the regexp and use that to get a hint
1087 // of the character frequencies in the inputs. This affects the assessment of
1088 // whether the set of characters is 'reasonably constrained'.
1089 //
1090 // We also have another lookahead mechanism (called quick check in the code),
1091 // which uses a wide load of multiple characters followed by a mask and compare
1092 // to determine whether a match is possible at this point.
1093 enum ContainedInLattice {
1094  kNotYet = 0,
1095  kLatticeIn = 1,
1096  kLatticeOut = 2,
1097  kLatticeUnknown = 3 // Can also mean both in and out.
1098 };
1099 
1100 
1101 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1102  return static_cast<ContainedInLattice>(a | b);
1103 }
1104 
1105 
1106 ContainedInLattice AddRange(ContainedInLattice a,
1107  const int* ranges,
1108  int ranges_size,
1109  Interval new_range);
1110 
1111 
1113  public:
1114  explicit BoyerMoorePositionInfo(Zone* zone)
1115  : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1116  map_count_(0),
1117  w_(kNotYet),
1118  s_(kNotYet),
1119  d_(kNotYet),
1120  surrogate_(kNotYet) {
1121  for (int i = 0; i < kMapSize; i++) {
1122  map_->Add(false, zone);
1123  }
1124  }
1125 
1126  bool& at(int i) { return map_->at(i); }
1127 
1128  static const int kMapSize = 128;
1129  static const int kMask = kMapSize - 1;
1130 
1131  int map_count() const { return map_count_; }
1132 
1133  void Set(int character);
1134  void SetInterval(const Interval& interval);
1135  void SetAll();
1136  bool is_non_word() { return w_ == kLatticeOut; }
1137  bool is_word() { return w_ == kLatticeIn; }
1138 
1139  private:
1140  ZoneList<bool>* map_;
1141  int map_count_; // Number of set bits in the map.
1142  ContainedInLattice w_; // The \w character class.
1143  ContainedInLattice s_; // The \s character class.
1144  ContainedInLattice d_; // The \d character class.
1145  ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1146 };
1147 
1148 
1150  public:
1151  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1152 
1153  int length() { return length_; }
1154  int max_char() { return max_char_; }
1155  RegExpCompiler* compiler() { return compiler_; }
1156 
1157  int Count(int map_number) {
1158  return bitmaps_->at(map_number)->map_count();
1159  }
1160 
1161  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1162 
1163  void Set(int map_number, int character) {
1164  if (character > max_char_) return;
1165  BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1166  info->Set(character);
1167  }
1168 
1169  void SetInterval(int map_number, const Interval& interval) {
1170  if (interval.from() > max_char_) return;
1171  BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1172  if (interval.to() > max_char_) {
1173  info->SetInterval(Interval(interval.from(), max_char_));
1174  } else {
1175  info->SetInterval(interval);
1176  }
1177  }
1178 
1179  void SetAll(int map_number) {
1180  bitmaps_->at(map_number)->SetAll();
1181  }
1182 
1183  void SetRest(int from_map) {
1184  for (int i = from_map; i < length_; i++) SetAll(i);
1185  }
1186  void EmitSkipInstructions(RegExpMacroAssembler* masm);
1187 
1188  private:
1189  // This is the value obtained by EatsAtLeast. If we do not have at least this
1190  // many characters left in the sample string then the match is bound to fail.
1191  // Therefore it is OK to read a character this far ahead of the current match
1192  // point.
1193  int length_;
1194  RegExpCompiler* compiler_;
1195  // 0xff for Latin1, 0xffff for UTF-16.
1196  int max_char_;
1198 
1199  int GetSkipTable(int min_lookahead,
1200  int max_lookahead,
1201  Handle<ByteArray> boolean_skip_table);
1202  bool FindWorthwhileInterval(int* from, int* to);
1203  int FindBestInterval(
1204  int max_number_of_chars, int old_biggest_points, int* from, int* to);
1205 };
1206 
1207 
1208 // There are many ways to generate code for a node. This class encapsulates
1209 // the current way we should be generating. In other words it encapsulates
1210 // the current state of the code generator. The effect of this is that we
1211 // generate code for paths that the matcher can take through the regular
1212 // expression. A given node in the regexp can be code-generated several times
1213 // as it can be part of several traces. For example for the regexp:
1214 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1215 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1216 // to match foo is generated only once (the traces have a common prefix). The
1217 // code to store the capture is deferred and generated (twice) after the places
1218 // where baz has been matched.
1219 class Trace {
1220  public:
1221  // A value for a property that is either known to be true, know to be false,
1222  // or not known.
1223  enum TriBool {
1224  UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1225  };
1226 
1228  public:
1229  DeferredAction(ActionNode::ActionType action_type, int reg)
1230  : action_type_(action_type), reg_(reg), next_(nullptr) {}
1231  DeferredAction* next() { return next_; }
1232  bool Mentions(int reg);
1233  int reg() { return reg_; }
1234  ActionNode::ActionType action_type() { return action_type_; }
1235  private:
1236  ActionNode::ActionType action_type_;
1237  int reg_;
1238  DeferredAction* next_;
1239  friend class Trace;
1240  };
1241 
1243  public:
1244  DeferredCapture(int reg, bool is_capture, Trace* trace)
1245  : DeferredAction(ActionNode::STORE_POSITION, reg),
1246  cp_offset_(trace->cp_offset()),
1247  is_capture_(is_capture) { }
1248  int cp_offset() { return cp_offset_; }
1249  bool is_capture() { return is_capture_; }
1250  private:
1251  int cp_offset_;
1252  bool is_capture_;
1253  void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1254  };
1255 
1257  public:
1258  DeferredSetRegister(int reg, int value)
1259  : DeferredAction(ActionNode::SET_REGISTER, reg),
1260  value_(value) { }
1261  int value() { return value_; }
1262  private:
1263  int value_;
1264  };
1265 
1267  public:
1268  explicit DeferredClearCaptures(Interval range)
1269  : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1270  range_(range) { }
1271  Interval range() { return range_; }
1272  private:
1273  Interval range_;
1274  };
1275 
1277  public:
1278  explicit DeferredIncrementRegister(int reg)
1279  : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1280  };
1281 
1282  Trace()
1283  : cp_offset_(0),
1284  actions_(nullptr),
1285  backtrack_(nullptr),
1286  stop_node_(nullptr),
1287  loop_label_(nullptr),
1288  characters_preloaded_(0),
1289  bound_checked_up_to_(0),
1290  flush_budget_(100),
1291  at_start_(UNKNOWN) {}
1292 
1293  // End the trace. This involves flushing the deferred actions in the trace
1294  // and pushing a backtrack location onto the backtrack stack. Once this is
1295  // done we can start a new trace or go to one that has already been
1296  // generated.
1297  void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1298  int cp_offset() { return cp_offset_; }
1299  DeferredAction* actions() { return actions_; }
1300  // A trivial trace is one that has no deferred actions or other state that
1301  // affects the assumptions used when generating code. There is no recorded
1302  // backtrack location in a trivial trace, so with a trivial trace we will
1303  // generate code that, on a failure to match, gets the backtrack location
1304  // from the backtrack stack rather than using a direct jump instruction. We
1305  // always start code generation with a trivial trace and non-trivial traces
1306  // are created as we emit code for nodes or add to the list of deferred
1307  // actions in the trace. The location of the code generated for a node using
1308  // a trivial trace is recorded in a label in the node so that gotos can be
1309  // generated to that code.
1310  bool is_trivial() {
1311  return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
1312  characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
1313  quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
1314  }
1315  TriBool at_start() { return at_start_; }
1316  void set_at_start(TriBool at_start) { at_start_ = at_start; }
1317  Label* backtrack() { return backtrack_; }
1318  Label* loop_label() { return loop_label_; }
1319  RegExpNode* stop_node() { return stop_node_; }
1320  int characters_preloaded() { return characters_preloaded_; }
1321  int bound_checked_up_to() { return bound_checked_up_to_; }
1322  int flush_budget() { return flush_budget_; }
1323  QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1324  bool mentions_reg(int reg);
1325  // Returns true if a deferred position store exists to the specified
1326  // register and stores the offset in the out-parameter. Otherwise
1327  // returns false.
1328  bool GetStoredPosition(int reg, int* cp_offset);
1329  // These set methods and AdvanceCurrentPositionInTrace should be used only on
1330  // new traces - the intention is that traces are immutable after creation.
1331  void add_action(DeferredAction* new_action) {
1332  DCHECK(new_action->next_ == nullptr);
1333  new_action->next_ = actions_;
1334  actions_ = new_action;
1335  }
1336  void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1337  void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1338  void set_loop_label(Label* label) { loop_label_ = label; }
1339  void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1340  void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1341  void set_flush_budget(int to) { flush_budget_ = to; }
1342  void set_quick_check_performed(QuickCheckDetails* d) {
1343  quick_check_performed_ = *d;
1344  }
1345  void InvalidateCurrentCharacter();
1346  void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1347 
1348  private:
1349  int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1350  void PerformDeferredActions(RegExpMacroAssembler* macro,
1351  int max_register,
1352  const OutSet& affected_registers,
1353  OutSet* registers_to_pop,
1354  OutSet* registers_to_clear,
1355  Zone* zone);
1356  void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1357  int max_register,
1358  const OutSet& registers_to_pop,
1359  const OutSet& registers_to_clear);
1360  int cp_offset_;
1361  DeferredAction* actions_;
1362  Label* backtrack_;
1363  RegExpNode* stop_node_;
1364  Label* loop_label_;
1365  int characters_preloaded_;
1366  int bound_checked_up_to_;
1367  QuickCheckDetails quick_check_performed_;
1368  int flush_budget_;
1369  TriBool at_start_;
1370 };
1371 
1372 
1374  public:
1375  explicit GreedyLoopState(bool not_at_start);
1376 
1377  Label* label() { return &label_; }
1378  Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1379 
1380  private:
1381  Label label_;
1382  Trace counter_backtrack_trace_;
1383 };
1384 
1385 
1387  static const int kEatsAtLeastNotYetInitialized = -1;
1388  bool preload_is_current_;
1389  bool preload_has_checked_bounds_;
1390  int preload_characters_;
1391  int eats_at_least_;
1392  void init() {
1393  eats_at_least_ = kEatsAtLeastNotYetInitialized;
1394  }
1395 };
1396 
1397 
1399  public:
1400  virtual ~NodeVisitor() = default;
1401 #define DECLARE_VISIT(Type) \
1402  virtual void Visit##Type(Type##Node* that) = 0;
1403 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1404 #undef DECLARE_VISIT
1405  virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1406 };
1407 
1408 
1409 // Node visitor used to add the start set of the alternatives to the
1410 // dispatch table of a choice node.
1412  public:
1413  DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1414  Zone* zone)
1415  : table_(table),
1416  choice_index_(-1),
1417  ignore_case_(ignore_case),
1418  zone_(zone) { }
1419 
1420  void BuildTable(ChoiceNode* node);
1421 
1422  void AddRange(CharacterRange range) {
1423  table()->AddRange(range, choice_index_, zone_);
1424  }
1425 
1426  void AddInverse(ZoneList<CharacterRange>* ranges);
1427 
1428 #define DECLARE_VISIT(Type) \
1429  virtual void Visit##Type(Type##Node* that);
1430 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1431 #undef DECLARE_VISIT
1432 
1433  DispatchTable* table() { return table_; }
1434  void set_choice_index(int value) { choice_index_ = value; }
1435 
1436  protected:
1437  DispatchTable* table_;
1438  int choice_index_;
1439  bool ignore_case_;
1440  Zone* zone_;
1441 };
1442 
1443 
1444 // Assertion propagation moves information about assertions such as
1445 // \b to the affected nodes. For instance, in /.\b./ information must
1446 // be propagated to the first '.' that whatever follows needs to know
1447 // if it matched a word or a non-word, and to the second '.' that it
1448 // has to check if it succeeds a word or non-word. In this case the
1449 // result will be something like:
1450 //
1451 // +-------+ +------------+
1452 // | . | | . |
1453 // +-------+ ---> +------------+
1454 // | word? | | check word |
1455 // +-------+ +------------+
1456 class Analysis: public NodeVisitor {
1457  public:
1458  Analysis(Isolate* isolate, bool is_one_byte)
1459  : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
1460  void EnsureAnalyzed(RegExpNode* node);
1461 
1462 #define DECLARE_VISIT(Type) void Visit##Type(Type##Node* that) override;
1463  FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1464 #undef DECLARE_VISIT
1465  void VisitLoopChoice(LoopChoiceNode* that) override;
1466 
1467  bool has_failed() { return error_message_ != nullptr; }
1468  const char* error_message() {
1469  DCHECK(error_message_ != nullptr);
1470  return error_message_;
1471  }
1472  void fail(const char* error_message) {
1473  error_message_ = error_message;
1474  }
1475 
1476  Isolate* isolate() const { return isolate_; }
1477 
1478  private:
1479  Isolate* isolate_;
1480  bool is_one_byte_;
1481  const char* error_message_;
1482 
1483  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1484 };
1485 
1486 
1489  : tree(nullptr),
1490  node(nullptr),
1491  simple(true),
1492  contains_anchor(false),
1493  capture_count(0) {}
1494  RegExpTree* tree;
1495  RegExpNode* node;
1496  bool simple;
1497  bool contains_anchor;
1498  Handle<FixedArray> capture_name_map;
1499  Handle<String> error;
1500  int capture_count;
1501 };
1502 
1503 
1504 class RegExpEngine: public AllStatic {
1505  public:
1507  inline CompilationResult(Isolate* isolate, const char* error_message);
1508  CompilationResult(Object* code, int registers)
1509  : code(code), num_registers(registers) {}
1510  const char* const error_message = nullptr;
1511  Object* const code;
1512  int const num_registers = 0;
1513  };
1514 
1515  static CompilationResult Compile(Isolate* isolate, Zone* zone,
1516  RegExpCompileData* input,
1517  JSRegExp::Flags flags,
1518  Handle<String> pattern,
1519  Handle<String> sample_subject,
1520  bool is_one_byte);
1521 
1522  static bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern);
1523 
1524  static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1525 };
1526 
1527 
1529  public:
1530  enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1531 
1532  // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
1533  // On success, the returned result is guaranteed to be a COW-array.
1534  static Object* Lookup(Heap* heap, String key_string, Object* key_pattern,
1535  FixedArray* last_match_out, ResultsCacheType type);
1536  // Attempt to add value_array to the cache specified by type. On success,
1537  // value_array is turned into a COW-array.
1538  static void Enter(Isolate* isolate, Handle<String> key_string,
1539  Handle<Object> key_pattern, Handle<FixedArray> value_array,
1540  Handle<FixedArray> last_match_cache, ResultsCacheType type);
1541  static void Clear(FixedArray cache);
1542  static const int kRegExpResultsCacheSize = 0x100;
1543 
1544  private:
1545  static const int kArrayEntriesPerCacheEntry = 4;
1546  static const int kStringOffset = 0;
1547  static const int kPatternOffset = 1;
1548  static const int kArrayOffset = 2;
1549  static const int kLastMatchOffset = 3;
1550 };
1551 
1552 } // namespace internal
1553 } // namespace v8
1554 
1555 #endif // V8_REGEXP_JSREGEXP_H_
Definition: libplatform.h:13
Definition: v8.h:3740