V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
scanner-inl.h
1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_PARSING_SCANNER_INL_H_
6 #define V8_PARSING_SCANNER_INL_H_
7 
8 #include "src/char-predicates-inl.h"
9 #include "src/parsing/keywords-gen.h"
10 #include "src/parsing/scanner.h"
11 
12 namespace v8 {
13 namespace internal {
14 
15 // ----------------------------------------------------------------------------
16 // Keyword Matcher
17 
// X-macro listing every keyword the scanner recognizes, grouped by first
// character. KEYWORD_GROUP(ch) is invoked once per distinct first character
// and is followed by one KEYWORD(string, token) invocation for each keyword
// starting with that character. Callers pass their own two macros to choose
// what each entry expands to. Several reserved words (e.g. "implements",
// "interface", "package") share Token::FUTURE_STRICT_RESERVED_WORD.
// The macro is #undef'ed again further down in this file.
18 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
19  KEYWORD_GROUP('a') \
20  KEYWORD("async", Token::ASYNC) \
21  KEYWORD("await", Token::AWAIT) \
22  KEYWORD_GROUP('b') \
23  KEYWORD("break", Token::BREAK) \
24  KEYWORD_GROUP('c') \
25  KEYWORD("case", Token::CASE) \
26  KEYWORD("catch", Token::CATCH) \
27  KEYWORD("class", Token::CLASS) \
28  KEYWORD("const", Token::CONST) \
29  KEYWORD("continue", Token::CONTINUE) \
30  KEYWORD_GROUP('d') \
31  KEYWORD("debugger", Token::DEBUGGER) \
32  KEYWORD("default", Token::DEFAULT) \
33  KEYWORD("delete", Token::DELETE) \
34  KEYWORD("do", Token::DO) \
35  KEYWORD_GROUP('e') \
36  KEYWORD("else", Token::ELSE) \
37  KEYWORD("enum", Token::ENUM) \
38  KEYWORD("export", Token::EXPORT) \
39  KEYWORD("extends", Token::EXTENDS) \
40  KEYWORD_GROUP('f') \
41  KEYWORD("false", Token::FALSE_LITERAL) \
42  KEYWORD("finally", Token::FINALLY) \
43  KEYWORD("for", Token::FOR) \
44  KEYWORD("function", Token::FUNCTION) \
45  KEYWORD_GROUP('i') \
46  KEYWORD("if", Token::IF) \
47  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
48  KEYWORD("import", Token::IMPORT) \
49  KEYWORD("in", Token::IN) \
50  KEYWORD("instanceof", Token::INSTANCEOF) \
51  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \
52  KEYWORD_GROUP('l') \
53  KEYWORD("let", Token::LET) \
54  KEYWORD_GROUP('n') \
55  KEYWORD("new", Token::NEW) \
56  KEYWORD("null", Token::NULL_LITERAL) \
57  KEYWORD_GROUP('p') \
58  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \
59  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \
60  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \
61  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \
62  KEYWORD_GROUP('r') \
63  KEYWORD("return", Token::RETURN) \
64  KEYWORD_GROUP('s') \
65  KEYWORD("static", Token::STATIC) \
66  KEYWORD("super", Token::SUPER) \
67  KEYWORD("switch", Token::SWITCH) \
68  KEYWORD_GROUP('t') \
69  KEYWORD("this", Token::THIS) \
70  KEYWORD("throw", Token::THROW) \
71  KEYWORD("true", Token::TRUE_LITERAL) \
72  KEYWORD("try", Token::TRY) \
73  KEYWORD("typeof", Token::TYPEOF) \
74  KEYWORD_GROUP('v') \
75  KEYWORD("var", Token::VAR) \
76  KEYWORD("void", Token::VOID) \
77  KEYWORD_GROUP('w') \
78  KEYWORD("while", Token::WHILE) \
79  KEYWORD("with", Token::WITH) \
80  KEYWORD_GROUP('y') \
81  KEYWORD("yield", Token::YIELD)
82 
83 constexpr bool IsKeywordStart(char c) {
84 #define KEYWORD_GROUP_CHECK(ch) c == ch ||
85 #define KEYWORD_CHECK(keyword, token)
86  return KEYWORDS(KEYWORD_GROUP_CHECK, KEYWORD_CHECK) /* || */ false;
87 #undef KEYWORD_GROUP_CHECK
88 #undef KEYWORD_CHECK
89 }
90 
91 V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input,
92  int input_length) {
93  DCHECK_GE(input_length, 1);
94  return PerfectKeywordHash::GetToken(reinterpret_cast<const char*>(input),
95  input_length);
96 }
97 
// Compile-time membership test: returns true if |c| equals any element of the
// character array |s| at index |i| or later. The search is written as a
// single recursive return expression so that it is usable as a C++11
// constexpr function. Note that for a string literal the array includes the
// terminating '\0', which therefore also counts as a member.
template <int N>
constexpr bool IsInString(const char (&s)[N], char c, size_t i = 0) {
  return i < static_cast<size_t>(N) &&
         (s[i] == c || IsInString(s, c, i + 1));
}
104 
105 inline constexpr bool CanBeKeywordCharacter(char c) {
106  return IsInString(
107 #define KEYWORD_GROUP_CASE(ch) // Nothing
108 #define KEYWORD(keyword, token) keyword
109  // Use C string literal concatenation ("a" "b" becomes "ab") to build one
110  // giant string containing all the keywords.
111  KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
112 #undef KEYWORD
113 #undef KEYWORD_GROUP_CASE
114  ,
115  c);
116 }
117 
118 // Make sure tokens are stored as a single byte.
// NOTE(review): one_char_tokens below holds 128 Token::Value entries, so this
// assertion also pins that table at 128 bytes; presumably that is the
// motivation -- confirm.
119 STATIC_ASSERT(sizeof(Token::Value) == 1);
120 
121 // Get the shortest token that this character starts, the token may change
122 // depending on subsequent characters.
123 constexpr Token::Value GetOneCharToken(char c) {
124  // clang-format off
125  return
126  c == '(' ? Token::LPAREN :
127  c == ')' ? Token::RPAREN :
128  c == '{' ? Token::LBRACE :
129  c == '}' ? Token::RBRACE :
130  c == '[' ? Token::LBRACK :
131  c == ']' ? Token::RBRACK :
132  c == '?' ? Token::CONDITIONAL :
133  c == ':' ? Token::COLON :
134  c == ';' ? Token::SEMICOLON :
135  c == ',' ? Token::COMMA :
136  c == '.' ? Token::PERIOD :
137  c == '|' ? Token::BIT_OR :
138  c == '&' ? Token::BIT_AND :
139  c == '^' ? Token::BIT_XOR :
140  c == '~' ? Token::BIT_NOT :
141  c == '!' ? Token::NOT :
142  c == '<' ? Token::LT :
143  c == '>' ? Token::GT :
144  c == '%' ? Token::MOD :
145  c == '=' ? Token::ASSIGN :
146  c == '+' ? Token::ADD :
147  c == '-' ? Token::SUB :
148  c == '*' ? Token::MUL :
149  c == '/' ? Token::DIV :
150  c == '#' ? Token::PRIVATE_NAME :
151  c == '"' ? Token::STRING :
152  c == '\'' ? Token::STRING :
153  c == '`' ? Token::TEMPLATE_SPAN :
154  c == '\\' ? Token::IDENTIFIER :
155  // Whitespace or line terminator
156  c == ' ' ? Token::WHITESPACE :
157  c == '\t' ? Token::WHITESPACE :
158  c == '\v' ? Token::WHITESPACE :
159  c == '\f' ? Token::WHITESPACE :
160  c == '\r' ? Token::WHITESPACE :
161  c == '\n' ? Token::WHITESPACE :
162  // IsDecimalDigit must be tested before IsAsciiIdentifier
163  IsDecimalDigit(c) ? Token::NUMBER :
164  IsAsciiIdentifier(c) ? Token::IDENTIFIER :
165  Token::ILLEGAL;
166  // clang-format on
167 }
168 
169 // Table of one-character tokens, by character (0x00..0x7F only).
170 static const constexpr Token::Value one_char_tokens[128] = {
171 #define CALL_GET_SCAN_FLAGS(N) GetOneCharToken(N),
172  INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
173 #undef CALL_GET_SCAN_FLAGS
174 };
175 
176 #undef KEYWORDS
177 
178 V8_INLINE Token::Value Scanner::ScanIdentifierOrKeyword() {
179  next().literal_chars.Start();
180  return ScanIdentifierOrKeywordInner();
181 }
182 
// Per-character bit flags driving the fast path for scanning keyword and
// identifier tokens. Each enumerator occupies its own bit so the flags of
// several characters can be combined with bitwise OR.
enum class ScanFlags : uint8_t {
  kTerminatesLiteral = 0x01,
  // Phrased as "cannot" instead of "can" so that ORing the flags of several
  // characters yields "at least one character rules out a keyword".
  kCannotBeKeyword = 0x02,
  // Must remain exactly one bit above kCannotBeKeyword:
  // ScanIdentifierOrKeywordInner statically asserts this before shifting the
  // first character's flags right by one.
  kCannotBeKeywordStart = 0x04,
  kStringTerminator = 0x08,
  kNeedsSlowPath = 0x10,
};
193 constexpr uint8_t GetScanFlags(char c) {
194  return
195  // Keywords are all lowercase and only contain letters.
196  // Note that non-identifier characters do not set this flag, so
197  // that it plays well with kTerminatesLiteral.
198  (IsAsciiIdentifier(c) && !CanBeKeywordCharacter(c)
199  ? static_cast<uint8_t>(ScanFlags::kCannotBeKeyword)
200  : 0) |
201  (IsKeywordStart(c)
202  ? 0
203  : static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart)) |
204  // Anything that isn't an identifier character will terminate the
205  // literal, or at least terminates the literal fast path processing
206  // (like an escape).
207  (!IsAsciiIdentifier(c)
208  ? static_cast<uint8_t>(ScanFlags::kTerminatesLiteral)
209  : 0) |
210  // Possible string termination characters.
211  ((c == '\'' || c == '"' || c == '\n' || c == '\r' || c == '\\')
212  ? static_cast<uint8_t>(ScanFlags::kStringTerminator)
213  : 0) |
214  // Escapes are processed on the slow path.
215  (c == '\\' ? static_cast<uint8_t>(ScanFlags::kNeedsSlowPath) : 0);
216 }
217 inline bool TerminatesLiteral(uint8_t scan_flags) {
218  return (scan_flags & static_cast<uint8_t>(ScanFlags::kTerminatesLiteral));
219 }
220 inline bool CanBeKeyword(uint8_t scan_flags) {
221  return !(scan_flags & static_cast<uint8_t>(ScanFlags::kCannotBeKeyword));
222 }
223 inline bool NeedsSlowPath(uint8_t scan_flags) {
224  return (scan_flags & static_cast<uint8_t>(ScanFlags::kNeedsSlowPath));
225 }
226 inline bool MayTerminateString(uint8_t scan_flags) {
227  return (scan_flags & static_cast<uint8_t>(ScanFlags::kStringTerminator));
228 }
229 // Table of precomputed scan flags for the 128 ASCII characters, for branchless
230 // flag calculation during the scan.
231 static constexpr const uint8_t character_scan_flags[128] = {
232 #define CALL_GET_SCAN_FLAGS(N) GetScanFlags(N),
233  INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
234 #undef CALL_GET_SCAN_FLAGS
235 };
236 
237 inline bool CharCanBeKeyword(uc32 c) {
238  return static_cast<uint32_t>(c) < arraysize(character_scan_flags) &&
239  CanBeKeyword(character_scan_flags[c]);
240 }
241 
// Scans an identifier or keyword beginning at c0_, which must be an
// identifier start character (see the DCHECK below).
//
// Fast path (c0_ is ASCII and not a backslash): characters are appended to
// the literal while their character_scan_flags entries are ORed into
// scan_flags, until a literal-terminating or non-ASCII character is reached.
// If the slow path is not required, the result is Token::IDENTIFIER when the
// accumulated flags rule out a keyword, and otherwise whatever
// KeywordOrIdentifierToken() reports for the one-byte literal.
//
// A leading backslash is decoded with ScanIdentifierUnicodeEscape(); the
// function returns Token::ILLEGAL if the decoded value is a backslash or is
// not an identifier start.
//
// All remaining cases continue in ScanIdentifierOrKeywordInnerSlow().
242 V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner() {
243  DCHECK(IsIdentifierStart(c0_));
244  bool escaped = false;
245  bool can_be_keyword = true;
246 
247  STATIC_ASSERT(arraysize(character_scan_flags) == kMaxAscii + 1);
248  if (V8_LIKELY(static_cast<uint32_t>(c0_) <= kMaxAscii)) {
249  if (V8_LIKELY(c0_ != '\\')) {
250  uint8_t scan_flags = character_scan_flags[c0_];
251  DCHECK(!TerminatesLiteral(scan_flags));
252  STATIC_ASSERT(static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart) ==
253  static_cast<uint8_t>(ScanFlags::kCannotBeKeyword) << 1);
     // Shift so that the first character's kCannotBeKeywordStart bit lands in
     // the kCannotBeKeyword position (guaranteed by the STATIC_ASSERT above).
     // The later CanBeKeyword(scan_flags) checks therefore judge the first
     // character by whether it can *start* a keyword.
254  scan_flags >>= 1;
255  // Make sure the shifting above doesn't set NeedsSlowPath. Otherwise we'll
256  // fall into the slow path after scanning the identifier.
257  DCHECK(!NeedsSlowPath(scan_flags));
258  AddLiteralChar(static_cast<char>(c0_));
     // The predicate returns true to stop advancing and false to consume the
     // character and continue.
259  AdvanceUntil([this, &scan_flags](uc32 c0) {
260  if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
261  // A non-ascii character means we need to drop through to the slow
262  // path.
263  // TODO(leszeks): This would be most efficient as a goto to the slow
264  // path, check codegen and maybe use a bool instead.
265  scan_flags |= static_cast<uint8_t>(ScanFlags::kNeedsSlowPath);
266  return true;
267  }
268  uint8_t char_flags = character_scan_flags[c0];
     // The terminating character's flags are accumulated too, so a
     // terminating backslash sets kNeedsSlowPath in scan_flags.
269  scan_flags |= char_flags;
270  if (TerminatesLiteral(char_flags)) {
271  return true;
272  } else {
273  AddLiteralChar(static_cast<char>(c0));
274  return false;
275  }
276  });
277 
278  if (V8_LIKELY(!NeedsSlowPath(scan_flags))) {
279  if (!CanBeKeyword(scan_flags)) return Token::IDENTIFIER;
280  // Could be a keyword or identifier.
281  Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
282  return KeywordOrIdentifierToken(chars.start(), chars.length());
283  }
284 
     // Slow path required: carry over what the fast path learned so far.
285  can_be_keyword = CanBeKeyword(scan_flags);
286  } else {
287  // Special case for escapes at the start of an identifier.
288  escaped = true;
289  uc32 c = ScanIdentifierUnicodeEscape();
     // -1 must not count as an identifier start, so that it is rejected by
     // the check below. NOTE(review): presumably -1 is the failure value of
     // ScanIdentifierUnicodeEscape() -- confirm.
290  DCHECK(!IsIdentifierStart(-1));
291  if (c == '\\' || !IsIdentifierStart(c)) {
292  return Token::ILLEGAL;
293  }
294  AddLiteralChar(c);
295  can_be_keyword = CharCanBeKeyword(c);
296  }
297  }
298 
     // Reached for a non-ASCII first character, after a leading escape, or
     // when the fast path above bailed out.
299  return ScanIdentifierOrKeywordInnerSlow(escaped, can_be_keyword);
300 }
301 
302 V8_INLINE Token::Value Scanner::SkipWhiteSpace() {
303  int start_position = source_pos();
304 
305  // We won't skip behind the end of input.
306  DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
307 
308  // Advance as long as character is a WhiteSpace or LineTerminator.
309  while (IsWhiteSpaceOrLineTerminator(c0_)) {
310  if (!next().after_line_terminator && unibrow::IsLineTerminator(c0_)) {
311  next().after_line_terminator = true;
312  }
313  Advance();
314  }
315 
316  // Return whether or not we skipped any characters.
317  if (source_pos() == start_position) {
318  DCHECK_NE('0', c0_);
319  return Token::ILLEGAL;
320  }
321 
322  return Token::WHITESPACE;
323 }
324 
// Scans and returns the next token, skipping whitespace and comments.
//
// Each loop iteration records the current source position as the token's
// beginning. ASCII characters are dispatched through the one_char_tokens
// table; the switch then either returns a finished token, delegates to a
// specialized scanner (strings, numbers, identifiers, templates, private
// names), or skips whitespace/comments. For non-ASCII input the function
// scans an identifier if c0_ is an identifier start (possibly after combining
// a surrogate pair), reports Token::EOS at end of input (Token::ILLEGAL if
// the source has a parser error), and otherwise tries to skip whitespace.
//
// Inside the switch, `continue` jumps to the `while (token ==
// Token::WHITESPACE)` test at the bottom of the loop: scanning resumes only
// if the skipping helper returned Token::WHITESPACE, and any other token it
// returned is handed back to the caller.
325 V8_INLINE Token::Value Scanner::ScanSingleToken() {
326  Token::Value token;
327  do {
328  next().location.beg_pos = source_pos();
329 
330  if (V8_LIKELY(static_cast<unsigned>(c0_) <= kMaxAscii)) {
331  token = one_char_tokens[c0_];
332 
333  switch (token) {
334  case Token::LPAREN:
335  case Token::RPAREN:
336  case Token::LBRACE:
337  case Token::RBRACE:
338  case Token::LBRACK:
339  case Token::RBRACK:
340  case Token::CONDITIONAL:
341  case Token::COLON:
342  case Token::SEMICOLON:
343  case Token::COMMA:
344  case Token::BIT_NOT:
345  case Token::ILLEGAL:
346  // One character tokens.
347  return Select(token);
348 
349  case Token::STRING:
350  return ScanString();
351 
352  case Token::LT:
353  // < <= << <<= <!--
354  Advance();
355  if (c0_ == '=') return Select(Token::LTE);
356  if (c0_ == '<') return Select('=', Token::ASSIGN_SHL, Token::SHL);
357  if (c0_ == '!') {
358  token = ScanHtmlComment();
359  continue;
360  }
361  return Token::LT;
362 
363  case Token::GT:
364  // > >= >> >>= >>> >>>=
365  Advance();
366  if (c0_ == '=') return Select(Token::GTE);
367  if (c0_ == '>') {
368  // >> >>= >>> >>>=
369  Advance();
370  if (c0_ == '=') return Select(Token::ASSIGN_SAR);
371  if (c0_ == '>') return Select('=', Token::ASSIGN_SHR, Token::SHR);
372  return Token::SAR;
373  }
374  return Token::GT;
375 
376  case Token::ASSIGN:
377  // = == === =>
378  Advance();
379  if (c0_ == '=') return Select('=', Token::EQ_STRICT, Token::EQ);
380  if (c0_ == '>') return Select(Token::ARROW);
381  return Token::ASSIGN;
382 
383  case Token::NOT:
384  // ! != !==
385  Advance();
386  if (c0_ == '=') return Select('=', Token::NE_STRICT, Token::NE);
387  return Token::NOT;
388 
389  case Token::ADD:
390  // + ++ +=
391  Advance();
392  if (c0_ == '+') return Select(Token::INC);
393  if (c0_ == '=') return Select(Token::ASSIGN_ADD);
394  return Token::ADD;
395 
396  case Token::SUB:
397  // - -- --> -=
398  Advance();
399  if (c0_ == '-') {
400  Advance();
401  if (c0_ == '>' && next().after_line_terminator) {
402  // For compatibility with SpiderMonkey, we skip lines that
403  // start with an HTML comment end '-->'.
404  token = SkipSingleHTMLComment();
405  continue;
406  }
407  return Token::DEC;
408  }
409  if (c0_ == '=') return Select(Token::ASSIGN_SUB);
410  return Token::SUB;
411 
412  case Token::MUL:
413  // * ** **= *=
414  Advance();
415  if (c0_ == '*') return Select('=', Token::ASSIGN_EXP, Token::EXP);
416  if (c0_ == '=') return Select(Token::ASSIGN_MUL);
417  return Token::MUL;
418 
419  case Token::MOD:
420  // % %=
421  return Select('=', Token::ASSIGN_MOD, Token::MOD);
422 
423  case Token::DIV:
424  // / // /* /=
425  Advance();
426  if (c0_ == '/') {
     // "//#" and "//@" comments are handed to SkipSourceURLComment()
     // instead of the plain single-line comment skipper.
427  uc32 c = Peek();
428  if (c == '#' || c == '@') {
429  Advance();
430  Advance();
431  token = SkipSourceURLComment();
432  continue;
433  }
434  token = SkipSingleLineComment();
435  continue;
436  }
437  if (c0_ == '*') {
438  token = SkipMultiLineComment();
439  continue;
440  }
441  if (c0_ == '=') return Select(Token::ASSIGN_DIV);
442  return Token::DIV;
443 
444  case Token::BIT_AND:
445  // & && &=
446  Advance();
447  if (c0_ == '&') return Select(Token::AND);
448  if (c0_ == '=') return Select(Token::ASSIGN_BIT_AND);
449  return Token::BIT_AND;
450 
451  case Token::BIT_OR:
452  // | || |=
453  Advance();
454  if (c0_ == '|') return Select(Token::OR);
455  if (c0_ == '=') return Select(Token::ASSIGN_BIT_OR);
456  return Token::BIT_OR;
457 
458  case Token::BIT_XOR:
459  // ^ ^=
460  return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
461 
462  case Token::PERIOD:
463  // . ... Number
464  Advance();
465  if (IsDecimalDigit(c0_)) return ScanNumber(true);
466  if (c0_ == '.') {
     // Only consume the remaining dots when both are present; a lone
     // ".." is left as a PERIOD followed by whatever comes next.
467  if (Peek() == '.') {
468  Advance();
469  Advance();
470  return Token::ELLIPSIS;
471  }
472  }
473  return Token::PERIOD;
474 
475  case Token::TEMPLATE_SPAN:
476  Advance();
477  return ScanTemplateSpan();
478 
479  case Token::PRIVATE_NAME:
480  return ScanPrivateName();
481 
482  case Token::WHITESPACE:
483  token = SkipWhiteSpace();
484  continue;
485 
486  case Token::NUMBER:
487  return ScanNumber(false);
488 
489  case Token::IDENTIFIER:
490  return ScanIdentifierOrKeyword();
491 
492  default:
493  UNREACHABLE();
494  }
495  }
496 
     // Non-ASCII input from here on.
497  if (IsIdentifierStart(c0_) ||
498  (CombineSurrogatePair() && IsIdentifierStart(c0_))) {
499  return ScanIdentifierOrKeyword();
500  }
501  if (c0_ == kEndOfInput) {
502  return source_->has_parser_error() ? Token::ILLEGAL : Token::EOS;
503  }
504  token = SkipWhiteSpace();
505 
506  // Continue scanning for tokens as long as we're just skipping whitespace.
507  } while (token == Token::WHITESPACE);
508 
509  return token;
510 }
511 
512 void Scanner::Scan(TokenDesc* next_desc) {
513  DCHECK_EQ(next_desc, &next());
514 
515  next_desc->token = ScanSingleToken();
516  DCHECK_IMPLIES(has_parser_error(), next_desc->token == Token::ILLEGAL);
517  next_desc->location.end_pos = source_pos();
518 
519 #ifdef DEBUG
520  SanityCheckTokenDesc(current());
521  SanityCheckTokenDesc(next());
522  SanityCheckTokenDesc(next_next());
523 #endif
524 }
525 
526 void Scanner::Scan() { Scan(next_); }
527 
528 } // namespace internal
529 } // namespace v8
530 
531 #endif // V8_PARSING_SCANNER_INL_H_
Definition: libplatform.h:13