V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
scanner.cc
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Features shared by parsing and pre-parsing scanners.
6 
7 #include "src/parsing/scanner.h"
8 
9 #include <stdint.h>
10 
11 #include <cmath>
12 
13 #include "src/ast/ast-value-factory.h"
14 #include "src/conversions-inl.h"
15 #include "src/objects/bigint.h"
16 #include "src/parsing/scanner-inl.h"
17 #include "src/zone/zone.h"
18 
19 namespace v8 {
20 namespace internal {
21 
23  public:
24  ErrorState(MessageTemplate* message_stack, Scanner::Location* location_stack)
25  : message_stack_(message_stack),
26  old_message_(*message_stack),
27  location_stack_(location_stack),
28  old_location_(*location_stack) {
29  *message_stack_ = MessageTemplate::kNone;
30  *location_stack_ = Location::invalid();
31  }
32 
33  ~ErrorState() {
34  *message_stack_ = old_message_;
35  *location_stack_ = old_location_;
36  }
37 
38  void MoveErrorTo(TokenDesc* dest) {
39  if (*message_stack_ == MessageTemplate::kNone) {
40  return;
41  }
42  if (dest->invalid_template_escape_message == MessageTemplate::kNone) {
43  dest->invalid_template_escape_message = *message_stack_;
44  dest->invalid_template_escape_location = *location_stack_;
45  }
46  *message_stack_ = MessageTemplate::kNone;
47  *location_stack_ = Location::invalid();
48  }
49 
50  private:
51  MessageTemplate* const message_stack_;
52  MessageTemplate const old_message_;
53  Scanner::Location* const location_stack_;
54  Scanner::Location const old_location_;
55 };
56 
57 // ----------------------------------------------------------------------------
58 // Scanner::LiteralBuffer
59 
60 Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
61  if (is_one_byte()) {
62  return isolate->factory()->InternalizeOneByteString(one_byte_literal());
63  }
64  return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
65 }
66 
67 int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
68  int capacity = Max(min_capacity, backing_store_.length());
69  int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
70  return new_capacity;
71 }
72 
73 void Scanner::LiteralBuffer::ExpandBuffer() {
74  Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
75  MemCopy(new_store.start(), backing_store_.start(), position_);
76  backing_store_.Dispose();
77  backing_store_ = new_store;
78 }
79 
80 void Scanner::LiteralBuffer::ConvertToTwoByte() {
81  DCHECK(is_one_byte());
82  Vector<byte> new_store;
83  int new_content_size = position_ * kUC16Size;
84  if (new_content_size >= backing_store_.length()) {
85  // Ensure room for all currently read code units as UC16 as well
86  // as the code unit about to be stored.
87  new_store = Vector<byte>::New(NewCapacity(new_content_size));
88  } else {
89  new_store = backing_store_;
90  }
91  uint8_t* src = backing_store_.start();
92  uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
93  for (int i = position_ - 1; i >= 0; i--) {
94  dst[i] = src[i];
95  }
96  if (new_store.start() != backing_store_.start()) {
97  backing_store_.Dispose();
98  backing_store_ = new_store;
99  }
100  position_ = new_content_size;
101  is_one_byte_ = false;
102 }
103 
104 void Scanner::LiteralBuffer::AddTwoByteChar(uc32 code_unit) {
105  DCHECK(!is_one_byte());
106  if (position_ >= backing_store_.length()) ExpandBuffer();
107  if (code_unit <=
108  static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
109  *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
110  position_ += kUC16Size;
111  } else {
112  *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
113  unibrow::Utf16::LeadSurrogate(code_unit);
114  position_ += kUC16Size;
115  if (position_ >= backing_store_.length()) ExpandBuffer();
116  *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
117  unibrow::Utf16::TrailSurrogate(code_unit);
118  position_ += kUC16Size;
119  }
120 }
121 
122 // ----------------------------------------------------------------------------
123 // Scanner::BookmarkScope
124 
125 const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos =
126  std::numeric_limits<size_t>::max() - 2;
127 const size_t Scanner::BookmarkScope::kNoBookmark =
128  std::numeric_limits<size_t>::max() - 1;
129 const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
130  std::numeric_limits<size_t>::max();
131 
132 void Scanner::BookmarkScope::Set() {
133  DCHECK_EQ(bookmark_, kNoBookmark);
134 
135  // The first token is a bit special, since current_ will still be
136  // uninitialized. In this case, store kBookmarkAtFirstPos and special-case it
137  // when
138  // applying the bookmark.
139  DCHECK_IMPLIES(scanner_->current().token == Token::UNINITIALIZED,
140  scanner_->current().location.beg_pos ==
141  scanner_->next().location.beg_pos);
142  bookmark_ = (scanner_->current().token == Token::UNINITIALIZED)
143  ? kBookmarkAtFirstPos
144  : scanner_->location().beg_pos;
145 }
146 
147 void Scanner::BookmarkScope::Apply() {
148  DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark.
149  if (had_parser_error_) {
150  scanner_->set_parser_error();
151  } else {
152  scanner_->reset_parser_error_flag();
153  if (bookmark_ == kBookmarkAtFirstPos) {
154  scanner_->SeekNext(0);
155  } else {
156  scanner_->SeekNext(bookmark_);
157  scanner_->Next();
158  DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_));
159  }
160  }
161  bookmark_ = kBookmarkWasApplied;
162 }
163 
164 bool Scanner::BookmarkScope::HasBeenSet() const {
165  return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
166 }
167 
168 bool Scanner::BookmarkScope::HasBeenApplied() const {
169  return bookmark_ == kBookmarkWasApplied;
170 }
171 
172 // ----------------------------------------------------------------------------
173 // Scanner
174 
175 Scanner::Scanner(Utf16CharacterStream* source, bool is_module)
176  : source_(source),
177  found_html_comment_(false),
178  allow_harmony_numeric_separator_(false),
179  is_module_(is_module),
180  octal_pos_(Location::invalid()),
181  octal_message_(MessageTemplate::kNone) {
182  DCHECK_NOT_NULL(source);
183 }
184 
185 void Scanner::Initialize() {
186  // Need to capture identifiers in order to recognize "get" and "set"
187  // in object literals.
188  Init();
189  next().after_line_terminator = true;
190  Scan();
191 }
192 
193 template <bool capture_raw, bool unicode>
194 uc32 Scanner::ScanHexNumber(int expected_length) {
195  DCHECK_LE(expected_length, 4); // prevent overflow
196 
197  int begin = source_pos() - 2;
198  uc32 x = 0;
199  for (int i = 0; i < expected_length; i++) {
200  int d = HexValue(c0_);
201  if (d < 0) {
202  ReportScannerError(Location(begin, begin + expected_length + 2),
203  unicode
204  ? MessageTemplate::kInvalidUnicodeEscapeSequence
205  : MessageTemplate::kInvalidHexEscapeSequence);
206  return -1;
207  }
208  x = x * 16 + d;
209  Advance<capture_raw>();
210  }
211 
212  return x;
213 }
214 
215 template <bool capture_raw>
216 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
217  uc32 x = 0;
218  int d = HexValue(c0_);
219  if (d < 0) return -1;
220 
221  while (d >= 0) {
222  x = x * 16 + d;
223  if (x > max_value) {
224  ReportScannerError(Location(beg_pos, source_pos() + 1),
225  MessageTemplate::kUndefinedUnicodeCodePoint);
226  return -1;
227  }
228  Advance<capture_raw>();
229  d = HexValue(c0_);
230  }
231 
232  return x;
233 }
234 
235 Token::Value Scanner::Next() {
236  // Rotate through tokens.
237  TokenDesc* previous = current_;
238  current_ = next_;
239  // Either we already have the next token lined up, in which case next_next_
240  // simply becomes next_. In that case we use current_ as new next_next_ and
241  // clear its token to indicate that it wasn't scanned yet. Otherwise we use
242  // current_ as next_ and scan into it, leaving next_next_ uninitialized.
243  if (V8_LIKELY(next_next().token == Token::UNINITIALIZED)) {
244  next_ = previous;
245  // User 'previous' instead of 'next_' because for some reason the compiler
246  // thinks 'next_' could be modified before the entry into Scan.
247  previous->after_line_terminator = false;
248  Scan(previous);
249  } else {
250  next_ = next_next_;
251  next_next_ = previous;
252  previous->token = Token::UNINITIALIZED;
253  DCHECK_NE(Token::UNINITIALIZED, current().token);
254  }
255  return current().token;
256 }
257 
258 Token::Value Scanner::PeekAhead() {
259  DCHECK(next().token != Token::DIV);
260  DCHECK(next().token != Token::ASSIGN_DIV);
261 
262  if (next_next().token != Token::UNINITIALIZED) {
263  return next_next().token;
264  }
265  TokenDesc* temp = next_;
266  next_ = next_next_;
267  next().after_line_terminator = false;
268  Scan();
269  next_next_ = next_;
270  next_ = temp;
271  return next_next().token;
272 }
273 
274 Token::Value Scanner::SkipSingleHTMLComment() {
275  if (is_module_) {
276  ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
277  return Token::ILLEGAL;
278  }
279  return SkipSingleLineComment();
280 }
281 
282 Token::Value Scanner::SkipSingleLineComment() {
283  // The line terminator at the end of the line is not considered
284  // to be part of the single-line comment; it is recognized
285  // separately by the lexical grammar and becomes part of the
286  // stream of input elements for the syntactic grammar (see
287  // ECMA-262, section 7.4).
288  AdvanceUntil([](uc32 c0_) { return unibrow::IsLineTerminator(c0_); });
289 
290  return Token::WHITESPACE;
291 }
292 
293 Token::Value Scanner::SkipSourceURLComment() {
294  TryToParseSourceURLComment();
295  while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
296  Advance();
297  }
298 
299  return Token::WHITESPACE;
300 }
301 
302 void Scanner::TryToParseSourceURLComment() {
303  // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
304  // function will just return if it cannot parse a magic comment.
305  DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
306  if (!IsWhiteSpace(c0_)) return;
307  Advance();
308  LiteralBuffer name;
309  name.Start();
310 
311  while (c0_ != kEndOfInput && !IsWhiteSpaceOrLineTerminator(c0_) &&
312  c0_ != '=') {
313  name.AddChar(c0_);
314  Advance();
315  }
316  if (!name.is_one_byte()) return;
317  Vector<const uint8_t> name_literal = name.one_byte_literal();
318  LiteralBuffer* value;
319  if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
320  value = &source_url_;
321  } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
322  value = &source_mapping_url_;
323  } else {
324  return;
325  }
326  if (c0_ != '=')
327  return;
328  value->Start();
329  Advance();
330  while (IsWhiteSpace(c0_)) {
331  Advance();
332  }
333  while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
334  // Disallowed characters.
335  if (c0_ == '"' || c0_ == '\'') {
336  value->Start();
337  return;
338  }
339  if (IsWhiteSpace(c0_)) {
340  break;
341  }
342  value->AddChar(c0_);
343  Advance();
344  }
345  // Allow whitespace at the end.
346  while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
347  if (!IsWhiteSpace(c0_)) {
348  value->Start();
349  break;
350  }
351  Advance();
352  }
353 }
354 
355 Token::Value Scanner::SkipMultiLineComment() {
356  DCHECK_EQ(c0_, '*');
357  Advance();
358 
359  while (c0_ != kEndOfInput) {
360  DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
361  if (!HasLineTerminatorBeforeNext() && unibrow::IsLineTerminator(c0_)) {
362  // Following ECMA-262, section 7.4, a comment containing
363  // a newline will make the comment count as a line-terminator.
364  next().after_line_terminator = true;
365  }
366 
367  while (V8_UNLIKELY(c0_ == '*')) {
368  Advance();
369  if (c0_ == '/') {
370  Advance();
371  return Token::WHITESPACE;
372  }
373  }
374  Advance();
375  }
376 
377  // Unterminated multi-line comment.
378  return Token::ILLEGAL;
379 }
380 
381 Token::Value Scanner::ScanHtmlComment() {
382  // Check for <!-- comments.
383  DCHECK_EQ(c0_, '!');
384  Advance();
385  if (c0_ != '-' || Peek() != '-') {
386  PushBack('!'); // undo Advance()
387  return Token::LT;
388  }
389  Advance();
390 
391  found_html_comment_ = true;
392  return SkipSingleHTMLComment();
393 }
394 
#ifdef DEBUG
// Debug-only check: only TEMPLATE_* tokens can have an
// invalid_template_escape_message. ILLEGAL and UNINITIALIZED tokens are
// exempt because their fields (token.literal_chars and other members) might
// be garbage.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  const bool exempt = token.token == Token::UNINITIALIZED ||
                      token.token == Token::ILLEGAL ||
                      token.token == Token::TEMPLATE_SPAN ||
                      token.token == Token::TEMPLATE_TAIL;
  if (exempt) return;
  DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
}
#endif  // DEBUG
413 
414 void Scanner::SeekForward(int pos) {
415  // After this call, we will have the token at the given position as
416  // the "next" token. The "current" token will be invalid.
417  if (pos == next().location.beg_pos) return;
418  int current_pos = source_pos();
419  DCHECK_EQ(next().location.end_pos, current_pos);
420  // Positions inside the lookahead token aren't supported.
421  DCHECK(pos >= current_pos);
422  if (pos != current_pos) {
423  source_->Seek(pos);
424  Advance();
425  // This function is only called to seek to the location
426  // of the end of a function (at the "}" token). It doesn't matter
427  // whether there was a line terminator in the part we skip.
428  next().after_line_terminator = false;
429  }
430  Scan();
431 }
432 
433 template <bool capture_raw>
434 bool Scanner::ScanEscape() {
435  uc32 c = c0_;
436  Advance<capture_raw>();
437 
438  // Skip escaped newlines.
439  DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
440  if (!capture_raw && unibrow::IsLineTerminator(c)) {
441  // Allow escaped CR+LF newlines in multiline string literals.
442  if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
443  return true;
444  }
445 
446  switch (c) {
447  case '\'': // fall through
448  case '"' : // fall through
449  case '\\': break;
450  case 'b' : c = '\b'; break;
451  case 'f' : c = '\f'; break;
452  case 'n' : c = '\n'; break;
453  case 'r' : c = '\r'; break;
454  case 't' : c = '\t'; break;
455  case 'u' : {
456  c = ScanUnicodeEscape<capture_raw>();
457  if (c < 0) return false;
458  break;
459  }
460  case 'v':
461  c = '\v';
462  break;
463  case 'x': {
464  c = ScanHexNumber<capture_raw>(2);
465  if (c < 0) return false;
466  break;
467  }
468  case '0': // Fall through.
469  case '1': // fall through
470  case '2': // fall through
471  case '3': // fall through
472  case '4': // fall through
473  case '5': // fall through
474  case '6': // fall through
475  case '7':
476  c = ScanOctalEscape<capture_raw>(c, 2);
477  break;
478  }
479 
480  // Other escaped characters are interpreted as their non-escaped version.
481  AddLiteralChar(c);
482  return true;
483 }
484 
485 template <bool capture_raw>
486 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
487  uc32 x = c - '0';
488  int i = 0;
489  for (; i < length; i++) {
490  int d = c0_ - '0';
491  if (d < 0 || d > 7) break;
492  int nx = x * 8 + d;
493  if (nx >= 256) break;
494  x = nx;
495  Advance<capture_raw>();
496  }
497  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
498  // Remember the position of octal escape sequences so that an error
499  // can be reported later (in strict mode).
500  // We don't report the error immediately, because the octal escape can
501  // occur before the "use strict" directive.
502  if (c != '0' || i > 0 || IsNonOctalDecimalDigit(c0_)) {
503  octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
504  octal_message_ = capture_raw ? MessageTemplate::kTemplateOctalLiteral
505  : MessageTemplate::kStrictOctalEscape;
506  }
507  return x;
508 }
509 
510 Token::Value Scanner::ScanString() {
511  uc32 quote = c0_;
512  Advance(); // consume quote
513 
514  next().literal_chars.Start();
515  while (true) {
516  if (V8_UNLIKELY(c0_ == kEndOfInput)) return Token::ILLEGAL;
517  if ((V8_UNLIKELY(static_cast<uint32_t>(c0_) >= kMaxAscii) &&
518  !unibrow::IsStringLiteralLineTerminator(c0_)) ||
519  !MayTerminateString(character_scan_flags[c0_])) {
520  AddLiteralChar(c0_);
521  AdvanceUntil([this](uc32 c0) {
522  if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
523  if (V8_UNLIKELY(unibrow::IsStringLiteralLineTerminator(c0))) {
524  return true;
525  }
526  AddLiteralChar(c0);
527  return false;
528  }
529  uint8_t char_flags = character_scan_flags[c0];
530  if (MayTerminateString(char_flags)) return true;
531  AddLiteralChar(c0);
532  return false;
533  });
534  }
535  if (c0_ == quote) {
536  Advance();
537  return Token::STRING;
538  }
539  if (c0_ == '\\') {
540  Advance();
541  // TODO(verwaest): Check whether we can remove the additional check.
542  if (V8_UNLIKELY(c0_ == kEndOfInput || !ScanEscape<false>())) {
543  return Token::ILLEGAL;
544  }
545  continue;
546  }
547  if (V8_UNLIKELY(c0_ == kEndOfInput ||
548  unibrow::IsStringLiteralLineTerminator(c0_))) {
549  return Token::ILLEGAL;
550  }
551  DCHECK_NE(quote, c0_);
552  DCHECK((c0_ == '\'' || c0_ == '"'));
553  AddLiteralCharAdvance();
554  }
555 }
556 
557 Token::Value Scanner::ScanPrivateName() {
558  if (!allow_harmony_private_fields()) {
559  ReportScannerError(source_pos(),
560  MessageTemplate::kInvalidOrUnexpectedToken);
561  return Token::ILLEGAL;
562  }
563 
564  next().literal_chars.Start();
565  DCHECK_EQ(c0_, '#');
566  DCHECK(!IsIdentifierStart(kEndOfInput));
567  if (!IsIdentifierStart(Peek())) {
568  ReportScannerError(source_pos(),
569  MessageTemplate::kInvalidOrUnexpectedToken);
570  return Token::ILLEGAL;
571  }
572 
573  AddLiteralCharAdvance();
574  Token::Value token = ScanIdentifierOrKeywordInner();
575  return token == Token::ILLEGAL ? Token::ILLEGAL : Token::PRIVATE_NAME;
576 }
577 
578 Token::Value Scanner::ScanTemplateSpan() {
579  // When scanning a TemplateSpan, we are looking for the following construct:
580  // TEMPLATE_SPAN ::
581  // ` LiteralChars* ${
582  // | } LiteralChars* ${
583  //
584  // TEMPLATE_TAIL ::
585  // ` LiteralChars* `
586  // | } LiteralChar* `
587  //
588  // A TEMPLATE_SPAN should always be followed by an Expression, while a
589  // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
590  // followed by an Expression.
591 
592  // These scoped helpers save and restore the original error state, so that we
593  // can specially treat invalid escape sequences in templates (which are
594  // handled by the parser).
595  ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
596  ErrorState octal_error_state(&octal_message_, &octal_pos_);
597 
598  Token::Value result = Token::TEMPLATE_SPAN;
599  next().literal_chars.Start();
600  next().raw_literal_chars.Start();
601  const bool capture_raw = true;
602  while (true) {
603  uc32 c = c0_;
604  if (c == '`') {
605  Advance(); // Consume '`'
606  result = Token::TEMPLATE_TAIL;
607  break;
608  } else if (c == '$' && Peek() == '{') {
609  Advance(); // Consume '$'
610  Advance(); // Consume '{'
611  break;
612  } else if (c == '\\') {
613  Advance(); // Consume '\\'
614  DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
615  if (capture_raw) AddRawLiteralChar('\\');
616  if (unibrow::IsLineTerminator(c0_)) {
617  // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
618  // code unit sequence.
619  uc32 lastChar = c0_;
620  Advance();
621  if (lastChar == '\r') {
622  // Also skip \n.
623  if (c0_ == '\n') Advance();
624  lastChar = '\n';
625  }
626  if (capture_raw) AddRawLiteralChar(lastChar);
627  } else {
628  bool success = ScanEscape<capture_raw>();
629  USE(success);
630  DCHECK_EQ(!success, has_error());
631  // For templates, invalid escape sequence checking is handled in the
632  // parser.
633  scanner_error_state.MoveErrorTo(next_);
634  octal_error_state.MoveErrorTo(next_);
635  }
636  } else if (c < 0) {
637  // Unterminated template literal
638  break;
639  } else {
640  Advance(); // Consume c.
641  // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
642  // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
643  // consisting of the CV 0x000A.
644  if (c == '\r') {
645  if (c0_ == '\n') Advance(); // Consume '\n'
646  c = '\n';
647  }
648  if (capture_raw) AddRawLiteralChar(c);
649  AddLiteralChar(c);
650  }
651  }
652  next().location.end_pos = source_pos();
653  next().token = result;
654 
655  return result;
656 }
657 
658 Handle<String> Scanner::SourceUrl(Isolate* isolate) const {
659  Handle<String> tmp;
660  if (source_url_.length() > 0) {
661  tmp = source_url_.Internalize(isolate);
662  }
663  return tmp;
664 }
665 
666 Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const {
667  Handle<String> tmp;
668  if (source_mapping_url_.length() > 0) {
669  tmp = source_mapping_url_.Internalize(isolate);
670  }
671  return tmp;
672 }
673 
674 bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
675  bool is_check_first_digit) {
676  // we must have at least one digit after 'x'/'b'/'o'
677  if (is_check_first_digit && !predicate(c0_)) return false;
678 
679  bool separator_seen = false;
680  while (predicate(c0_) || c0_ == '_') {
681  if (c0_ == '_') {
682  Advance();
683  if (c0_ == '_') {
684  ReportScannerError(Location(source_pos(), source_pos() + 1),
685  MessageTemplate::kContinuousNumericSeparator);
686  return false;
687  }
688  separator_seen = true;
689  continue;
690  }
691  separator_seen = false;
692  AddLiteralCharAdvance();
693  }
694 
695  if (separator_seen) {
696  ReportScannerError(Location(source_pos(), source_pos() + 1),
697  MessageTemplate::kTrailingNumericSeparator);
698  return false;
699  }
700 
701  return true;
702 }
703 
704 bool Scanner::ScanDecimalDigits() {
705  if (allow_harmony_numeric_separator()) {
706  return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false);
707  }
708  while (IsDecimalDigit(c0_)) {
709  AddLiteralCharAdvance();
710  }
711  return true;
712 }
713 
714 bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) {
715  bool separator_seen = false;
716  while (IsDecimalDigit(c0_) || c0_ == '_') {
717  if (c0_ == '_') {
718  Advance();
719  if (c0_ == '_') {
720  ReportScannerError(Location(source_pos(), source_pos() + 1),
721  MessageTemplate::kContinuousNumericSeparator);
722  return false;
723  }
724  separator_seen = true;
725  continue;
726  }
727  separator_seen = false;
728  *value = 10 * *value + (c0_ - '0');
729  uc32 first_char = c0_;
730  Advance();
731  AddLiteralChar(first_char);
732  }
733 
734  if (separator_seen) {
735  ReportScannerError(Location(source_pos(), source_pos() + 1),
736  MessageTemplate::kTrailingNumericSeparator);
737  return false;
738  }
739 
740  return true;
741 }
742 
743 bool Scanner::ScanDecimalAsSmi(uint64_t* value) {
744  if (allow_harmony_numeric_separator()) {
745  return ScanDecimalAsSmiWithNumericSeparators(value);
746  }
747 
748  while (IsDecimalDigit(c0_)) {
749  *value = 10 * *value + (c0_ - '0');
750  uc32 first_char = c0_;
751  Advance();
752  AddLiteralChar(first_char);
753  }
754  return true;
755 }
756 
757 bool Scanner::ScanBinaryDigits() {
758  if (allow_harmony_numeric_separator()) {
759  return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true);
760  }
761 
762  // we must have at least one binary digit after 'b'/'B'
763  if (!IsBinaryDigit(c0_)) {
764  return false;
765  }
766 
767  while (IsBinaryDigit(c0_)) {
768  AddLiteralCharAdvance();
769  }
770  return true;
771 }
772 
773 bool Scanner::ScanOctalDigits() {
774  if (allow_harmony_numeric_separator()) {
775  return ScanDigitsWithNumericSeparators(&IsOctalDigit, true);
776  }
777 
778  // we must have at least one octal digit after 'o'/'O'
779  if (!IsOctalDigit(c0_)) {
780  return false;
781  }
782 
783  while (IsOctalDigit(c0_)) {
784  AddLiteralCharAdvance();
785  }
786  return true;
787 }
788 
789 bool Scanner::ScanImplicitOctalDigits(int start_pos,
790  Scanner::NumberKind* kind) {
791  *kind = IMPLICIT_OCTAL;
792 
793  while (true) {
794  // (possible) octal number
795  if (IsNonOctalDecimalDigit(c0_)) {
796  *kind = DECIMAL_WITH_LEADING_ZERO;
797  return true;
798  }
799  if (!IsOctalDigit(c0_)) {
800  // Octal literal finished.
801  octal_pos_ = Location(start_pos, source_pos());
802  octal_message_ = MessageTemplate::kStrictOctalLiteral;
803  return true;
804  }
805  AddLiteralCharAdvance();
806  }
807 }
808 
809 bool Scanner::ScanHexDigits() {
810  if (allow_harmony_numeric_separator()) {
811  return ScanDigitsWithNumericSeparators(&IsHexDigit, true);
812  }
813 
814  // we must have at least one hex digit after 'x'/'X'
815  if (!IsHexDigit(c0_)) {
816  return false;
817  }
818 
819  while (IsHexDigit(c0_)) {
820  AddLiteralCharAdvance();
821  }
822  return true;
823 }
824 
825 bool Scanner::ScanSignedInteger() {
826  if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance();
827  // we must have at least one decimal digit after 'e'/'E'
828  if (!IsDecimalDigit(c0_)) return false;
829  return ScanDecimalDigits();
830 }
831 
832 Token::Value Scanner::ScanNumber(bool seen_period) {
833  DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
834 
835  NumberKind kind = DECIMAL;
836 
837  next().literal_chars.Start();
838  bool at_start = !seen_period;
839  int start_pos = source_pos(); // For reporting octal positions.
840  if (seen_period) {
841  // we have already seen a decimal point of the float
842  AddLiteralChar('.');
843  if (allow_harmony_numeric_separator() && c0_ == '_') {
844  return Token::ILLEGAL;
845  }
846  // we know we have at least one digit
847  if (!ScanDecimalDigits()) return Token::ILLEGAL;
848  } else {
849  // if the first character is '0' we must check for octals and hex
850  if (c0_ == '0') {
851  AddLiteralCharAdvance();
852 
853  // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
854  // an octal number.
855  if (c0_ == 'x' || c0_ == 'X') {
856  AddLiteralCharAdvance();
857  kind = HEX;
858  if (!ScanHexDigits()) return Token::ILLEGAL;
859  } else if (c0_ == 'o' || c0_ == 'O') {
860  AddLiteralCharAdvance();
861  kind = OCTAL;
862  if (!ScanOctalDigits()) return Token::ILLEGAL;
863  } else if (c0_ == 'b' || c0_ == 'B') {
864  AddLiteralCharAdvance();
865  kind = BINARY;
866  if (!ScanBinaryDigits()) return Token::ILLEGAL;
867  } else if (IsOctalDigit(c0_)) {
868  kind = IMPLICIT_OCTAL;
869  if (!ScanImplicitOctalDigits(start_pos, &kind)) {
870  return Token::ILLEGAL;
871  }
872  if (kind == DECIMAL_WITH_LEADING_ZERO) {
873  at_start = false;
874  }
875  } else if (IsNonOctalDecimalDigit(c0_)) {
876  kind = DECIMAL_WITH_LEADING_ZERO;
877  } else if (allow_harmony_numeric_separator() && c0_ == '_') {
878  ReportScannerError(Location(source_pos(), source_pos() + 1),
879  MessageTemplate::kZeroDigitNumericSeparator);
880  return Token::ILLEGAL;
881  }
882  }
883 
884  // Parse decimal digits and allow trailing fractional part.
885  if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
886  // This is an optimization for parsing Decimal numbers as Smi's.
887  if (at_start) {
888  uint64_t value = 0;
889  // scan subsequent decimal digits
890  if (!ScanDecimalAsSmi(&value)) {
891  return Token::ILLEGAL;
892  }
893 
894  if (next().literal_chars.one_byte_literal().length() <= 10 &&
895  value <= Smi::kMaxValue && c0_ != '.' && !IsIdentifierStart(c0_)) {
896  next().smi_value_ = static_cast<uint32_t>(value);
897 
898  if (kind == DECIMAL_WITH_LEADING_ZERO) {
899  octal_pos_ = Location(start_pos, source_pos());
900  octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
901  }
902  return Token::SMI;
903  }
904  }
905 
906  if (!ScanDecimalDigits()) return Token::ILLEGAL;
907  if (c0_ == '.') {
908  seen_period = true;
909  AddLiteralCharAdvance();
910  if (allow_harmony_numeric_separator() && c0_ == '_') {
911  return Token::ILLEGAL;
912  }
913  if (!ScanDecimalDigits()) return Token::ILLEGAL;
914  }
915  }
916  }
917 
918  bool is_bigint = false;
919  if (c0_ == 'n' && !seen_period &&
920  (kind == DECIMAL || kind == HEX || kind == OCTAL || kind == BINARY)) {
921  // Check that the literal is within our limits for BigInt length.
922  // For simplicity, use 4 bits per character to calculate the maximum
923  // allowed literal length.
924  static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4;
925  int length = source_pos() - start_pos - (kind != DECIMAL ? 2 : 0);
926  if (length > kMaxBigIntCharacters) {
927  ReportScannerError(Location(start_pos, source_pos()),
928  MessageTemplate::kBigIntTooBig);
929  return Token::ILLEGAL;
930  }
931 
932  is_bigint = true;
933  Advance();
934  } else if (c0_ == 'e' || c0_ == 'E') {
935  // scan exponent, if any
936  DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
937 
938  if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
939  return Token::ILLEGAL;
940 
941  // scan exponent
942  AddLiteralCharAdvance();
943 
944  if (!ScanSignedInteger()) return Token::ILLEGAL;
945  }
946 
947  // The source character immediately following a numeric literal must
948  // not be an identifier start or a decimal digit; see ECMA-262
949  // section 7.8.3, page 17 (note that we read only one decimal digit
950  // if the value is 0).
951  if (IsDecimalDigit(c0_) || IsIdentifierStart(c0_)) {
952  return Token::ILLEGAL;
953  }
954 
955  if (kind == DECIMAL_WITH_LEADING_ZERO) {
956  octal_pos_ = Location(start_pos, source_pos());
957  octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
958  }
959 
960  return is_bigint ? Token::BIGINT : Token::NUMBER;
961 }
962 
963 uc32 Scanner::ScanIdentifierUnicodeEscape() {
964  Advance();
965  if (c0_ != 'u') return -1;
966  Advance();
967  return ScanUnicodeEscape<false>();
968 }
969 
970 template <bool capture_raw>
971 uc32 Scanner::ScanUnicodeEscape() {
972  // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
973  // hex digits between { } is arbitrary. \ and u have already been read.
974  if (c0_ == '{') {
975  int begin = source_pos() - 2;
976  Advance<capture_raw>();
977  uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10FFFF, begin);
978  if (cp < 0 || c0_ != '}') {
979  ReportScannerError(source_pos(),
980  MessageTemplate::kInvalidUnicodeEscapeSequence);
981  return -1;
982  }
983  Advance<capture_raw>();
984  return cp;
985  }
986  const bool unicode = true;
987  return ScanHexNumber<capture_raw, unicode>(4);
988 }
989 
990 Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped,
991  bool can_be_keyword) {
992  while (true) {
993  if (c0_ == '\\') {
994  escaped = true;
995  uc32 c = ScanIdentifierUnicodeEscape();
996  // Only allow legal identifier part characters.
997  // TODO(verwaest): Make this true.
998  // DCHECK(!IsIdentifierPart('\'));
999  DCHECK(!IsIdentifierPart(-1));
1000  if (c == '\\' || !IsIdentifierPart(c)) {
1001  return Token::ILLEGAL;
1002  }
1003  can_be_keyword = can_be_keyword && CharCanBeKeyword(c);
1004  AddLiteralChar(c);
1005  } else if (IsIdentifierPart(c0_) ||
1006  (CombineSurrogatePair() && IsIdentifierPart(c0_))) {
1007  can_be_keyword = can_be_keyword && CharCanBeKeyword(c0_);
1008  AddLiteralCharAdvance();
1009  } else {
1010  break;
1011  }
1012  }
1013 
1014  if (can_be_keyword && next().literal_chars.is_one_byte()) {
1015  Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
1016  Token::Value token =
1017  KeywordOrIdentifierToken(chars.start(), chars.length());
1018  /* TODO(adamk): YIELD should be handled specially. */
1019  if (token == Token::FUTURE_STRICT_RESERVED_WORD) {
1020  if (escaped) return Token::ESCAPED_STRICT_RESERVED_WORD;
1021  return token;
1022  }
1023  if (token == Token::IDENTIFIER) return token;
1024 
1025  if (!escaped) return token;
1026 
1027  if (token == Token::LET || token == Token::STATIC) {
1028  return Token::ESCAPED_STRICT_RESERVED_WORD;
1029  }
1030  return Token::ESCAPED_KEYWORD;
1031  }
1032 
1033  return Token::IDENTIFIER;
1034 }
1035 
1036 bool Scanner::ScanRegExpPattern() {
1037  DCHECK_EQ(Token::UNINITIALIZED, next_next().token);
1038  DCHECK(next().token == Token::DIV || next().token == Token::ASSIGN_DIV);
1039 
1040  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1041  bool in_character_class = false;
1042 
1043  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1044  // the scanner should pass uninterpreted bodies to the RegExp
1045  // constructor.
1046  next().literal_chars.Start();
1047  if (next().token == Token::ASSIGN_DIV) {
1048  AddLiteralChar('=');
1049  }
1050 
1051  while (c0_ != '/' || in_character_class) {
1052  if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1053  return false;
1054  }
1055  if (c0_ == '\\') { // Escape sequence.
1056  AddLiteralCharAdvance();
1057  if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1058  return false;
1059  }
1060  AddLiteralCharAdvance();
1061  // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1062  // only "safe" characters are allowed (letters, digits, underscore),
1063  // otherwise the escape isn't valid and the invalid character has
1064  // its normal meaning. I.e., we can just continue scanning without
1065  // worrying whether the following characters are part of the escape
1066  // or not, since any '/', '\\' or '[' is guaranteed to not be part
1067  // of the escape sequence.
1068 
1069  // TODO(896): At some point, parse RegExps more thoroughly to capture
1070  // octal esacpes in strict mode.
1071  } else { // Unescaped character.
1072  if (c0_ == '[') in_character_class = true;
1073  if (c0_ == ']') in_character_class = false;
1074  AddLiteralCharAdvance();
1075  }
1076  }
1077  Advance(); // consume '/'
1078 
1079  next().token = Token::REGEXP_LITERAL;
1080  return true;
1081 }
1082 
1083 
1084 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
1085  DCHECK_EQ(Token::REGEXP_LITERAL, next().token);
1086 
1087  // Scan regular expression flags.
1088  int flags = 0;
1089  while (IsIdentifierPart(c0_)) {
1090  RegExp::Flags flag = RegExp::kNone;
1091  switch (c0_) {
1092  case 'g':
1093  flag = RegExp::kGlobal;
1094  break;
1095  case 'i':
1096  flag = RegExp::kIgnoreCase;
1097  break;
1098  case 'm':
1099  flag = RegExp::kMultiline;
1100  break;
1101  case 's':
1102  flag = RegExp::kDotAll;
1103  break;
1104  case 'u':
1105  flag = RegExp::kUnicode;
1106  break;
1107  case 'y':
1108  flag = RegExp::kSticky;
1109  break;
1110  default:
1111  return Nothing<RegExp::Flags>();
1112  }
1113  if (flags & flag) {
1114  return Nothing<RegExp::Flags>();
1115  }
1116  Advance();
1117  flags |= flag;
1118  }
1119 
1120  next().location.end_pos = source_pos();
1121  return Just(RegExp::Flags(flags));
1122 }
1123 
1124 const AstRawString* Scanner::CurrentSymbol(
1125  AstValueFactory* ast_value_factory) const {
1126  if (is_literal_one_byte()) {
1127  return ast_value_factory->GetOneByteString(literal_one_byte_string());
1128  }
1129  return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1130 }
1131 
1132 const AstRawString* Scanner::NextSymbol(
1133  AstValueFactory* ast_value_factory) const {
1134  if (is_next_literal_one_byte()) {
1135  return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1136  }
1137  return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1138 }
1139 
1140 const AstRawString* Scanner::CurrentRawSymbol(
1141  AstValueFactory* ast_value_factory) const {
1142  if (is_raw_literal_one_byte()) {
1143  return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1144  }
1145  return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1146 }
1147 
1148 
1149 double Scanner::DoubleValue() {
1150  DCHECK(is_literal_one_byte());
1151  return StringToDouble(
1152  literal_one_byte_string(),
1153  ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1154 }
1155 
1156 const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
1157  DCHECK(is_literal_one_byte());
1158  Vector<const uint8_t> vector = literal_one_byte_string();
1159  int length = vector.length();
1160  char* buffer = zone->NewArray<char>(length + 1);
1161  memcpy(buffer, vector.start(), length);
1162  buffer[length] = '\0';
1163  return buffer;
1164 }
1165 
1166 void Scanner::SeekNext(size_t position) {
1167  // Use with care: This cleanly resets most, but not all scanner state.
1168  // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1169 
1170  // To re-scan from a given character position, we need to:
1171  // 1, Reset the current_, next_ and next_next_ tokens
1172  // (next_ + next_next_ will be overwrittem by Next(),
1173  // current_ will remain unchanged, so overwrite it fully.)
1174  for (TokenDesc& token : token_storage_) {
1175  token.token = Token::UNINITIALIZED;
1176  token.invalid_template_escape_message = MessageTemplate::kNone;
1177  }
1178  // 2, reset the source to the desired position,
1179  source_->Seek(position);
1180  // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1181  c0_ = source_->Advance();
1182  next().after_line_terminator = false;
1183  Scan();
1184  DCHECK_EQ(next().location.beg_pos, static_cast<int>(position));
1185 }
1186 
1187 } // namespace internal
1188 } // namespace v8
Definition: libplatform.h:13