V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
earley-parser.cc
1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <algorithm>
6 #include <set>
7 #include <unordered_map>
8 #include <unordered_set>
9 
10 #include "src/torque/ast.h"
11 #include "src/torque/earley-parser.h"
12 #include "src/torque/utils.h"
13 
14 namespace v8 {
15 namespace internal {
16 namespace torque {
17 
18 namespace {
19 
20 void UpdateSourcePosition(InputPosition from, InputPosition to,
21  SourcePosition* pos) {
22  while (from != to) {
23  if (*from == '\n') {
24  pos->line += 1;
25  pos->column = 0;
26  } else {
27  pos->column += 1;
28  }
29  ++from;
30  }
31 }
32 
33 } // namespace
34 
35 base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
36  const LexerResult& tokens) const {
37  std::vector<ParseResult> results;
38  for (const Item* child : completed_item->Children()) {
39  if (!child) continue;
40  base::Optional<ParseResult> child_result =
41  child->left()->RunAction(child, tokens);
42  if (child_result) results.push_back(std::move(*child_result));
43  }
44  MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
45  CurrentSourcePosition::Scope pos_scope(matched_input.pos);
46  ParseResultIterator iterator(std::move(results), matched_input);
47  return action_(&iterator);
48 }
49 
50 Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
51  rules_.clear();
52  for (const Rule& rule : rules) {
53  AddRule(rule);
54  }
55  return *this;
56 }
57 
58 std::vector<const Item*> Item::Children() const {
59  std::vector<const Item*> children;
60  for (const Item* current = this; current->prev_; current = current->prev_) {
61  children.push_back(current->child_);
62  }
63  // The above loop collects the child nodes in reversed order.
64  std::reverse(children.begin(), children.end());
65  DCHECK_EQ(children.size(), right().size());
66  return children;
67 }
68 
69 std::string Item::SplitByChildren(const LexerResult& tokens) const {
70  if (right().size() == 1) {
71  if (const Item* child = Children()[0])
72  return child->SplitByChildren(tokens);
73  }
74  std::stringstream s;
75  bool first = true;
76  for (const Item* item : Children()) {
77  if (!item) continue;
78  if (!first) s << " ";
79  s << item->GetMatchedInput(tokens).ToString();
80  first = false;
81  }
82  return s.str();
83 }
84 
85 void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
86  DCHECK(*this == other);
87  if (child_ != other.child_) {
88  std::stringstream s;
89  s << "Ambiguous grammer rules for \""
90  << child_->GetMatchedInput(tokens).ToString() << "\":\n "
91  << child_->SplitByChildren(tokens) << "\nvs\n "
92  << other.child_->SplitByChildren(tokens);
93  ReportError(s.str());
94  }
95  if (prev_ != other.prev_) {
96  std::stringstream s;
97  s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString()
98  << "\":\n " << SplitByChildren(tokens) << " ...\nvs\n "
99  << other.SplitByChildren(tokens) << " ...";
100  ReportError(s.str());
101  }
102 }
103 
104 LexerResult Lexer::RunLexer(const std::string& input) {
105  LexerResult result;
106  InputPosition const begin = input.c_str();
107  InputPosition const end = begin + input.size();
108  InputPosition pos = begin;
109  InputPosition token_start = pos;
110  CurrentSourcePosition::Scope scope(
111  SourcePosition{CurrentSourceFile::Get(), 0, 0});
112  match_whitespace_(&pos);
113  while (pos != end) {
114  UpdateSourcePosition(token_start, pos, &CurrentSourcePosition::Get());
115  token_start = pos;
116  Symbol* symbol = MatchToken(&pos, end);
117  if (!symbol) {
118  ReportError("Lexer Error: unknown token " +
119  StringLiteralQuote(std::string(
120  token_start, token_start + std::min<ptrdiff_t>(
121  end - token_start, 10))));
122  }
123  result.token_symbols.push_back(symbol);
124  result.token_contents.push_back(
125  {token_start, pos, CurrentSourcePosition::Get()});
126  match_whitespace_(&pos);
127  }
128  UpdateSourcePosition(token_start, pos, &CurrentSourcePosition::Get());
129  // Add an additional token position to simplify corner cases.
130  result.token_contents.push_back({pos, pos, CurrentSourcePosition::Get()});
131  return result;
132 }
133 
134 Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
135  InputPosition token_start = *pos;
136  Symbol* symbol = nullptr;
137  // Find longest matching pattern.
138  for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
139  InputPosition token_end = token_start;
140  PatternFunction matchPattern = pair.first;
141  if (matchPattern(&token_end) && token_end > *pos) {
142  *pos = token_end;
143  symbol = &pair.second;
144  }
145  }
146  // Check if matched pattern coincides with a keyword. Prefer the keyword in
147  // this case.
148  if (*pos != token_start) {
149  auto found_keyword = keywords_.find(std::string(token_start, *pos));
150  if (found_keyword != keywords_.end()) {
151  return &found_keyword->second;
152  }
153  return symbol;
154  }
155  // Now check for a keyword (that doesn't overlap with a pattern).
156  // Iterate from the end to ensure that if one keyword is a prefix of another,
157  // we first try to match the longer one.
158  for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
159  const std::string& keyword = it->first;
160  if (static_cast<size_t>(end - *pos) < keyword.size()) continue;
161  if (keyword == std::string(*pos, *pos + keyword.size())) {
162  *pos += keyword.size();
163  return &it->second;
164  }
165  }
166  return nullptr;
167 }
168 
169 // This is an implementation of Earley's parsing algorithm
170 // (https://en.wikipedia.org/wiki/Earley_parser).
171 const Item* RunEarleyAlgorithm(
172  Symbol* start, const LexerResult& tokens,
173  std::unordered_set<Item, base::hash<Item>>* processed) {
174  // Worklist for items at the current position.
175  std::vector<Item> worklist;
176  // Worklist for items at the next position.
177  std::vector<Item> future_items;
178  CurrentSourcePosition::Scope source_position(
179  SourcePosition{CurrentSourceFile::Get(), 0, 0});
180  std::vector<const Item*> completed_items;
181  std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
182  base::hash<std::pair<size_t, Symbol*>>>
183  waiting;
184 
185  std::vector<const Item*> debug_trace;
186 
187  // Start with one top_level symbol mapping to the start symbol of the grammar.
188  // This simplifies things because the start symbol might have several
189  // rules.
190  Symbol top_level;
191  top_level.AddRule(Rule({start}));
192  worklist.push_back(Item{top_level.rule(0), 0, 0, 0});
193 
194  size_t input_length = tokens.token_symbols.size();
195 
196  for (size_t pos = 0; pos <= input_length; ++pos) {
197  while (!worklist.empty()) {
198  auto insert_result = processed->insert(worklist.back());
199  const Item& item = *insert_result.first;
200  DCHECK_EQ(pos, item.pos());
201  MatchedInput last_token = tokens.token_contents[pos];
202  CurrentSourcePosition::Get() = last_token.pos;
203  bool is_new = insert_result.second;
204  if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
205  worklist.pop_back();
206  if (!is_new) continue;
207 
208  debug_trace.push_back(&item);
209  if (item.IsComplete()) {
210  // 'Complete' phase: Advance all items that were waiting to match this
211  // symbol next.
212  for (const Item* parent : waiting[{item.start(), item.left()}]) {
213  worklist.push_back(parent->Advance(pos, &item));
214  }
215  } else {
216  Symbol* next = item.NextSymbol();
217  // 'Scan' phase: Check if {next} is the next symbol in the input (this
218  // is never the case if {next} is a non-terminal).
219  if (pos < tokens.token_symbols.size() &&
220  tokens.token_symbols[pos] == next) {
221  future_items.push_back(item.Advance(pos + 1, nullptr));
222  }
223  // 'Predict' phase: Add items for every rule of the non-terminal.
224  if (!next->IsTerminal()) {
225  // Remember that this item is waiting for completion with {next}.
226  waiting[{pos, next}].insert(&item);
227  }
228  for (size_t i = 0; i < next->rule_number(); ++i) {
229  Rule* rule = next->rule(i);
230  auto already_completed =
231  processed->find(Item{rule, rule->right().size(), pos, pos});
232  // As discussed in section 3 of
233  // Aycock, John, and R. Nigel Horspool. "Practical earley
234  // parsing." The Computer Journal 45.6 (2002): 620-630.
235  // Earley parsing has the following problem with epsilon rules:
236  // When we complete an item that started at the current position
237  // (that is, it matched zero tokens), we might not yet have
238  // predicted all items it can complete with. Thus we check for the
239  // existence of such items here and complete them immediately.
240  if (already_completed != processed->end()) {
241  worklist.push_back(item.Advance(pos, &*already_completed));
242  } else {
243  worklist.push_back(Item{rule, 0, pos, pos});
244  }
245  }
246  }
247  }
248  std::swap(worklist, future_items);
249  }
250 
251  auto final_item =
252  processed->find(Item{top_level.rule(0), 1, 0, input_length});
253  if (final_item != processed->end()) {
254  // Success: The {top_level} rule matches the complete input.
255  return final_item->Children()[0];
256  }
257  std::string reason;
258  const Item& last_item = *debug_trace.back();
259  if (last_item.pos() < tokens.token_symbols.size()) {
260  std::string next_token = tokens.token_contents[last_item.pos()].ToString();
261  reason = "unexpected token \"" + next_token + "\"";
262  } else {
263  reason = "unexpected end of input";
264  }
265  ReportError("Parser Error: " + reason);
266 }
267 
268 // static
269 bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
270  if (**pos && char_class(static_cast<unsigned char>(**pos))) {
271  ++*pos;
272  return true;
273  }
274  return false;
275 }
276 
277 // static
278 bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
279  if (**pos && char_class(**pos)) {
280  ++*pos;
281  return true;
282  }
283  return false;
284 }
285 
286 // static
287 bool Grammar::MatchString(const char* s, InputPosition* pos) {
288  InputPosition current = *pos;
289  for (; *s != 0; ++s, ++current) {
290  if (*s != *current) return false;
291  }
292  *pos = current;
293  return true;
294 }
295 
296 // static
297 bool Grammar::MatchAnyChar(InputPosition* pos) {
298  return MatchChar([](char c) { return true; }, pos);
299 }
300 
301 } // namespace torque
302 } // namespace internal
303 } // namespace v8
Definition: libplatform.h:13