1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_TORQUE_EARLEY_PARSER_H_
6 #define V8_TORQUE_EARLEY_PARSER_H_
7 
8 #include <map>
9 #include <vector>
10 
11 #include "src/base/optional.h"
12 #include "src/torque/contextual.h"
13 #include "src/torque/source-positions.h"
14 #include "src/torque/utils.h"
15 
16 namespace v8 {
17 namespace internal {
18 namespace torque {
19 
20 class Symbol;
21 class Item;
22 
23 class ParseResultHolderBase {
24  public:
25   enum class TypeId;
26   virtual ~ParseResultHolderBase() = default;
27   template <class T>
28   T& Cast();
29   template <class T>
30   const T& Cast() const;
31 
32  protected:
ParseResultHolderBase(TypeId type_id)33   explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
34     // MSVC wrongly complains about type_id_ being an unused private field.
35     USE(type_id_);
36   }
37 
38  private:
39   const TypeId type_id_;
40 };
41 
42 using ParseResultTypeId = ParseResultHolderBase::TypeId;
43 
44 template <class T>
45 class ParseResultHolder : public ParseResultHolderBase {
46  public:
ParseResultHolder(T value)47   explicit ParseResultHolder(T value)
48       : ParseResultHolderBase(id), value_(std::move(value)) {}
49 
50  private:
51   V8_EXPORT_PRIVATE static const TypeId id;
52   friend class ParseResultHolderBase;
53   T value_;
54 };
55 
56 template <class T>
Cast()57 T& ParseResultHolderBase::Cast() {
58   CHECK_EQ(ParseResultHolder<T>::id, type_id_);
59   return static_cast<ParseResultHolder<T>*>(this)->value_;
60 }
61 
62 template <class T>
Cast()63 const T& ParseResultHolderBase::Cast() const {
64   CHECK_EQ(ParseResultHolder<T>::id, type_id_);
65   return static_cast<const ParseResultHolder<T>*>(this)->value_;
66 }
67 
68 class ParseResult {
69  public:
70   template <class T>
ParseResult(T x)71   explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
72 
73   template <class T>
Cast()74   const T& Cast() const {
75     return value_->Cast<T>();
76   }
77   template <class T>
Cast()78   T& Cast() {
79     return value_->Cast<T>();
80   }
81 
82  private:
83   std::unique_ptr<ParseResultHolderBase> value_;
84 };
85 
86 using InputPosition = const char*;
87 
88 struct MatchedInput {
MatchedInputMatchedInput89   MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
90       : begin(begin), end(end), pos(pos) {}
91   InputPosition begin;
92   InputPosition end;
93   SourcePosition pos;
ToStringMatchedInput94   std::string ToString() const { return {begin, end}; }
95 };
96 
97 class ParseResultIterator {
98  public:
ParseResultIterator(std::vector<ParseResult> results,MatchedInput matched_input)99   explicit ParseResultIterator(std::vector<ParseResult> results,
100                                MatchedInput matched_input)
101       : results_(std::move(results)), matched_input_(matched_input) {}
~ParseResultIterator()102   ~ParseResultIterator() {
103     // Check that all parse results have been used.
104     CHECK_EQ(results_.size(), i_);
105   }
106 
Next()107   ParseResult Next() {
108     CHECK_LT(i_, results_.size());
109     return std::move(results_[i_++]);
110   }
111   template <class T>
NextAs()112   T NextAs() {
113     return std::move(Next().Cast<T>());
114   }
HasNext()115   bool HasNext() const { return i_ < results_.size(); }
116 
matched_input()117   const MatchedInput& matched_input() const { return matched_input_; }
118 
119  private:
120   std::vector<ParseResult> results_;
121   size_t i_ = 0;
122   MatchedInput matched_input_;
123 
124   DISALLOW_COPY_AND_MOVE_AND_ASSIGN(ParseResultIterator);
125 };
126 
127 struct LexerResult {
128   std::vector<Symbol*> token_symbols;
129   std::vector<MatchedInput> token_contents;
130 };
131 
132 using Action =
133     base::Optional<ParseResult> (*)(ParseResultIterator* child_results);
134 
DefaultAction(ParseResultIterator * child_results)135 inline base::Optional<ParseResult> DefaultAction(
136     ParseResultIterator* child_results) {
137   if (!child_results->HasNext()) return base::nullopt;
138   return child_results->Next();
139 }
140 
141 // A rule of the context-free grammar. Each rule can have an action attached to
142 // it, which is executed after the parsing is finished.
143 class Rule final {
144  public:
145   explicit Rule(std::vector<Symbol*> right_hand_side,
146                 Action action = DefaultAction)
right_hand_side_(std::move (right_hand_side))147       : right_hand_side_(std::move(right_hand_side)), action_(action) {}
148 
left()149   Symbol* left() const {
150     DCHECK_NOT_NULL(left_hand_side_);
151     return left_hand_side_;
152   }
right()153   const std::vector<Symbol*>& right() const { return right_hand_side_; }
154 
SetLeftHandSide(Symbol * left_hand_side)155   void SetLeftHandSide(Symbol* left_hand_side) {
156     DCHECK_NULL(left_hand_side_);
157     left_hand_side_ = left_hand_side;
158   }
159 
160   V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
161       const Item* completed_item, const LexerResult& tokens) const;
162 
163  private:
164   Symbol* left_hand_side_ = nullptr;
165   std::vector<Symbol*> right_hand_side_;
166   Action action_;
167 };
168 
169 // A Symbol represents a terminal or a non-terminal of the grammar.
170 // It stores the list of rules, which have this symbol as the
171 // left-hand side.
172 // Terminals have an empty list of rules, they are created by the Lexer
173 // instead of from rules.
174 // Symbols need to reside at stable memory addresses, because the addresses are
175 // used in the parser.
176 class Symbol {
177  public:
Symbol()178   Symbol() : Symbol({}) {}
Symbol(std::initializer_list<Rule> rules)179   Symbol(std::initializer_list<Rule> rules) { *this = rules; }
180 
181   V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
182 
IsTerminal()183   bool IsTerminal() const { return rules_.empty(); }
rule(size_t index)184   Rule* rule(size_t index) const { return rules_[index].get(); }
rule_number()185   size_t rule_number() const { return rules_.size(); }
186 
AddRule(const Rule & rule)187   void AddRule(const Rule& rule) {
188     rules_.push_back(base::make_unique<Rule>(rule));
189     rules_.back()->SetLeftHandSide(this);
190   }
191 
192   V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
193       const Item* item, const LexerResult& tokens);
194 
195  private:
196   std::vector<std::unique_ptr<Rule>> rules_;
197 
198   // Disallow copying and moving to ensure Symbol has a stable address.
199   DISALLOW_COPY_AND_MOVE_AND_ASSIGN(Symbol);
200 };
201 
202 // Items are the core datastructure of Earley's algorithm.
203 // They consist of a (partially) matched rule, a marked position inside of the
204 // right-hand side of the rule (traditionally written as a dot) and an input
205 // range from {start} to {pos} that matches the symbols of the right-hand side
206 // that are left of the mark. In addition, they store a child and a left-sibling
207 // pointer to reconstruct the AST in the end.
208 class Item {
209  public:
Item(const Rule * rule,size_t mark,size_t start,size_t pos)210   Item(const Rule* rule, size_t mark, size_t start, size_t pos)
211       : rule_(rule), mark_(mark), start_(start), pos_(pos) {
212     DCHECK_LE(mark_, right().size());
213   }
214 
215   // A complete item has the mark at the right end, which means the input range
216   // matches the complete rule.
IsComplete()217   bool IsComplete() const {
218     DCHECK_LE(mark_, right().size());
219     return mark_ == right().size();
220   }
221 
222   // The symbol right after the mark is expected at {pos} for this item to
223   // advance.
NextSymbol()224   Symbol* NextSymbol() const {
225     DCHECK(!IsComplete());
226     DCHECK_LT(mark_, right().size());
227     return right()[mark_];
228   }
229 
230   // We successfully parsed NextSymbol() between {pos} and {new_pos}.
231   // If NextSymbol() was a non-terminal, then {child} is a pointer to a
232   // completed item for this parse.
233   // We create a new item, which moves the mark one forward.
234   Item Advance(size_t new_pos, const Item* child = nullptr) const {
235     if (child) {
236       DCHECK(child->IsComplete());
237       DCHECK_EQ(pos(), child->start());
238       DCHECK_EQ(new_pos, child->pos());
239       DCHECK_EQ(NextSymbol(), child->left());
240     }
241     Item result(rule_, mark_ + 1, start_, new_pos);
242     result.prev_ = this;
243     result.child_ = child;
244     return result;
245   }
246 
247   // Collect the items representing the AST children of this completed item.
248   std::vector<const Item*> Children() const;
249   // The matched input separated according to the next branching AST level.
250   std::string SplitByChildren(const LexerResult& tokens) const;
251   // Check if {other} results in the same AST as this Item.
252   void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
253 
GetMatchedInput(const LexerResult & tokens)254   MatchedInput GetMatchedInput(const LexerResult& tokens) const {
255     return {tokens.token_contents[start_].begin,
256             start_ == pos_ ? tokens.token_contents[start_].begin
257                            : tokens.token_contents[pos_ - 1].end,
258             tokens.token_contents[start_].pos};
259   }
260 
261   // We exclude {prev_} and {child_} from equality and hash computations,
262   // because they are just globally unique data associated with an item.
263   bool operator==(const Item& other) const {
264     return rule_ == other.rule_ && mark_ == other.mark_ &&
265            start_ == other.start_ && pos_ == other.pos_;
266   }
267 
hash_value(const Item & i)268   friend size_t hash_value(const Item& i) {
269     return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
270   }
271 
rule()272   const Rule* rule() const { return rule_; }
left()273   Symbol* left() const { return rule_->left(); }
right()274   const std::vector<Symbol*>& right() const { return rule_->right(); }
pos()275   size_t pos() const { return pos_; }
start()276   size_t start() const { return start_; }
277 
278  private:
279   const Rule* rule_;
280   size_t mark_;
281   size_t start_;
282   size_t pos_;
283 
284   const Item* prev_ = nullptr;
285   const Item* child_ = nullptr;
286 };
287 
RunAction(const Item * item,const LexerResult & tokens)288 inline base::Optional<ParseResult> Symbol::RunAction(
289     const Item* item, const LexerResult& tokens) {
290   DCHECK(item->IsComplete());
291   DCHECK_EQ(item->left(), this);
292   return item->rule()->RunAction(item, tokens);
293 }
294 
295 V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
296     Symbol* start, const LexerResult& tokens,
297     std::unordered_set<Item, base::hash<Item>>* processed);
298 
ParseTokens(Symbol * start,const LexerResult & tokens)299 inline base::Optional<ParseResult> ParseTokens(Symbol* start,
300                                                const LexerResult& tokens) {
301   std::unordered_set<Item, base::hash<Item>> table;
302   const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
303   return start->RunAction(final_item, tokens);
304 }
305 
306 // The lexical syntax is dynamically defined while building the grammar by
307 // adding patterns and keywords to the Lexer.
308 // The term keyword here can stand for any fixed character sequence, including
309 // operators and parentheses.
310 // Each pattern or keyword automatically gets a terminal symbol associated with
311 // it. These symbols form the result of the lexing.
312 // Patterns and keywords are matched using the longest match principle. If the
313 // longest matching pattern coincides with a keyword, the keyword symbol is
314 // chosen instead of the pattern.
315 // In addition, there is a single whitespace pattern which is consumed but does
316 // not become part of the token list.
317 class Lexer {
318  public:
319   // Functions to define patterns. They try to match starting from {pos}. If
320   // successful, they return true and advance {pos}. Otherwise, {pos} stays
321   // unchanged.
322   using PatternFunction = bool (*)(InputPosition* pos);
323 
SetWhitespace(PatternFunction whitespace)324   void SetWhitespace(PatternFunction whitespace) {
325     match_whitespace_ = whitespace;
326   }
327 
Pattern(PatternFunction pattern)328   Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
Token(const std::string & keyword)329   Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
330   V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
331 
332  private:
333   PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
334   std::map<PatternFunction, Symbol> patterns_;
335   std::map<std::string, Symbol> keywords_;
336   Symbol* MatchToken(InputPosition* pos, InputPosition end);
337 };
338 
339 // A grammar can have a result, which is the results of the start symbol.
340 // Grammar is intended to be subclassed, with Symbol members forming the
341 // mutually recursive rules of the grammar.
342 class Grammar {
343  public:
344   using PatternFunction = Lexer::PatternFunction;
345 
Grammar(Symbol * start)346   explicit Grammar(Symbol* start) : start_(start) {}
347 
Parse(const std::string & input)348   base::Optional<ParseResult> Parse(const std::string& input) {
349     LexerResult tokens = lexer().RunLexer(input);
350     return ParseTokens(start_, tokens);
351   }
352 
353  protected:
Token(const std::string & s)354   Symbol* Token(const std::string& s) { return lexer_.Token(s); }
Pattern(PatternFunction pattern)355   Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
SetWhitespace(PatternFunction ws)356   void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }
357 
358   // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
359   // This is necessary to define helpers that create new symbols.
360   Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
361     Symbol* result = new Symbol(rules);
362     generated_symbols_.push_back(std::unique_ptr<Symbol>(result));
363     return result;
364   }
365 
366   // Helper functions to define lexer patterns. If they match, they return true
367   // and advance {pos}. Otherwise, {pos} is unchanged.
368   V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
369                                           InputPosition* pos);
370   V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
371                                           InputPosition* pos);
372   V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
373   V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
374 
375   // The action MatchInput() produces the input matched by the rule as
376   // result.
YieldMatchedInput(ParseResultIterator * child_results)377   static base::Optional<ParseResult> YieldMatchedInput(
378       ParseResultIterator* child_results) {
379     return ParseResult{child_results->matched_input().ToString()};
380   }
381 
382   // Create a new symbol to parse the given sequence of symbols.
383   // At most one of the symbols can return a result.
Sequence(std::vector<Symbol * > symbols)384   Symbol* Sequence(std::vector<Symbol*> symbols) {
385     return NewSymbol({Rule(std::move(symbols))});
386   }
387 
388   template <class T, T value>
YieldIntegralConstant(ParseResultIterator * child_results)389   static base::Optional<ParseResult> YieldIntegralConstant(
390       ParseResultIterator* child_results) {
391     return ParseResult{value};
392   }
393 
394   template <class T>
YieldDefaultValue(ParseResultIterator * child_results)395   static base::Optional<ParseResult> YieldDefaultValue(
396       ParseResultIterator* child_results) {
397     return ParseResult{T{}};
398   }
399 
400   template <class From, class To>
CastParseResult(ParseResultIterator * child_results)401   static base::Optional<ParseResult> CastParseResult(
402       ParseResultIterator* child_results) {
403     To result = std::move(child_results->NextAs<From>());
404     return ParseResult{std::move(result)};
405   }
406 
407   // Try to parse {s} and return the result of type {Result} casted to {T}.
408   // Otherwise, the result is a default-constructed {T}.
409   template <class T, class Result = T>
TryOrDefault(Symbol * s)410   Symbol* TryOrDefault(Symbol* s) {
411     return NewSymbol({Rule({s}, CastParseResult<Result, T>),
412                       Rule({}, YieldDefaultValue<T>)});
413   }
414 
415   template <class T>
MakeSingletonVector(ParseResultIterator * child_results)416   static base::Optional<ParseResult> MakeSingletonVector(
417       ParseResultIterator* child_results) {
418     T x = child_results->NextAs<T>();
419     std::vector<T> result;
420     result.push_back(std::move(x));
421     return ParseResult{std::move(result)};
422   }
423 
424   template <class T>
MakeExtendedVector(ParseResultIterator * child_results)425   static base::Optional<ParseResult> MakeExtendedVector(
426       ParseResultIterator* child_results) {
427     std::vector<T> l = child_results->NextAs<std::vector<T>>();
428     T x = child_results->NextAs<T>();
429     l.push_back(std::move(x));
430     return ParseResult{std::move(l)};
431   }
432 
433   // For example, NonemptyList(Token("A"), Token(",")) parses any of
434   // A or A,A or A,A,A and so on.
435   template <class T>
436   Symbol* NonemptyList(Symbol* element,
437                        base::Optional<Symbol*> separator = {}) {
438     Symbol* list = NewSymbol();
439     *list = {Rule({element}, MakeSingletonVector<T>),
440              separator
441                  ? Rule({list, *separator, element}, MakeExtendedVector<T>)
442                  : Rule({list, element}, MakeExtendedVector<T>)};
443     return list;
444   }
445 
446   template <class T>
447   Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
448     return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
449   }
450 
451   template <class T>
Optional(Symbol * x)452   Symbol* Optional(Symbol* x) {
453     return TryOrDefault<base::Optional<T>, T>(x);
454   }
455 
CheckIf(Symbol * x)456   Symbol* CheckIf(Symbol* x) {
457     return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
458                       Rule({}, YieldIntegralConstant<bool, false>)});
459   }
460 
lexer()461   Lexer& lexer() { return lexer_; }
462 
463  private:
464   Lexer lexer_;
465   std::vector<std::unique_ptr<Symbol>> generated_symbols_;
466   Symbol* start_;
467 };
468 
469 }  // namespace torque
470 }  // namespace internal
471 }  // namespace v8
472 
473 #endif  // V8_TORQUE_EARLEY_PARSER_H_
474