1 /*
2  * Copyright 2017 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 #ifndef NFAtoDFA_DEFINED
8 #define NFAtoDFA_DEFINED
9 
10 #include "src/sksl/lex/DFA.h"
11 #include "src/sksl/lex/DFAState.h"
12 #include "src/sksl/lex/NFA.h"
13 #include "src/sksl/lex/NFAState.h"
14 
15 #include <algorithm>
16 #include <climits>
17 #include <memory>
18 #include <unordered_map>
19 #include <set>
20 #include <vector>
21 
22 /**
23  * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and
24  * DFAs differ only in that an NFA allows multiple states at the same time, we can find each
25  * possible combination of simultaneous NFA states and give this combination a label. These labelled
26  * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time.
27  *
28  * As an NFA can end up in multiple accept states at the same time (for instance, the token "while"
29  * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex
30  * (in terms of the order in which they were added to the NFA).
31  */
32 class NFAtoDFA {
33 public:
34     static constexpr char START_CHAR = 9;
35     static constexpr char END_CHAR = 126;
36 
NFAtoDFA(NFA * nfa)37     NFAtoDFA(NFA* nfa)
38     : fNFA(*nfa) {}
39 
40     /**
41      * Returns a DFA created from the NFA.
42      */
convert()43     DFA convert() {
44         // create state 0, the "reject" state
45         getState(DFAState::Label({}));
46         // create a state representing being in all of the NFA's start states at once
47         std::vector<int> startStates = fNFA.fStartStates;
48         std::sort(startStates.begin(), startStates.end());
49         // this becomes state 1, our start state
50         DFAState* start = getState(DFAState::Label(startStates));
51         this->scanState(start);
52 
53         this->computeMappings();
54 
55         int stateCount = 0;
56         for (const auto& row : fTransitions) {
57             stateCount = std::max(stateCount, (int) row.size());
58         }
59         return DFA(fCharMappings, fTransitions, fAccepts);
60     }
61 
62 private:
63     /**
64      * Returns an existing state with the given label, or creates a new one and returns it.
65      */
getState(DFAState::Label label)66     DFAState* getState(DFAState::Label label) {
67         auto found = fStates.find(label);
68         if (found == fStates.end()) {
69             int id = fStates.size();
70             fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label));
71             return fStates[label].get();
72         }
73         return found->second.get();
74     }
75 
add(int nfaState,std::vector<int> * states)76     void add(int nfaState, std::vector<int>* states) {
77         NFAState state = fNFA.fStates[nfaState];
78         if (state.fKind == NFAState::kRemapped_Kind) {
79             for (int next : state.fData) {
80                 this->add(next, states);
81             }
82         } else {
83             for (int state : *states) {
84                 if (nfaState == state) {
85                     return;
86                 }
87             }
88             states->push_back(nfaState);
89         }
90     }
91 
addTransition(char c,int start,int next)92     void addTransition(char c, int start, int next) {
93         while (fTransitions.size() <= (size_t) c) {
94             fTransitions.push_back(std::vector<int>());
95         }
96         std::vector<int>& row = fTransitions[c];
97         while (row.size() <= (size_t) start) {
98             row.push_back(INVALID);
99         }
100         row[start] = next;
101     }
102 
scanState(DFAState * state)103     void scanState(DFAState* state) {
104         state->fIsScanned = true;
105         for (char c = START_CHAR; c <= END_CHAR; ++c) {
106             std::vector<int> next;
107             int bestAccept = INT_MAX;
108             for (int idx : state->fLabel.fStates) {
109                 const NFAState& nfaState = fNFA.fStates[idx];
110                 if (nfaState.accept(c)) {
111                     for (int nextState : nfaState.fNext) {
112                         if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) {
113                             bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]);
114                         }
115                         this->add(nextState, &next);
116                     }
117                 }
118             }
119             std::sort(next.begin(), next.end());
120             DFAState* nextState = this->getState(DFAState::Label(next));
121             this->addTransition(c, state->fId, nextState->fId);
122             if (bestAccept != INT_MAX) {
123                 while (fAccepts.size() <= (size_t) nextState->fId) {
124                     fAccepts.push_back(INVALID);
125                 }
126                 fAccepts[nextState->fId] = bestAccept;
127             }
128             if (!nextState->fIsScanned) {
129                 this->scanState(nextState);
130             }
131         }
132     }
133 
134     // collapse rows with the same transitions to a single row. This is common, as each row
135     // represents a character and often there are many characters for which all transitions are
136     // identical (e.g. [0-9] are treated the same way by all lexer rules)
computeMappings()137     void computeMappings() {
138         // mappings[<input row>] = <output row>
139         std::vector<std::vector<int>*> uniques;
140         // this could be done more efficiently, but O(n^2) is plenty fast for our purposes
141         for (size_t i = 0; i < fTransitions.size(); ++i) {
142             int found = -1;
143             for (size_t j = 0; j < uniques.size(); ++j) {
144                 if (*uniques[j] == fTransitions[i]) {
145                     found = j;
146                     break;
147                 }
148             }
149             if (found == -1) {
150                 found = (int) uniques.size();
151                 uniques.push_back(&fTransitions[i]);
152             }
153             fCharMappings.push_back(found);
154         }
155         std::vector<std::vector<int>> newTransitions;
156         for (std::vector<int>* row : uniques) {
157             newTransitions.push_back(*row);
158         }
159         fTransitions = newTransitions;
160     }
161 
162     const NFA& fNFA;
163     std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates;
164     std::vector<std::vector<int>> fTransitions;
165     std::vector<int> fCharMappings;
166     std::vector<int> fAccepts;
167 };
168 #endif  // NFAtoDFA_DEFINED
169