1 /*
2  * Copyright 2017 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "DFA.h"
9 #include "DFAState.h"
10 #include "NFA.h"
11 #include "NFAState.h"
12 
13 #include <algorithm>
14 #include <climits>
15 #include <memory>
16 #include <unordered_map>
17 #include <set>
18 #include <vector>
19 
20 /**
21  * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and
22  * DFAs differ only in that an NFA allows multiple states at the same time, we can find each
23  * possible combination of simultaneous NFA states and give this combination a label. These labelled
24  * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time.
25  *
26  * As an NFA can end up in multiple accept states at the same time (for instance, the token "while"
27  * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex
28  * (in terms of the order in which they were added to the NFA).
29  */
30 class NFAtoDFA {
31 public:
32     static constexpr char START_CHAR = 9;
33     static constexpr char END_CHAR = 126;
34 
NFAtoDFA(NFA * nfa)35     NFAtoDFA(NFA* nfa)
36     : fNFA(*nfa) {}
37 
38     /**
39      * Returns a DFA created from the NFA.
40      */
convert()41     DFA convert() {
42         // create state 0, the "reject" state
43         getState(DFAState::Label({}));
44         // create a state representing being in all of the NFA's start states at once
45         std::vector<int> startStates = fNFA.fStartStates;
46         std::sort(startStates.begin(), startStates.end());
47         // this becomes state 1, our start state
48         DFAState* start = getState(DFAState::Label(startStates));
49         this->scanState(start);
50 
51         this->computeMappings();
52 
53         int stateCount = 0;
54         for (const auto& row : fTransitions) {
55             stateCount = std::max(stateCount, (int) row.size());
56         }
57         return DFA(fCharMappings, fTransitions, fAccepts);
58     }
59 
60 private:
61     /**
62      * Returns an existing state with the given label, or creates a new one and returns it.
63      */
getState(DFAState::Label label)64     DFAState* getState(DFAState::Label label) {
65         auto found = fStates.find(label);
66         if (found == fStates.end()) {
67             int id = fStates.size();
68             fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label));
69             return fStates[label].get();
70         }
71         return found->second.get();
72     }
73 
add(int nfaState,std::vector<int> * states)74     void add(int nfaState, std::vector<int>* states) {
75         NFAState state = fNFA.fStates[nfaState];
76         if (state.fKind == NFAState::kRemapped_Kind) {
77             for (int next : state.fData) {
78                 this->add(next, states);
79             }
80         } else {
81             for (int state : *states) {
82                 if (nfaState == state) {
83                     return;
84                 }
85             }
86             states->push_back(nfaState);
87         }
88     }
89 
addTransition(char c,int start,int next)90     void addTransition(char c, int start, int next) {
91         while (fTransitions.size() <= (size_t) c) {
92             fTransitions.push_back(std::vector<int>());
93         }
94         std::vector<int>& row = fTransitions[c];
95         while (row.size() <= (size_t) start) {
96             row.push_back(INVALID);
97         }
98         row[start] = next;
99     }
100 
scanState(DFAState * state)101     void scanState(DFAState* state) {
102         state->fIsScanned = true;
103         for (char c = START_CHAR; c <= END_CHAR; ++c) {
104             std::vector<int> next;
105             int bestAccept = INT_MAX;
106             for (int idx : state->fLabel.fStates) {
107                 const NFAState& nfaState = fNFA.fStates[idx];
108                 if (nfaState.accept(c)) {
109                     for (int nextState : nfaState.fNext) {
110                         if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) {
111                             bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]);
112                         }
113                         this->add(nextState, &next);
114                     }
115                 }
116             }
117             std::sort(next.begin(), next.end());
118             DFAState* nextState = this->getState(DFAState::Label(next));
119             this->addTransition(c, state->fId, nextState->fId);
120             if (bestAccept != INT_MAX) {
121                 while (fAccepts.size() <= (size_t) nextState->fId) {
122                     fAccepts.push_back(INVALID);
123                 }
124                 fAccepts[nextState->fId] = bestAccept;
125             }
126             if (!nextState->fIsScanned) {
127                 this->scanState(nextState);
128             }
129         }
130     }
131 
132     // collapse rows with the same transitions to a single row. This is common, as each row
133     // represents a character and often there are many characters for which all transitions are
134     // identical (e.g. [0-9] are treated the same way by all lexer rules)
computeMappings()135     void computeMappings() {
136         // mappings[<input row>] = <output row>
137         std::vector<std::vector<int>*> uniques;
138         // this could be done more efficiently, but O(n^2) is plenty fast for our purposes
139         for (size_t i = 0; i < fTransitions.size(); ++i) {
140             int found = -1;
141             for (size_t j = 0; j < uniques.size(); ++j) {
142                 if (*uniques[j] == fTransitions[i]) {
143                     found = j;
144                     break;
145                 }
146             }
147             if (found == -1) {
148                 found = (int) uniques.size();
149                 uniques.push_back(&fTransitions[i]);
150             }
151             fCharMappings.push_back(found);
152         }
153         std::vector<std::vector<int>> newTransitions;
154         for (std::vector<int>* row : uniques) {
155             newTransitions.push_back(*row);
156         }
157         fTransitions = newTransitions;
158     }
159 
160     const NFA& fNFA;
161     std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates;
162     std::vector<std::vector<int>> fTransitions;
163     std::vector<int> fCharMappings;
164     std::vector<int> fAccepts;
165 };
166