1 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
2 // The remapped transition table is justified at
3 // https://docs.google.com/spreadsheets/d/1AZcQwuEL93HmNCljJWUwFMGqf7JAQ0puawZaUgP0E14
4 
5 #include <stdint.h>
6 
7 #ifndef __UTF8_DFA_DECODER_H
8 #define __UTF8_DFA_DECODER_H
9 
10 namespace Utf8DfaDecoder {
11 
12 enum State : uint8_t {
13   kReject = 0,
14   kAccept = 12,
15   kTwoByte = 24,
16   kThreeByte = 36,
17   kThreeByteLowMid = 48,
18   kFourByte = 60,
19   kFourByteLow = 72,
20   kThreeByteHigh = 84,
21   kFourByteMidHigh = 96,
22 };
23 
Decode(uint8_t byte,State * state,uint32_t * buffer)24 static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
25   // This first table maps bytes to character to a transition.
26   static constexpr uint8_t transitions[] = {
27       0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 00-0F
28       0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 10-1F
29       0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 20-2F
30       0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 30-3F
31       0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 40-4F
32       0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 50-5F
33       0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 60-6F
34       0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 70-7F
35       1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80-8F
36       2,  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 90-9F
37       3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // A0-AF
38       3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // B0-BF
39       9,  9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // C0-CF
40       4,  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // D0-DF
41       10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5,  // E0-EF
42       11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,  // F0-FF
43   };
44 
45   // This second table maps a state to a new state when adding a transition.
46   //  00-7F
47   //  |   80-8F
48   //  |   |   90-9F
49   //  |   |   |   A0-BF
50   //  |   |   |   |   C2-DF
51   //  |   |   |   |   |   E1-EC, EE, EF
52   //  |   |   |   |   |   |   ED
53   //  |   |   |   |   |   |   |   F1-F3
54   //  |   |   |   |   |   |   |   |   F4
55   //  |   |   |   |   |   |   |   |   |   C0, C1, F5-FF
56   //  |   |   |   |   |   |   |   |   |   |  E0
57   //  |   |   |   |   |   |   |   |   |   |  |   F0
58   static constexpr uint8_t states[] = {
59       0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,   // REJECT = 0
60       12, 0,  0,  0,  24, 36, 48, 60, 72, 0, 84, 96,  // ACCEPT = 12
61       0,  12, 12, 12, 0,  0,  0,  0,  0,  0, 0,  0,   // 2-byte = 24
62       0,  24, 24, 24, 0,  0,  0,  0,  0,  0, 0,  0,   // 3-byte = 36
63       0,  24, 24, 0,  0,  0,  0,  0,  0,  0, 0,  0,   // 3-byte low/mid = 48
64       0,  36, 36, 36, 0,  0,  0,  0,  0,  0, 0,  0,   // 4-byte = 60
65       0,  36, 0,  0,  0,  0,  0,  0,  0,  0, 0,  0,   // 4-byte low = 72
66       0,  0,  0,  24, 0,  0,  0,  0,  0,  0, 0,  0,   // 3-byte high = 84
67       0,  0,  36, 36, 0,  0,  0,  0,  0,  0, 0,  0,   // 4-byte mid/high = 96
68   };
69 
70   uint8_t type = transitions[byte];
71   *state = static_cast<State>(states[*state + type]);
72   *buffer = (*buffer << 6) | (byte & (0x7F >> (type >> 1)));
73 }
74 
75 }  // namespace Utf8DfaDecoder
76 
77 #endif /* __UTF8_DFA_DECODER_H */
78