1 //===-- include/flang/Parser/characters.h -----------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef FORTRAN_PARSER_CHARACTERS_H_
10 #define FORTRAN_PARSER_CHARACTERS_H_
11 
12 // Define some character classification predicates and
13 // conversions here to avoid dependences upon <cctype> and
// also to accommodate Fortran tokenization.
15 
16 #include <cstddef>
17 #include <optional>
18 #include <string>
19 
20 namespace Fortran::parser {
21 
22 extern bool useHexadecimalEscapeSequences;
23 
24 // We can easily support Fortran program source in any character
// set whose first 128 code points correspond to ASCII codes 0-127 (ISO/IEC 646).
26 // The specific encodings that we can handle include:
27 //   LATIN_1: ISO 8859-1 Latin-1
28 //   UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
// The source character encodings that can be selected.
enum class Encoding {
  LATIN_1, // ISO 8859-1 Latin-1
  UTF_8, // multi-byte encoding of Unicode (ISO/IEC 10646)
};
30 
// True only when ch lies in the range 'A' through 'Z'.
inline constexpr bool IsUpperCaseLetter(char ch) {
  return 'A' <= ch && ch <= 'Z';
}
34 
// True only when ch lies in the range 'a' through 'z'.
inline constexpr bool IsLowerCaseLetter(char ch) {
  return 'a' <= ch && ch <= 'z';
}
38 
IsLetter(char ch)39 inline constexpr bool IsLetter(char ch) {
40   return IsUpperCaseLetter(ch) || IsLowerCaseLetter(ch);
41 }
42 
// True only when ch lies in the range '0' through '9'.
inline constexpr bool IsDecimalDigit(char ch) {
  return '0' <= ch && ch <= '9';
}
44 
// True when ch is a decimal digit or one of the letters 'A'-'F' / 'a'-'f'.
inline constexpr bool IsHexadecimalDigit(char ch) {
  if ('0' <= ch && ch <= '9') {
    return true;
  }
  if ('A' <= ch && ch <= 'F') {
    return true;
  }
  return 'a' <= ch && ch <= 'f';
}
49 
// True only when ch lies in the range '0' through '7'.
inline constexpr bool IsOctalDigit(char ch) {
  return '0' <= ch && ch <= '7';
}
51 
IsLegalIdentifierStart(char ch)52 inline constexpr bool IsLegalIdentifierStart(char ch) {
53   return IsLetter(ch) || ch == '_' || ch == '@' || ch == '$';
54 }
55 
IsLegalInIdentifier(char ch)56 inline constexpr bool IsLegalInIdentifier(char ch) {
57   return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
58 }
59 
ToLowerCaseLetter(char ch)60 inline constexpr char ToLowerCaseLetter(char ch) {
61   return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
62 }
63 
ToLowerCaseLetter(char && ch)64 inline constexpr char ToLowerCaseLetter(char &&ch) {
65   return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
66 }
67 
ToLowerCaseLetters(const std::string & str)68 inline std::string ToLowerCaseLetters(const std::string &str) {
69   std::string lowered{str};
70   for (char &ch : lowered) {
71     ch = ToLowerCaseLetter(ch);
72   }
73   return lowered;
74 }
75 
ToUpperCaseLetter(char ch)76 inline constexpr char ToUpperCaseLetter(char ch) {
77   return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
78 }
79 
ToUpperCaseLetter(char && ch)80 inline constexpr char ToUpperCaseLetter(char &&ch) {
81   return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
82 }
83 
ToUpperCaseLetters(const std::string & str)84 inline std::string ToUpperCaseLetters(const std::string &str) {
85   std::string raised{str};
86   for (char &ch : raised) {
87     ch = ToUpperCaseLetter(ch);
88   }
89   return raised;
90 }
91 
IsSameApartFromCase(char x,char y)92 inline constexpr bool IsSameApartFromCase(char x, char y) {
93   return ToLowerCaseLetter(x) == ToLowerCaseLetter(y);
94 }
95 
// Returns the offset of ch from '0'.  No check is made that ch is actually
// a decimal digit.
inline constexpr char DecimalDigitValue(char ch) {
  return static_cast<char>(ch - '0');
}
97 
HexadecimalDigitValue(char ch)98 inline constexpr char HexadecimalDigitValue(char ch) {
99   return IsUpperCaseLetter(ch) ? ch - 'A' + 10
100       : IsLowerCaseLetter(ch)  ? ch - 'a' + 10
101                                : DecimalDigitValue(ch);
102 }
103 
// Given the character that follows a backslash, returns the character that
// the two-character escape sequence denotes, or std::nullopt when ch is not
// one of the recognized escape letters.
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
  if (ch == 'b') {
    return '\b';
  }
  if (ch == 'f') {
    return '\f';
  }
  if (ch == 'n') {
    return '\n';
  }
  if (ch == 'r') {
    return '\r';
  }
  if (ch == 't') {
    return '\t';
  }
  if (ch == 'v') {
    return '\v';
  }
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch; // quotes and backslash stand for themselves
  }
  // Everything else is rejected; that deliberately includes 'a', since
  // PGF90 doesn't know \a.
  return std::nullopt;
}
128 
// The inverse of an escape lookup: given a character value, returns the
// letter that follows the backslash in its two-character escape sequence,
// or std::nullopt when the character has no such sequence.
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
  if (ch == '\b') {
    return 'b';
  }
  if (ch == '\f') {
    return 'f';
  }
  if (ch == '\n') {
    return 'n';
  }
  if (ch == '\r') {
    return 'r';
  }
  if (ch == '\t') {
    return 't';
  }
  if (ch == '\v') {
    return 'v';
  }
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch; // quotes and backslash stand for themselves
  }
  // Everything else is rejected; that deliberately includes '\a', since
  // PGF90 doesn't know \a.
  return std::nullopt;
}
153 
154 // Does not include spaces or line ending characters.
IsValidFortranTokenCharacter(char ch)155 inline constexpr bool IsValidFortranTokenCharacter(char ch) {
156   switch (ch) {
157   case '"':
158   case '%':
159   case '\'':
160   case '(':
161   case ')':
162   case '*':
163   case '+':
164   case ',':
165   case '-':
166   case '.':
167   case '/':
168   case ':':
169   case ';':
170   case '<':
171   case '=':
172   case '>':
173   case '[':
174   case ']':
175     return true;
176   default:
177     return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
178   }
179 }
180 
// The bytes produced by encoding one character.  Consumers such as
// EmitQuotedChar read buffer[0] through buffer[bytes - 1].
struct EncodedCharacter {
  static constexpr int maxEncodingBytes{6}; // capacity of buffer
  char buffer[maxEncodingBytes]; // not initialized by default
  int bytes{0}; // how many leading entries of buffer are in use
};
186 
// Encodes the single code point ucs, with the encoding chosen at compile
// time.  Only the LATIN_1 and UTF_8 specializations are declared; their
// definitions are not in this header.
template <Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);

// Same operation with the encoding chosen at run time.
EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
192 
// Produces a std::string from a STRING using the given encoding.
// NOTE(review): presumably encodes each character of the argument in turn;
// the definition is not visible here -- confirm.
// Only the two combinations below are declared as explicit instantiations;
// "extern template" keeps includers from instantiating them implicitly.
template <Encoding ENCODING, typename STRING>
std::string EncodeString(const STRING &);
extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
    const std::string &);
extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
    const std::u32string &);
199 
200 // EmitQuotedChar drives callbacks "emit" and "insert" to output the
201 // bytes of an encoding for a codepoint.
202 template <typename NORMAL, typename INSERTED>
203 void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
204     bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
205   auto emitOneByte{[&](std::uint8_t ch) {
206     if (backslashEscapes && (ch < ' ' || ch >= 0x7f || ch == '\\')) {
207       if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
208         insert('\\');
209         emit(*escape);
210       } else if (useHexadecimalEscapeSequences) {
211         insert('\\');
212         insert('x');
213         int top{ch >> 4}, bottom{ch & 0xf};
214         insert(top > 9 ? 'a' + top - 10 : '0' + top);
215         insert(bottom > 9 ? 'a' + bottom - 10 : '0' + bottom);
216       } else {
217         // octal escape sequence; always emit 3 digits to avoid ambiguity
218         insert('\\');
219         insert('0' + (ch >> 6));
220         insert('0' + ((ch >> 3) & 7));
221         insert('0' + (ch & 7));
222       }
223     } else if (ch == '\n') { // always escape newlines
224       insert('\\');
225       insert('n');
226     } else {
227       emit(ch);
228     }
229   }};
230   if (ch <= 0x7f) {
231     emitOneByte(ch);
232   } else {
233     EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
234     for (int j{0}; j < encoded.bytes; ++j) {
235       emitOneByte(encoded.buffer[j]);
236     }
237   }
238 }
239 
// Builds a std::string from a character string of 8-, 16-, or 32-bit
// elements.  The encoding defaults to LATIN_1 for std::string and to UTF_8
// for the wider string types; backslash escapes are on by default.
// NOTE(review): presumably returns the text as a quoted literal; the
// definitions are not visible here -- confirm.
std::string QuoteCharacterLiteral(const std::string &,
    bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
std::string QuoteCharacterLiteral(const std::u16string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
std::string QuoteCharacterLiteral(const std::u32string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
246 
// NOTE(review): presumably returns the length in bytes of the UTF-8
// sequence that starts at the given pointer; the definition is not visible
// here -- confirm, including what is returned for invalid input.
int UTF_8CharacterBytes(const char *);
248 
// The result of decoding one character.  A default-constructed value
// (bytes == 0) represents failure.
struct DecodedCharacter {
  char32_t codepoint{0}; // the decoded code point
  int bytes{0}; // signifying failure
};
253 
// Decodes one character from a byte sequence given as a pointer and a
// std::size_t, with the encoding chosen at compile time.
// NOTE(review): the std::size_t is presumably the number of bytes
// available; the definitions are not visible here -- confirm.
// Only the LATIN_1 and UTF_8 specializations are declared.
template <Encoding ENCODING>
DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
template <>
DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
    const char *, std::size_t);

template <>
DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
262 
// DecodeCharacter optionally handles backslash escape sequences, too.
// The template form selects the encoding at compile time; only its LATIN_1
// and UTF_8 instantiations are declared, as extern templates.
template <Encoding ENCODING>
DecodedCharacter DecodeCharacter(
    const char *, std::size_t, bool backslashEscapes);
extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
    const char *, std::size_t, bool);
extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
    const char *, std::size_t, bool);

// Same operation with the encoding chosen at run time.
DecodedCharacter DecodeCharacter(
    Encoding, const char *, std::size_t, bool backslashEscapes);
274 
// Converts an encoded std::string into a string of type RESULT.
// NOTE(review): presumably decodes the whole argument one character at a
// time, honoring backslash escapes when asked; the definition is not
// visible here -- confirm.
// Only the three combinations below are declared as explicit
// instantiations.
template <typename RESULT, Encoding ENCODING>
RESULT DecodeString(const std::string &, bool backslashEscapes);
extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
    const std::string &, bool);
extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
    const std::string &, bool);
extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
    const std::string &, bool);
283 } // namespace Fortran::parser
284 #endif // FORTRAN_PARSER_CHARACTERS_H_
285