1 //===- DependencyDirectivesSourceMinimizer.cpp -  -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This is the implementation for minimizing header and source files to the
11 /// minimum necessary preprocessor directives for evaluating includes. It
12 /// reduces the source down to #define, #include, #import, @import, and any
13 /// conditional preprocessor logic that contains one of those.
14 ///
15 //===----------------------------------------------------------------------===//
16 
17 #include "clang/Lex/DependencyDirectivesSourceMinimizer.h"
18 #include "clang/Basic/CharInfo.h"
19 #include "clang/Basic/Diagnostic.h"
20 #include "clang/Lex/LexDiagnostic.h"
21 #include "llvm/ADT/StringMap.h"
22 #include "llvm/ADT/StringSwitch.h"
23 #include "llvm/Support/MemoryBuffer.h"
24 
25 using namespace llvm;
26 using namespace clang;
27 using namespace clang::minimize_source_to_dependency_directives;
28 
29 namespace {
30 
31 struct Minimizer {
32   /// Minimized output.
33   SmallVectorImpl<char> &Out;
34   /// The known tokens encountered during the minimization.
35   SmallVectorImpl<Token> &Tokens;
36 
Minimizer__anone77734380111::Minimizer37   Minimizer(SmallVectorImpl<char> &Out, SmallVectorImpl<Token> &Tokens,
38             StringRef Input, DiagnosticsEngine *Diags,
39             SourceLocation InputSourceLoc)
40       : Out(Out), Tokens(Tokens), Input(Input), Diags(Diags),
41         InputSourceLoc(InputSourceLoc) {}
42 
43   /// Lex the provided source and emit the minimized output.
44   ///
45   /// \returns True on error.
46   bool minimize();
47 
48 private:
49   struct IdInfo {
50     const char *Last;
51     StringRef Name;
52   };
53 
54   /// Lex an identifier.
55   ///
56   /// \pre First points at a valid identifier head.
57   LLVM_NODISCARD IdInfo lexIdentifier(const char *First, const char *const End);
58   LLVM_NODISCARD bool isNextIdentifier(StringRef Id, const char *&First,
59                                        const char *const End);
60   LLVM_NODISCARD bool minimizeImpl(const char *First, const char *const End);
61   LLVM_NODISCARD bool lexPPLine(const char *&First, const char *const End);
62   LLVM_NODISCARD bool lexAt(const char *&First, const char *const End);
63   LLVM_NODISCARD bool lexModule(const char *&First, const char *const End);
64   LLVM_NODISCARD bool lexDefine(const char *&First, const char *const End);
65   LLVM_NODISCARD bool lexPragma(const char *&First, const char *const End);
66   LLVM_NODISCARD bool lexEndif(const char *&First, const char *const End);
67   LLVM_NODISCARD bool lexDefault(TokenKind Kind, StringRef Directive,
68                                  const char *&First, const char *const End);
makeToken__anone77734380111::Minimizer69   Token &makeToken(TokenKind K) {
70     Tokens.emplace_back(K, Out.size());
71     return Tokens.back();
72   }
popToken__anone77734380111::Minimizer73   void popToken() {
74     Out.resize(Tokens.back().Offset);
75     Tokens.pop_back();
76   }
top__anone77734380111::Minimizer77   TokenKind top() const { return Tokens.empty() ? pp_none : Tokens.back().K; }
78 
put__anone77734380111::Minimizer79   Minimizer &put(char Byte) {
80     Out.push_back(Byte);
81     return *this;
82   }
append__anone77734380111::Minimizer83   Minimizer &append(StringRef S) { return append(S.begin(), S.end()); }
append__anone77734380111::Minimizer84   Minimizer &append(const char *First, const char *Last) {
85     Out.append(First, Last);
86     return *this;
87   }
88 
89   void printToNewline(const char *&First, const char *const End);
90   void printAdjacentModuleNameParts(const char *&First, const char *const End);
91   LLVM_NODISCARD bool printAtImportBody(const char *&First,
92                                         const char *const End);
93   void printDirectiveBody(const char *&First, const char *const End);
94   void printAdjacentMacroArgs(const char *&First, const char *const End);
95   LLVM_NODISCARD bool printMacroArgs(const char *&First, const char *const End);
96 
97   /// Reports a diagnostic if the diagnostic engine is provided. Always returns
98   /// true at the end.
99   bool reportError(const char *CurPtr, unsigned Err);
100 
101   StringMap<char> SplitIds;
102   StringRef Input;
103   DiagnosticsEngine *Diags;
104   SourceLocation InputSourceLoc;
105 };
106 
107 } // end anonymous namespace
108 
reportError(const char * CurPtr,unsigned Err)109 bool Minimizer::reportError(const char *CurPtr, unsigned Err) {
110   if (!Diags)
111     return true;
112   assert(CurPtr >= Input.data() && "invalid buffer ptr");
113   Diags->Report(InputSourceLoc.getLocWithOffset(CurPtr - Input.data()), Err);
114   return true;
115 }
116 
skipOverSpaces(const char * & First,const char * const End)117 static void skipOverSpaces(const char *&First, const char *const End) {
118   while (First != End && isHorizontalWhitespace(*First))
119     ++First;
120 }
121 
isRawStringLiteral(const char * First,const char * Current)122 LLVM_NODISCARD static bool isRawStringLiteral(const char *First,
123                                               const char *Current) {
124   assert(First <= Current);
125 
126   // Check if we can even back up.
127   if (*Current != '"' || First == Current)
128     return false;
129 
130   // Check for an "R".
131   --Current;
132   if (*Current != 'R')
133     return false;
134   if (First == Current || !isIdentifierBody(*--Current))
135     return true;
136 
137   // Check for a prefix of "u", "U", or "L".
138   if (*Current == 'u' || *Current == 'U' || *Current == 'L')
139     return First == Current || !isIdentifierBody(*--Current);
140 
141   // Check for a prefix of "u8".
142   if (*Current != '8' || First == Current || *Current-- != 'u')
143     return false;
144   return First == Current || !isIdentifierBody(*--Current);
145 }
146 
/// Skip over a raw string literal.
///
/// \pre First points at the opening '"' and the character before it is 'R'
/// (both are asserted).
///
/// On return First is just past the closing '"', or at End when the literal
/// is not terminated inside the buffer.
static void skipRawString(const char *&First, const char *const End) {
  assert(First[0] == '"');
  assert(First[-1] == 'R');

  // Find the '(' that ends the delimiter which follows the opening quote.
  const char *Last = ++First;
  while (Last != End && *Last != '(')
    ++Last;
  if (Last == End) {
    First = Last; // Hit the end... just give up.
    return;
  }

  // The delimiter must be repeated between the closing ')' and '"'.
  StringRef Terminator(First, Last - First);
  for (;;) {
    // Move First to just past the next ")".
    First = Last;
    while (First != End && *First != ')')
      ++First;
    if (First == End)
      return;
    ++First;

    // Look ahead for the terminator sequence.
    Last = First;
    while (Last != End && size_t(Last - First) < Terminator.size() &&
           Terminator[Last - First] == *Last)
      ++Last;

    // Check if we hit it (or the end of the file).
    if (Last == End) {
      First = Last;
      return;
    }
    // Only part of the delimiter matched: resume the search for ')' from the
    // mismatching character.
    if (size_t(Last - First) < Terminator.size())
      continue;
    // The delimiter matched but is not followed by the closing quote.
    if (*Last != '"')
      continue;
    First = Last + 1;
    return;
  }
}
188 
189 // Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)
isEOL(const char * First,const char * const End)190 static unsigned isEOL(const char *First, const char *const End) {
191   if (First == End)
192     return 0;
193   if (End - First > 1 && isVerticalWhitespace(First[0]) &&
194       isVerticalWhitespace(First[1]) && First[0] != First[1])
195     return 2;
196   return !!isVerticalWhitespace(First[0]);
197 }
198 
/// Skip a string literal, character literal, or angled include name.
///
/// \pre First points at the opening '"', '\'' or '<' (asserted). For '<' the
/// terminator is '>'; otherwise it is the opening character itself.
///
/// On return First is just past the terminator, or at the vertical whitespace
/// that ended the line early, or at End.
static void skipString(const char *&First, const char *const End) {
  assert(*First == '\'' || *First == '"' || *First == '<');
  const char Terminator = *First == '<' ? '>' : *First;
  for (++First; First != End && *First != Terminator; ++First) {
    // String and character literals don't extend past the end of the line.
    if (isVerticalWhitespace(*First))
      return;
    if (*First != '\\')
      continue;
    // Skip past backslash to the next character. This ensures that the
    // character right after it is skipped as well, which matters if it's
    // the terminator.
    if (++First == End)
      return;
    if (!isWhitespace(*First))
      continue;
    // Whitespace after the backslash might indicate a line continuation.
    const char *FirstAfterBackslashPastSpace = First;
    skipOverSpaces(FirstAfterBackslashPastSpace, End);
    if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {
      // Advance the character pointer to the next line for the next
      // iteration. The "- 1" accounts for the loop's own ++First.
      First = FirstAfterBackslashPastSpace + NLSize - 1;
    }
  }
  if (First != End)
    ++First; // Finish off the string.
}
227 
228 // Returns the length of the skipped newline
skipNewline(const char * & First,const char * End)229 static unsigned skipNewline(const char *&First, const char *End) {
230   if (First == End)
231     return 0;
232   assert(isVerticalWhitespace(*First));
233   unsigned Len = isEOL(First, End);
234   assert(Len && "expected newline");
235   First += Len;
236   return Len;
237 }
238 
/// Report whether the newline that was just skipped had a backslash directly
/// in front of it.
///
/// \param First Position just past the newline.
/// \param EOLLen Length of that newline sequence.
static bool wasLineContinuation(const char *First, unsigned EOLLen) {
  const char *BeforeNewline = First - static_cast<int>(EOLLen) - 1;
  return *BeforeNewline == '\\';
}
242 
skipToNewlineRaw(const char * & First,const char * const End)243 static void skipToNewlineRaw(const char *&First, const char *const End) {
244   for (;;) {
245     if (First == End)
246       return;
247 
248     unsigned Len = isEOL(First, End);
249     if (Len)
250       return;
251 
252     do {
253       if (++First == End)
254         return;
255       Len = isEOL(First, End);
256     } while (!Len);
257 
258     if (First[-1] != '\\')
259       return;
260 
261     First += Len;
262     // Keep skipping lines...
263   }
264 }
265 
findLastNonSpace(const char * First,const char * Last)266 static const char *findLastNonSpace(const char *First, const char *Last) {
267   assert(First <= Last);
268   while (First != Last && isHorizontalWhitespace(Last[-1]))
269     --Last;
270   return Last;
271 }
272 
findFirstTrailingSpace(const char * First,const char * Last)273 static const char *findFirstTrailingSpace(const char *First,
274                                           const char *Last) {
275   const char *LastNonSpace = findLastNonSpace(First, Last);
276   if (Last == LastNonSpace)
277     return Last;
278   assert(isHorizontalWhitespace(LastNonSpace[0]));
279   return LastNonSpace + 1;
280 }
281 
skipLineComment(const char * & First,const char * const End)282 static void skipLineComment(const char *&First, const char *const End) {
283   assert(First[0] == '/' && First[1] == '/');
284   First += 2;
285   skipToNewlineRaw(First, End);
286 }
287 
/// Skip a "/* ... */" comment, leaving \p First just past the closing "*/",
/// or at End when the comment is unterminated.
///
/// \pre First points at the opening "/*" (asserted).
static void skipBlockComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '*');

  // The shortest complete comment is "/**/"; anything shorter cannot close.
  if (End - First < 4) {
    First = End;
    return;
  }

  // Start at the fourth character so the '*' of "/*" can double as the '*'
  // of "*/" only in the "/**/" case.
  First += 3;
  while (First != End) {
    const bool ClosesComment = First[-1] == '*' && First[0] == '/';
    ++First;
    if (ClosesComment)
      return;
  }
}
300 
301 /// \returns True if the current single quotation mark character is a C++ 14
302 /// digit separator.
isQuoteCppDigitSeparator(const char * const Start,const char * const Cur,const char * const End)303 static bool isQuoteCppDigitSeparator(const char *const Start,
304                                      const char *const Cur,
305                                      const char *const End) {
306   assert(*Cur == '\'' && "expected quotation character");
307   // skipLine called in places where we don't expect a valid number
308   // body before `start` on the same line, so always return false at the start.
309   if (Start == Cur)
310     return false;
311   // The previous character must be a valid PP number character.
312   // Make sure that the L, u, U, u8 prefixes don't get marked as a
313   // separator though.
314   char Prev = *(Cur - 1);
315   if (Prev == 'L' || Prev == 'U' || Prev == 'u')
316     return false;
317   if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u')
318     return false;
319   if (!isPreprocessingNumberBody(Prev))
320     return false;
321   // The next character should be a valid identifier body character.
322   return (Cur + 1) < End && isIdentifierBody(*(Cur + 1));
323 }
324 
/// Skip the rest of the current logical line, including its terminating
/// newline.
///
/// String literals, raw string literals and comments are skipped through
/// their dedicated helpers rather than character by character. A newline
/// directly preceded by a backslash continues the logical line. If First
/// already points at a newline, only that newline is consumed.
static void skipLine(const char *&First, const char *const End) {
  for (;;) {
    assert(First <= End);
    if (First == End)
      return;

    // An empty (remaining) line: just consume the newline.
    if (isVerticalWhitespace(*First)) {
      skipNewline(First, End);
      return;
    }
    // Remember where this physical line started; the quote classification
    // helpers must not look back past it.
    const char *Start = First;
    while (First != End && !isVerticalWhitespace(*First)) {
      // Iterate over strings correctly to avoid comments and newlines.
      if (*First == '"' ||
          (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {
        if (isRawStringLiteral(Start, First))
          skipRawString(First, End);
        else
          skipString(First, End);
        continue;
      }

      // Iterate over comments correctly.
      if (*First != '/' || End - First < 2) {
        ++First;
        continue;
      }

      if (First[1] == '/') {
        // "//...".
        skipLineComment(First, End);
        continue;
      }

      // A lone '/' that does not start a comment.
      if (First[1] != '*') {
        ++First;
        continue;
      }

      // "/*...*/".
      skipBlockComment(First, End);
    }
    if (First == End)
      return;

    // Skip over the newline.
    unsigned Len = skipNewline(First, End);
    if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
      break;
  }
}
376 
skipDirective(StringRef Name,const char * & First,const char * const End)377 static void skipDirective(StringRef Name, const char *&First,
378                           const char *const End) {
379   if (llvm::StringSwitch<bool>(Name)
380           .Case("warning", true)
381           .Case("error", true)
382           .Default(false))
383     // Do not process quotes or comments.
384     skipToNewlineRaw(First, End);
385   else
386     skipLine(First, End);
387 }
388 
/// Append the rest of the current logical line to the output, dropping
/// comments and folding line continuations.
///
/// Block comments are replaced by a single space. A "//" comment ends the
/// output for the line; in that case the function returns with First at the
/// comment's terminating newline (or End). Otherwise the terminating newline
/// is consumed. No newline character is written to the output.
void Minimizer::printToNewline(const char *&First, const char *const End) {
  // Each iteration of the outer loop handles one physical line.
  while (First != End && !isVerticalWhitespace(*First)) {
    // [First, Last) is the pending text that has been scanned but not yet
    // appended to the output.
    const char *Last = First;
    do {
      // Iterate over strings correctly to avoid comments and newlines.
      // '<' only opens a string while the current token is an #include.
      if (*Last == '"' || *Last == '\'' ||
          (*Last == '<' && top() == pp_include)) {
        if (LLVM_UNLIKELY(isRawStringLiteral(First, Last)))
          skipRawString(Last, End);
        else
          skipString(Last, End);
        continue;
      }
      if (*Last != '/' || End - Last < 2) {
        ++Last;
        continue; // Gather the rest up to print verbatim.
      }

      // A '/' that does not start a comment is ordinary text.
      if (Last[1] != '/' && Last[1] != '*') {
        ++Last;
        continue;
      }

      // Deal with "//..." and "/*...*/".
      // Flush the text in front of the comment, keeping at most one of its
      // trailing spaces.
      append(First, findFirstTrailingSpace(First, Last));
      First = Last;

      if (Last[1] == '/') {
        skipLineComment(First, End);
        return;
      }

      // A block comment is replaced by one space.
      put(' ');
      skipBlockComment(First, End);
      skipOverSpaces(First, End);
      Last = First;
    } while (Last != End && !isVerticalWhitespace(*Last));

    // Print out the string.
    const char *LastBeforeTrailingSpace = findLastNonSpace(First, Last);
    // No trailing backslash (or nothing left): this is the final physical
    // line of the directive.
    if (Last == End || LastBeforeTrailingSpace == First ||
        LastBeforeTrailingSpace[-1] != '\\') {
      append(First, LastBeforeTrailingSpace);
      First = Last;
      skipNewline(First, End);
      return;
    }

    // Print up to the backslash, backing up over spaces. Preserve at least one
    // space, as the space matters when tokens are separated by a line
    // continuation.
    append(First, findFirstTrailingSpace(
                      First, LastBeforeTrailingSpace - 1));

    // Move on to the continuation line, ignoring its leading spaces.
    First = Last;
    skipNewline(First, End);
    skipOverSpaces(First, End);
  }
}
448 
skipWhitespace(const char * & First,const char * const End)449 static void skipWhitespace(const char *&First, const char *const End) {
450   for (;;) {
451     assert(First <= End);
452     skipOverSpaces(First, End);
453 
454     if (End - First < 2)
455       return;
456 
457     if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
458       skipNewline(++First, End);
459       continue;
460     }
461 
462     // Check for a non-comment character.
463     if (First[0] != '/')
464       return;
465 
466     // "// ...".
467     if (First[1] == '/') {
468       skipLineComment(First, End);
469       return;
470     }
471 
472     // Cannot be a comment.
473     if (First[1] != '*')
474       return;
475 
476     // "/*...*/".
477     skipBlockComment(First, End);
478   }
479 }
480 
printAdjacentModuleNameParts(const char * & First,const char * const End)481 void Minimizer::printAdjacentModuleNameParts(const char *&First,
482                                              const char *const End) {
483   // Skip over parts of the body.
484   const char *Last = First;
485   do
486     ++Last;
487   while (Last != End && (isIdentifierBody(*Last) || *Last == '.'));
488   append(First, Last);
489   First = Last;
490 }
491 
printAtImportBody(const char * & First,const char * const End)492 bool Minimizer::printAtImportBody(const char *&First, const char *const End) {
493   for (;;) {
494     skipWhitespace(First, End);
495     if (First == End)
496       return true;
497 
498     if (isVerticalWhitespace(*First)) {
499       skipNewline(First, End);
500       continue;
501     }
502 
503     // Found a semicolon.
504     if (*First == ';') {
505       put(*First++).put('\n');
506       return false;
507     }
508 
509     // Don't handle macro expansions inside @import for now.
510     if (!isIdentifierBody(*First) && *First != '.')
511       return true;
512 
513     printAdjacentModuleNameParts(First, End);
514   }
515 }
516 
printDirectiveBody(const char * & First,const char * const End)517 void Minimizer::printDirectiveBody(const char *&First, const char *const End) {
518   skipWhitespace(First, End); // Skip initial whitespace.
519   printToNewline(First, End);
520   while (Out.back() == ' ')
521     Out.pop_back();
522   put('\n');
523 }
524 
lexRawIdentifier(const char * First,const char * const End)525 LLVM_NODISCARD static const char *lexRawIdentifier(const char *First,
526                                                    const char *const End) {
527   assert(isIdentifierBody(*First) && "invalid identifer");
528   const char *Last = First + 1;
529   while (Last != End && isIdentifierBody(*Last))
530     ++Last;
531   return Last;
532 }
533 
534 LLVM_NODISCARD static const char *
getIdentifierContinuation(const char * First,const char * const End)535 getIdentifierContinuation(const char *First, const char *const End) {
536   if (End - First < 3 || First[0] != '\\' || !isVerticalWhitespace(First[1]))
537     return nullptr;
538 
539   ++First;
540   skipNewline(First, End);
541   if (First == End)
542     return nullptr;
543   return isIdentifierBody(First[0]) ? First : nullptr;
544 }
545 
lexIdentifier(const char * First,const char * const End)546 Minimizer::IdInfo Minimizer::lexIdentifier(const char *First,
547                                            const char *const End) {
548   const char *Last = lexRawIdentifier(First, End);
549   const char *Next = getIdentifierContinuation(Last, End);
550   if (LLVM_LIKELY(!Next))
551     return IdInfo{Last, StringRef(First, Last - First)};
552 
553   // Slow path, where identifiers are split over lines.
554   SmallVector<char, 64> Id(First, Last);
555   while (Next) {
556     Last = lexRawIdentifier(Next, End);
557     Id.append(Next, Last);
558     Next = getIdentifierContinuation(Last, End);
559   }
560   return IdInfo{
561       Last,
562       SplitIds.try_emplace(StringRef(Id.begin(), Id.size()), 0).first->first()};
563 }
564 
printAdjacentMacroArgs(const char * & First,const char * const End)565 void Minimizer::printAdjacentMacroArgs(const char *&First,
566                                        const char *const End) {
567   // Skip over parts of the body.
568   const char *Last = First;
569   do
570     ++Last;
571   while (Last != End &&
572          (isIdentifierBody(*Last) || *Last == '.' || *Last == ','));
573   append(First, Last);
574   First = Last;
575 }
576 
printMacroArgs(const char * & First,const char * const End)577 bool Minimizer::printMacroArgs(const char *&First, const char *const End) {
578   assert(*First == '(');
579   put(*First++);
580   for (;;) {
581     skipWhitespace(First, End);
582     if (First == End)
583       return true;
584 
585     if (*First == ')') {
586       put(*First++);
587       return false;
588     }
589 
590     // This is intentionally fairly liberal.
591     if (!(isIdentifierBody(*First) || *First == '.' || *First == ','))
592       return true;
593 
594     printAdjacentMacroArgs(First, End);
595   }
596 }
597 
598 /// Looks for an identifier starting from Last.
599 ///
600 /// Updates "First" to just past the next identifier, if any.  Returns true iff
601 /// the identifier matches "Id".
isNextIdentifier(StringRef Id,const char * & First,const char * const End)602 bool Minimizer::isNextIdentifier(StringRef Id, const char *&First,
603                                  const char *const End) {
604   skipWhitespace(First, End);
605   if (First == End || !isIdentifierHead(*First))
606     return false;
607 
608   IdInfo FoundId = lexIdentifier(First, End);
609   First = FoundId.Last;
610   return FoundId.Name == Id;
611 }
612 
lexAt(const char * & First,const char * const End)613 bool Minimizer::lexAt(const char *&First, const char *const End) {
614   // Handle "@import".
615   const char *ImportLoc = First++;
616   if (!isNextIdentifier("import", First, End)) {
617     skipLine(First, End);
618     return false;
619   }
620   makeToken(decl_at_import);
621   append("@import ");
622   if (printAtImportBody(First, End))
623     return reportError(
624         ImportLoc, diag::err_dep_source_minimizer_missing_sema_after_at_import);
625   skipWhitespace(First, End);
626   if (First == End)
627     return false;
628   if (!isVerticalWhitespace(*First))
629     return reportError(
630         ImportLoc, diag::err_dep_source_minimizer_unexpected_tokens_at_import);
631   skipNewline(First, End);
632   return false;
633 }
634 
lexModule(const char * & First,const char * const End)635 bool Minimizer::lexModule(const char *&First, const char *const End) {
636   IdInfo Id = lexIdentifier(First, End);
637   First = Id.Last;
638   bool Export = false;
639   if (Id.Name == "export") {
640     Export = true;
641     skipWhitespace(First, End);
642     if (!isIdentifierBody(*First)) {
643       skipLine(First, End);
644       return false;
645     }
646     Id = lexIdentifier(First, End);
647     First = Id.Last;
648   }
649 
650   if (Id.Name != "module" && Id.Name != "import") {
651     skipLine(First, End);
652     return false;
653   }
654 
655   skipWhitespace(First, End);
656 
657   // Ignore this as a module directive if the next character can't be part of
658   // an import.
659 
660   switch (*First) {
661   case ':':
662   case '<':
663   case '"':
664     break;
665   default:
666     if (!isIdentifierBody(*First)) {
667       skipLine(First, End);
668       return false;
669     }
670   }
671 
672   if (Export) {
673     makeToken(cxx_export_decl);
674     append("export ");
675   }
676 
677   if (Id.Name == "module")
678     makeToken(cxx_module_decl);
679   else
680     makeToken(cxx_import_decl);
681   append(Id.Name);
682   append(" ");
683   printToNewline(First, End);
684   append("\n");
685   return false;
686 }
687 
lexDefine(const char * & First,const char * const End)688 bool Minimizer::lexDefine(const char *&First, const char *const End) {
689   makeToken(pp_define);
690   append("#define ");
691   skipWhitespace(First, End);
692 
693   if (!isIdentifierHead(*First))
694     return reportError(First, diag::err_pp_macro_not_identifier);
695 
696   IdInfo Id = lexIdentifier(First, End);
697   const char *Last = Id.Last;
698   append(Id.Name);
699   if (Last == End)
700     return false;
701   if (*Last == '(') {
702     size_t Size = Out.size();
703     if (printMacroArgs(Last, End)) {
704       // Be robust to bad macro arguments, since they can show up in disabled
705       // code.
706       Out.resize(Size);
707       append("(/* invalid */\n");
708       skipLine(Last, End);
709       return false;
710     }
711   }
712   skipWhitespace(Last, End);
713   if (Last == End)
714     return false;
715   if (!isVerticalWhitespace(*Last))
716     put(' ');
717   printDirectiveBody(Last, End);
718   First = Last;
719   return false;
720 }
721 
lexPragma(const char * & First,const char * const End)722 bool Minimizer::lexPragma(const char *&First, const char *const End) {
723   // #pragma.
724   skipWhitespace(First, End);
725   if (First == End || !isIdentifierHead(*First))
726     return false;
727 
728   IdInfo FoundId = lexIdentifier(First, End);
729   First = FoundId.Last;
730   if (FoundId.Name == "once") {
731     // #pragma once
732     skipLine(First, End);
733     makeToken(pp_pragma_once);
734     append("#pragma once\n");
735     return false;
736   }
737 
738   if (FoundId.Name != "clang") {
739     skipLine(First, End);
740     return false;
741   }
742 
743   // #pragma clang.
744   if (!isNextIdentifier("module", First, End)) {
745     skipLine(First, End);
746     return false;
747   }
748 
749   // #pragma clang module.
750   if (!isNextIdentifier("import", First, End)) {
751     skipLine(First, End);
752     return false;
753   }
754 
755   // #pragma clang module import.
756   makeToken(pp_pragma_import);
757   append("#pragma clang module import ");
758   printDirectiveBody(First, End);
759   return false;
760 }
761 
lexEndif(const char * & First,const char * const End)762 bool Minimizer::lexEndif(const char *&First, const char *const End) {
763   // Strip out "#else" if it's empty.
764   if (top() == pp_else)
765     popToken();
766 
767   // If "#ifdef" is empty, strip it and skip the "#endif".
768   //
769   // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
770   // we can skip empty `#if` and `#elif` blocks as well after scanning for a
771   // literal __has_include in the condition.  Even without that rule we could
772   // drop the tokens if we scan for identifiers in the condition and find none.
773   if (top() == pp_ifdef || top() == pp_ifndef) {
774     popToken();
775     skipLine(First, End);
776     return false;
777   }
778 
779   return lexDefault(pp_endif, "endif", First, End);
780 }
781 
lexDefault(TokenKind Kind,StringRef Directive,const char * & First,const char * const End)782 bool Minimizer::lexDefault(TokenKind Kind, StringRef Directive,
783                            const char *&First, const char *const End) {
784   makeToken(Kind);
785   put('#').append(Directive).put(' ');
786   printDirectiveBody(First, End);
787   return false;
788 }
789 
/// Quick filter on the first significant character of a line: '#' and '@'
/// start directives, while 'i', 'e' and 'm' may start "import", "export" or
/// "module".
static bool isStartOfRelevantLine(char First) {
  return First == '#' || First == '@' || First == 'i' || First == 'e' ||
         First == 'm';
}
801 
lexPPLine(const char * & First,const char * const End)802 bool Minimizer::lexPPLine(const char *&First, const char *const End) {
803   assert(First != End);
804 
805   skipWhitespace(First, End);
806   assert(First <= End);
807   if (First == End)
808     return false;
809 
810   if (!isStartOfRelevantLine(*First)) {
811     skipLine(First, End);
812     assert(First <= End);
813     return false;
814   }
815 
816   // Handle "@import".
817   if (*First == '@')
818     return lexAt(First, End);
819 
820   if (*First == 'i' || *First == 'e' || *First == 'm')
821     return lexModule(First, End);
822 
823   // Handle preprocessing directives.
824   ++First; // Skip over '#'.
825   skipWhitespace(First, End);
826 
827   if (First == End)
828     return reportError(First, diag::err_pp_expected_eol);
829 
830   if (!isIdentifierHead(*First)) {
831     skipLine(First, End);
832     return false;
833   }
834 
835   // Figure out the token.
836   IdInfo Id = lexIdentifier(First, End);
837   First = Id.Last;
838   auto Kind = llvm::StringSwitch<TokenKind>(Id.Name)
839                   .Case("include", pp_include)
840                   .Case("__include_macros", pp___include_macros)
841                   .Case("define", pp_define)
842                   .Case("undef", pp_undef)
843                   .Case("import", pp_import)
844                   .Case("include_next", pp_include_next)
845                   .Case("if", pp_if)
846                   .Case("ifdef", pp_ifdef)
847                   .Case("ifndef", pp_ifndef)
848                   .Case("elif", pp_elif)
849                   .Case("else", pp_else)
850                   .Case("endif", pp_endif)
851                   .Case("pragma", pp_pragma_import)
852                   .Default(pp_none);
853   if (Kind == pp_none) {
854     skipDirective(Id.Name, First, End);
855     return false;
856   }
857 
858   if (Kind == pp_endif)
859     return lexEndif(First, End);
860 
861   if (Kind == pp_define)
862     return lexDefine(First, End);
863 
864   if (Kind == pp_pragma_import)
865     return lexPragma(First, End);
866 
867   // Everything else.
868   return lexDefault(Kind, Id.Name, First, End);
869 }
870 
/// Advance \p First past a UTF-8 byte order mark (EF BB BF) if the buffer
/// starts with one; otherwise leave it untouched.
static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
  if (End - First < 3)
    return;
  const bool StartsWithBOM =
      First[0] == '\xef' && First[1] == '\xbb' && First[2] == '\xbf';
  if (StartsWithBOM)
    First += 3;
}
876 
minimizeImpl(const char * First,const char * const End)877 bool Minimizer::minimizeImpl(const char *First, const char *const End) {
878   skipUTF8ByteOrderMark(First, End);
879   while (First != End)
880     if (lexPPLine(First, End))
881       return true;
882   return false;
883 }
884 
minimize()885 bool Minimizer::minimize() {
886   bool Error = minimizeImpl(Input.begin(), Input.end());
887 
888   if (!Error) {
889     // Add a trailing newline and an EOF on success.
890     if (!Out.empty() && Out.back() != '\n')
891       Out.push_back('\n');
892     makeToken(pp_eof);
893   }
894 
895   // Null-terminate the output. This way the memory buffer that's passed to
896   // Clang will not have to worry about the terminating '\0'.
897   Out.push_back(0);
898   Out.pop_back();
899   return Error;
900 }
901 
computeSkippedRanges(ArrayRef<Token> Input,llvm::SmallVectorImpl<SkippedRange> & Range)902 bool clang::minimize_source_to_dependency_directives::computeSkippedRanges(
903     ArrayRef<Token> Input, llvm::SmallVectorImpl<SkippedRange> &Range) {
904   struct Directive {
905     enum DirectiveKind {
906       If,  // if/ifdef/ifndef
907       Else // elif,else
908     };
909     int Offset;
910     DirectiveKind Kind;
911   };
912   llvm::SmallVector<Directive, 32> Offsets;
913   for (const Token &T : Input) {
914     switch (T.K) {
915     case pp_if:
916     case pp_ifdef:
917     case pp_ifndef:
918       Offsets.push_back({T.Offset, Directive::If});
919       break;
920 
921     case pp_elif:
922     case pp_else: {
923       if (Offsets.empty())
924         return true;
925       int PreviousOffset = Offsets.back().Offset;
926       Range.push_back({PreviousOffset, T.Offset - PreviousOffset});
927       Offsets.push_back({T.Offset, Directive::Else});
928       break;
929     }
930 
931     case pp_endif: {
932       if (Offsets.empty())
933         return true;
934       int PreviousOffset = Offsets.back().Offset;
935       Range.push_back({PreviousOffset, T.Offset - PreviousOffset});
936       do {
937         Directive::DirectiveKind Kind = Offsets.pop_back_val().Kind;
938         if (Kind == Directive::If)
939           break;
940       } while (!Offsets.empty());
941       break;
942     }
943     default:
944       break;
945     }
946   }
947   return false;
948 }
949 
minimizeSourceToDependencyDirectives(StringRef Input,SmallVectorImpl<char> & Output,SmallVectorImpl<Token> & Tokens,DiagnosticsEngine * Diags,SourceLocation InputSourceLoc)950 bool clang::minimizeSourceToDependencyDirectives(
951     StringRef Input, SmallVectorImpl<char> &Output,
952     SmallVectorImpl<Token> &Tokens, DiagnosticsEngine *Diags,
953     SourceLocation InputSourceLoc) {
954   Output.clear();
955   Tokens.clear();
956   return Minimizer(Output, Tokens, Input, Diags, InputSourceLoc).minimize();
957 }
958