1 #include "clang/AST/CommentLexer.h"
2 #include "clang/AST/CommentCommandTraits.h"
3 #include "clang/AST/CommentDiagnostic.h"
4 #include "clang/Basic/CharInfo.h"
5 #include "llvm/ADT/StringExtras.h"
6 #include "llvm/ADT/StringSwitch.h"
7 #include "llvm/Support/ConvertUTF.h"
8 #include "llvm/Support/ErrorHandling.h"
9 
10 namespace clang {
11 namespace comments {
12 
dump(const Lexer & L,const SourceManager & SM) const13 void Token::dump(const Lexer &L, const SourceManager &SM) const {
14   llvm::errs() << "comments::Token Kind=" << Kind << " ";
15   Loc.dump(SM);
16   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
17 }
18 
isHTMLNamedCharacterReferenceCharacter(char C)19 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
20   return isLetter(C);
21 }
22 
isHTMLDecimalCharacterReferenceCharacter(char C)23 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
24   return isDigit(C);
25 }
26 
isHTMLHexCharacterReferenceCharacter(char C)27 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
28   return isHexDigit(C);
29 }
30 
convertCodePointToUTF8(llvm::BumpPtrAllocator & Allocator,unsigned CodePoint)31 static inline StringRef convertCodePointToUTF8(
32                                       llvm::BumpPtrAllocator &Allocator,
33                                       unsigned CodePoint) {
34   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35   char *ResolvedPtr = Resolved;
36   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37     return StringRef(Resolved, ResolvedPtr - Resolved);
38   else
39     return StringRef();
40 }
41 
42 namespace {
43 
44 #include "clang/AST/CommentHTMLTags.inc"
45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
46 
47 } // unnamed namespace
48 
resolveHTMLNamedCharacterReference(StringRef Name) const49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
50   // Fast path, first check a few most widely used named character references.
51   return llvm::StringSwitch<StringRef>(Name)
52       .Case("amp", "&")
53       .Case("lt", "<")
54       .Case("gt", ">")
55       .Case("quot", "\"")
56       .Case("apos", "\'")
57       // Slow path.
58       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59 }
60 
resolveHTMLDecimalCharacterReference(StringRef Name) const61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62   unsigned CodePoint = 0;
63   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65     CodePoint *= 10;
66     CodePoint += Name[i] - '0';
67   }
68   return convertCodePointToUTF8(Allocator, CodePoint);
69 }
70 
resolveHTMLHexCharacterReference(StringRef Name) const71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
72   unsigned CodePoint = 0;
73   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74     CodePoint *= 16;
75     const char C = Name[i];
76     assert(isHTMLHexCharacterReferenceCharacter(C));
77     CodePoint += llvm::hexDigitValue(C);
78   }
79   return convertCodePointToUTF8(Allocator, CodePoint);
80 }
81 
skipLineStartingDecorations()82 void Lexer::skipLineStartingDecorations() {
83   // This function should be called only for C comments
84   assert(CommentState == LCS_InsideCComment);
85 
86   if (BufferPtr == CommentEnd)
87     return;
88 
89   switch (*BufferPtr) {
90   case ' ':
91   case '\t':
92   case '\f':
93   case '\v': {
94     const char *NewBufferPtr = BufferPtr;
95     NewBufferPtr++;
96     if (NewBufferPtr == CommentEnd)
97       return;
98 
99     char C = *NewBufferPtr;
100     while (isHorizontalWhitespace(C)) {
101       NewBufferPtr++;
102       if (NewBufferPtr == CommentEnd)
103         return;
104       C = *NewBufferPtr;
105     }
106     if (C == '*')
107       BufferPtr = NewBufferPtr + 1;
108     break;
109   }
110   case '*':
111     BufferPtr++;
112     break;
113   }
114 }
115 
116 namespace {
117 /// Returns pointer to the first newline character in the string.
findNewline(const char * BufferPtr,const char * BufferEnd)118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
119   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120     if (isVerticalWhitespace(*BufferPtr))
121       return BufferPtr;
122   }
123   return BufferEnd;
124 }
125 
/// Skips one line terminator at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \p BufferPtr must either equal \p BufferEnd (nothing is skipped) or point
/// at '\n' or '\r'.
///
/// \returns a pointer just past the terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char *Next = BufferPtr + 1;
  if (*BufferPtr == '\n')
    return Next;

  assert(*BufferPtr == '\r');
  // A '\n' directly after the '\r' belongs to the same terminator.
  if (Next != BufferEnd && *Next == '\n')
    return Next + 1;
  return Next;
}
140 
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)141 const char *skipNamedCharacterReference(const char *BufferPtr,
142                                         const char *BufferEnd) {
143   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145       return BufferPtr;
146   }
147   return BufferEnd;
148 }
149 
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)150 const char *skipDecimalCharacterReference(const char *BufferPtr,
151                                           const char *BufferEnd) {
152   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154       return BufferPtr;
155   }
156   return BufferEnd;
157 }
158 
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)159 const char *skipHexCharacterReference(const char *BufferPtr,
160                                       const char *BufferEnd) {
161   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163       return BufferPtr;
164   }
165   return BufferEnd;
166 }
167 
isHTMLIdentifierStartingCharacter(char C)168 bool isHTMLIdentifierStartingCharacter(char C) {
169   return isLetter(C);
170 }
171 
isHTMLIdentifierCharacter(char C)172 bool isHTMLIdentifierCharacter(char C) {
173   return isAlphanumeric(C);
174 }
175 
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
177   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178     if (!isHTMLIdentifierCharacter(*BufferPtr))
179       return BufferPtr;
180   }
181   return BufferEnd;
182 }
183 
/// Scans an HTML string delimited by single or double quotes.  \p BufferPtr
/// must point at the opening quote.  A quote character that directly follows
/// a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    // Cur - 1 is always valid here: at worst it is the opening quote.
    if (*Cur == Quote && *(Cur - 1) != '\\')
      break;
    ++Cur;
  }
  return Cur;
}
201 
skipWhitespace(const char * BufferPtr,const char * BufferEnd)202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
203   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204     if (!isWhitespace(*BufferPtr))
205       return BufferPtr;
206   }
207   return BufferEnd;
208 }
209 
isWhitespace(const char * BufferPtr,const char * BufferEnd)210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
211   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
212 }
213 
isCommandNameStartCharacter(char C)214 bool isCommandNameStartCharacter(char C) {
215   return isLetter(C);
216 }
217 
isCommandNameCharacter(char C)218 bool isCommandNameCharacter(char C) {
219   return isAlphanumeric(C);
220 }
221 
skipCommandName(const char * BufferPtr,const char * BufferEnd)222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
223   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224     if (!isCommandNameCharacter(*BufferPtr))
225       return BufferPtr;
226   }
227   return BufferEnd;
228 }
229 
230 /// Return the one past end pointer for BCPL comments.
231 /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
233   const char *CurPtr = BufferPtr;
234   while (CurPtr != BufferEnd) {
235     while (!isVerticalWhitespace(*CurPtr)) {
236       CurPtr++;
237       if (CurPtr == BufferEnd)
238         return BufferEnd;
239     }
240     // We found a newline, check if it is escaped.
241     const char *EscapePtr = CurPtr - 1;
242     while(isHorizontalWhitespace(*EscapePtr))
243       EscapePtr--;
244 
245     if (*EscapePtr == '\\' ||
246         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
247          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
248       // We found an escaped newline.
249       CurPtr = skipNewline(CurPtr, BufferEnd);
250     } else
251       return CurPtr; // Not an escaped newline.
252   }
253   return BufferEnd;
254 }
255 
256 /// Return the one past end pointer for C comments.
257 /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
259   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260     if (*BufferPtr == '*') {
261       assert(BufferPtr + 1 != BufferEnd);
262       if (*(BufferPtr + 1) == '/')
263         return BufferPtr;
264     }
265   }
266   llvm_unreachable("buffer end hit before '*/' was seen");
267 }
268 
269 } // unnamed namespace
270 
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)271 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
272                                tok::TokenKind Kind) {
273   const unsigned TokLen = TokEnd - BufferPtr;
274   Result.setLocation(getSourceLocation(BufferPtr));
275   Result.setKind(Kind);
276   Result.setLength(TokLen);
277 #ifndef NDEBUG
278   Result.TextPtr = "<UNSET>";
279   Result.IntVal = 7;
280 #endif
281   BufferPtr = TokEnd;
282 }
283 
/// Lex the next token from the body of the current comment.
///
/// In one of the verbatim or HTML-tag states the work is delegated to the
/// matching state-specific routine.  In LS_Normal the character at BufferPtr
/// selects what is produced: a command or escape sequence for a backslash or
/// '@', an HTML character reference for '&', an HTML tag for '<', a newline
/// token for '\n' and '\r', and plain text for anything else.
///
/// Must be called while inside a comment and with BufferPtr before
/// CommentEnd (both are asserted).
///
/// \param T token that receives the result.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Every state other than LS_Normal has a dedicated lexing routine.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);

  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  // Every case below returns, so the loop body runs at most once.
  while (TokenPtr != CommentEnd) {
    switch(*TokenPtr) {
      case '\\':
      case '@': {
        // Commands that start with a backslash and commands that start with
        // 'at' have equivalent semantics.  But we keep information about the
        // exact syntax in AST for comments.
        tok::TokenKind CommandKind =
            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          // A lone marker at the very end of the comment is plain text.
          formTextToken(T, TokenPtr);
          return;
        }
        char C = *TokenPtr;
        switch (C) {
        default:
          break;

        case '\\': case '@': case '&': case '$':
        case '#':  case '<': case '>': case '%':
        case '\"': case '.': case ':':
          // This is one of \\ \@ \& \$ etc escape sequences.
          TokenPtr++;
          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
            // This is the \:: escape sequence.
            TokenPtr++;
          }
          // The token covers the marker too, but its text is only what
          // follows the marker.
          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(UnescapedText);
          return;
        }

        // Don't make zero-length commands.
        if (!isCommandNameStartCharacter(*TokenPtr)) {
          formTextToken(T, TokenPtr);
          return;
        }

        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
        // Length of the name, not counting the marker character.
        unsigned Length = TokenPtr - (BufferPtr + 1);

        // Hardcoded support for lexing LaTeX formula commands
        // \f$ \f[ \f] \f{ \f} as a single command.
        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
          C = *TokenPtr;
          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
            TokenPtr++;
            Length++;
          }
        }

        StringRef CommandName(BufferPtr + 1, Length);

        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
        if (!Info) {
          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
            // Warn, offer a fix-it that replaces the name, and then carry on
            // below with the corrected command's info.
            StringRef CorrectedName = Info->Name;
            SourceLocation Loc = getSourceLocation(BufferPtr);
            SourceRange CommandRange(Loc.getLocWithOffset(1),
                                     getSourceLocation(TokenPtr));
            Diag(Loc, diag::warn_correct_comment_command_name)
              << CommandName << CorrectedName
              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
          } else {
            // Neither a known command nor a correctable typo.
            formTokenWithChars(T, TokenPtr, tok::unknown_command);
            T.setUnknownCommandName(CommandName);
            Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
            return;
          }
        }
        if (Info->IsVerbatimBlockCommand) {
          // *BufferPtr is the marker character that started the command.
          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
          return;
        }
        if (Info->IsVerbatimLineCommand) {
          setupAndLexVerbatimLine(T, TokenPtr, Info);
          return;
        }
        formTokenWithChars(T, TokenPtr, CommandKind);
        T.setCommandID(Info->getID());
        return;
      }

      case '&':
        lexHTMLCharacterReference(T);
        return;

      case '<': {
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        // The character after '<' decides between a start tag, an end tag
        // and plain text.
        const char C = *TokenPtr;
        if (isHTMLIdentifierStartingCharacter(C))
          setupAndLexHTMLStartTag(T);
        else if (C == '/')
          setupAndLexHTMLEndTag(T);
        else
          formTextToken(T, TokenPtr);

        return;
      }

      case '\n':
      case '\r':
        TokenPtr = skipNewline(TokenPtr, CommentEnd);
        formTokenWithChars(T, TokenPtr, tok::newline);

        // In a C comment the next line may start with a decoration to skip.
        if (CommentState == LCS_InsideCComment)
          skipLineStartingDecorations();
        return;

      default: {
        // Plain text runs up to the next character that can start another
        // kind of token, or to the end of the comment.
        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
                         find_first_of("\n\r\\@&<");
        if (End != StringRef::npos)
          TokenPtr += End;
        else
          TokenPtr = CommentEnd;
        formTextToken(T, TokenPtr);
        return;
      }
    }
  }
}
440 
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)441 void Lexer::setupAndLexVerbatimBlock(Token &T,
442                                      const char *TextBegin,
443                                      char Marker, const CommandInfo *Info) {
444   assert(Info->IsVerbatimBlockCommand);
445 
446   VerbatimBlockEndCommandName.clear();
447   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
448   VerbatimBlockEndCommandName.append(Info->EndCommandName);
449 
450   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
451   T.setVerbatimBlockID(Info->getID());
452 
453   // If there is a newline following the verbatim opening command, skip the
454   // newline so that we don't create an tok::verbatim_block_line with empty
455   // text content.
456   if (BufferPtr != CommentEnd &&
457       isVerticalWhitespace(*BufferPtr)) {
458     BufferPtr = skipNewline(BufferPtr, CommentEnd);
459     State = LS_VerbatimBlockBody;
460     return;
461   }
462 
463   State = LS_VerbatimBlockFirstLine;
464 }
465 
lexVerbatimBlockFirstLine(Token & T)466 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
467 again:
468   assert(BufferPtr < CommentEnd);
469 
470   // FIXME: It would be better to scan the text once, finding either the block
471   // end command or newline.
472   //
473   // Extract current line.
474   const char *Newline = findNewline(BufferPtr, CommentEnd);
475   StringRef Line(BufferPtr, Newline - BufferPtr);
476 
477   // Look for end command in current line.
478   size_t Pos = Line.find(VerbatimBlockEndCommandName);
479   const char *TextEnd;
480   const char *NextLine;
481   if (Pos == StringRef::npos) {
482     // Current line is completely verbatim.
483     TextEnd = Newline;
484     NextLine = skipNewline(Newline, CommentEnd);
485   } else if (Pos == 0) {
486     // Current line contains just an end command.
487     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
488     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
489     formTokenWithChars(T, End, tok::verbatim_block_end);
490     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
491     State = LS_Normal;
492     return;
493   } else {
494     // There is some text, followed by end command.  Extract text first.
495     TextEnd = BufferPtr + Pos;
496     NextLine = TextEnd;
497     // If there is only whitespace before end command, skip whitespace.
498     if (isWhitespace(BufferPtr, TextEnd)) {
499       BufferPtr = TextEnd;
500       goto again;
501     }
502   }
503 
504   StringRef Text(BufferPtr, TextEnd - BufferPtr);
505   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
506   T.setVerbatimBlockText(Text);
507 
508   State = LS_VerbatimBlockBody;
509 }
510 
lexVerbatimBlockBody(Token & T)511 void Lexer::lexVerbatimBlockBody(Token &T) {
512   assert(State == LS_VerbatimBlockBody);
513 
514   if (CommentState == LCS_InsideCComment)
515     skipLineStartingDecorations();
516 
517   if (BufferPtr == CommentEnd) {
518     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
519     T.setVerbatimBlockText("");
520     return;
521   }
522 
523   lexVerbatimBlockFirstLine(T);
524 }
525 
setupAndLexVerbatimLine(Token & T,const char * TextBegin,const CommandInfo * Info)526 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
527                                     const CommandInfo *Info) {
528   assert(Info->IsVerbatimLineCommand);
529   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
530   T.setVerbatimLineID(Info->getID());
531 
532   State = LS_VerbatimLineText;
533 }
534 
lexVerbatimLineText(Token & T)535 void Lexer::lexVerbatimLineText(Token &T) {
536   assert(State == LS_VerbatimLineText);
537 
538   // Extract current line.
539   const char *Newline = findNewline(BufferPtr, CommentEnd);
540   StringRef Text(BufferPtr, Newline - BufferPtr);
541   formTokenWithChars(T, Newline, tok::verbatim_line_text);
542   T.setVerbatimLineText(Text);
543 
544   State = LS_Normal;
545 }
546 
lexHTMLCharacterReference(Token & T)547 void Lexer::lexHTMLCharacterReference(Token &T) {
548   const char *TokenPtr = BufferPtr;
549   assert(*TokenPtr == '&');
550   TokenPtr++;
551   if (TokenPtr == CommentEnd) {
552     formTextToken(T, TokenPtr);
553     return;
554   }
555   const char *NamePtr;
556   bool isNamed = false;
557   bool isDecimal = false;
558   char C = *TokenPtr;
559   if (isHTMLNamedCharacterReferenceCharacter(C)) {
560     NamePtr = TokenPtr;
561     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
562     isNamed = true;
563   } else if (C == '#') {
564     TokenPtr++;
565     if (TokenPtr == CommentEnd) {
566       formTextToken(T, TokenPtr);
567       return;
568     }
569     C = *TokenPtr;
570     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
571       NamePtr = TokenPtr;
572       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
573       isDecimal = true;
574     } else if (C == 'x' || C == 'X') {
575       TokenPtr++;
576       NamePtr = TokenPtr;
577       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
578     } else {
579       formTextToken(T, TokenPtr);
580       return;
581     }
582   } else {
583     formTextToken(T, TokenPtr);
584     return;
585   }
586   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
587       *TokenPtr != ';') {
588     formTextToken(T, TokenPtr);
589     return;
590   }
591   StringRef Name(NamePtr, TokenPtr - NamePtr);
592   TokenPtr++; // Skip semicolon.
593   StringRef Resolved;
594   if (isNamed)
595     Resolved = resolveHTMLNamedCharacterReference(Name);
596   else if (isDecimal)
597     Resolved = resolveHTMLDecimalCharacterReference(Name);
598   else
599     Resolved = resolveHTMLHexCharacterReference(Name);
600 
601   if (Resolved.empty()) {
602     formTextToken(T, TokenPtr);
603     return;
604   }
605   formTokenWithChars(T, TokenPtr, tok::text);
606   T.setText(Resolved);
607   return;
608 }
609 
setupAndLexHTMLStartTag(Token & T)610 void Lexer::setupAndLexHTMLStartTag(Token &T) {
611   assert(BufferPtr[0] == '<' &&
612          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
613   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
614   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
615   if (!isHTMLTagName(Name)) {
616     formTextToken(T, TagNameEnd);
617     return;
618   }
619 
620   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
621   T.setHTMLTagStartName(Name);
622 
623   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
624 
625   const char C = *BufferPtr;
626   if (BufferPtr != CommentEnd &&
627       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
628     State = LS_HTMLStartTag;
629 }
630 
lexHTMLStartTag(Token & T)631 void Lexer::lexHTMLStartTag(Token &T) {
632   assert(State == LS_HTMLStartTag);
633 
634   const char *TokenPtr = BufferPtr;
635   char C = *TokenPtr;
636   if (isHTMLIdentifierCharacter(C)) {
637     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
638     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
639     formTokenWithChars(T, TokenPtr, tok::html_ident);
640     T.setHTMLIdent(Ident);
641   } else {
642     switch (C) {
643     case '=':
644       TokenPtr++;
645       formTokenWithChars(T, TokenPtr, tok::html_equals);
646       break;
647     case '\"':
648     case '\'': {
649       const char *OpenQuote = TokenPtr;
650       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
651       const char *ClosingQuote = TokenPtr;
652       if (TokenPtr != CommentEnd) // Skip closing quote.
653         TokenPtr++;
654       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
655       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
656                                       ClosingQuote - (OpenQuote + 1)));
657       break;
658     }
659     case '>':
660       TokenPtr++;
661       formTokenWithChars(T, TokenPtr, tok::html_greater);
662       State = LS_Normal;
663       return;
664     case '/':
665       TokenPtr++;
666       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
667         TokenPtr++;
668         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
669       } else
670         formTextToken(T, TokenPtr);
671 
672       State = LS_Normal;
673       return;
674     }
675   }
676 
677   // Now look ahead and return to normal state if we don't see any HTML tokens
678   // ahead.
679   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
680   if (BufferPtr == CommentEnd) {
681     State = LS_Normal;
682     return;
683   }
684 
685   C = *BufferPtr;
686   if (!isHTMLIdentifierStartingCharacter(C) &&
687       C != '=' && C != '\"' && C != '\'' && C != '>') {
688     State = LS_Normal;
689     return;
690   }
691 }
692 
setupAndLexHTMLEndTag(Token & T)693 void Lexer::setupAndLexHTMLEndTag(Token &T) {
694   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
695 
696   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
697   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
698   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
699   if (!isHTMLTagName(Name)) {
700     formTextToken(T, TagNameEnd);
701     return;
702   }
703 
704   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
705 
706   formTokenWithChars(T, End, tok::html_end_tag);
707   T.setHTMLTagEndName(Name);
708 
709   if (BufferPtr != CommentEnd && *BufferPtr == '>')
710     State = LS_HTMLEndTag;
711 }
712 
lexHTMLEndTag(Token & T)713 void Lexer::lexHTMLEndTag(Token &T) {
714   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
715 
716   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
717   State = LS_Normal;
718 }
719 
Lexer(llvm::BumpPtrAllocator & Allocator,DiagnosticsEngine & Diags,const CommandTraits & Traits,SourceLocation FileLoc,const char * BufferStart,const char * BufferEnd)720 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
721              const CommandTraits &Traits,
722              SourceLocation FileLoc,
723              const char *BufferStart, const char *BufferEnd):
724     Allocator(Allocator), Diags(Diags), Traits(Traits),
725     BufferStart(BufferStart), BufferEnd(BufferEnd),
726     FileLoc(FileLoc), BufferPtr(BufferStart),
727     CommentState(LCS_BeforeComment), State(LS_Normal) {
728 }
729 
/// Lex the next token from the buffer into \p T.
///
/// The buffer is expected to hold one or more comments.  CommentState tracks
/// where the lexer is relative to them:
///  - LCS_BeforeComment: at the opening "//" or "/*" (or at the end of the
///    buffer, which yields tok::eof).  The opener and any Doxygen marker are
///    skipped and CommentEnd is computed for the comment.
///  - LCS_InsideBCPLComment / LCS_InsideCComment: the comment text is lexed
///    until CommentEnd is reached.
///  - LCS_BetweenComments: everything up to the next '/' becomes a single
///    tok::newline.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // The two verbatim-block states are kept as they are when a new BCPL
      // comment begins; every other state is reset to LS_Normal.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.  A '*' that is directly followed by '/'
      // is the start of the closing "*/", not a marker.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
831 
getSpelling(const Token & Tok,const SourceManager & SourceMgr,bool * Invalid) const832 StringRef Lexer::getSpelling(const Token &Tok,
833                              const SourceManager &SourceMgr,
834                              bool *Invalid) const {
835   SourceLocation Loc = Tok.getLocation();
836   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
837 
838   bool InvalidTemp = false;
839   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
840   if (InvalidTemp) {
841     *Invalid = true;
842     return StringRef();
843   }
844 
845   const char *Begin = File.data() + LocInfo.second;
846   return StringRef(Begin, Tok.getLength());
847 }
848 
849 } // end namespace comments
850 } // end namespace clang
851 
852