1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
//  This file defines the lexer for structured comments and the supporting
//  token class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
15 #define LLVM_CLANG_AST_COMMENTLEXER_H
16 
17 #include "clang/Basic/Diagnostic.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "llvm/ADT/SmallString.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/Support/Allocator.h"
23 #include "llvm/Support/raw_ostream.h"
24 
25 namespace clang {
26 namespace comments {
27 
28 class Lexer;
29 class TextTokenRetokenizer;
30 struct CommandInfo;
31 class CommandTraits;
32 
33 namespace tok {
34 enum TokenKind {
35   eof,
36   newline,
37   text,
38   unknown_command,   // Command that does not have an ID.
39   backslash_command, // Command with an ID, that used backslash marker.
40   at_command,        // Command with an ID, that used 'at' marker.
41   verbatim_block_begin,
42   verbatim_block_line,
43   verbatim_block_end,
44   verbatim_line_name,
45   verbatim_line_text,
46   html_start_tag,     // <tag
47   html_ident,         // attr
48   html_equals,        // =
49   html_quoted_string, // "blah\"blah" or 'blah\'blah'
50   html_greater,       // >
51   html_slash_greater, // />
52   html_end_tag        // </tag
53 };
54 } // end namespace tok
55 
56 /// \brief Comment token.
57 class Token {
58   friend class Lexer;
59   friend class TextTokenRetokenizer;
60 
61   /// The location of the token.
62   SourceLocation Loc;
63 
64   /// The actual kind of the token.
65   tok::TokenKind Kind;
66 
67   /// Length of the token spelling in comment.  Can be 0 for synthenized
68   /// tokens.
69   unsigned Length;
70 
71   /// Contains text value associated with a token.
72   const char *TextPtr;
73 
74   /// Integer value associated with a token.
75   ///
76   /// If the token is a konwn command, contains command ID and TextPtr is
77   /// unused (command spelling can be found with CommandTraits).  Otherwise,
78   /// contains the length of the string that starts at TextPtr.
79   unsigned IntVal;
80 
81 public:
getLocation()82   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
setLocation(SourceLocation SL)83   void setLocation(SourceLocation SL) { Loc = SL; }
84 
getEndLocation()85   SourceLocation getEndLocation() const LLVM_READONLY {
86     if (Length == 0 || Length == 1)
87       return Loc;
88     return Loc.getLocWithOffset(Length - 1);
89   }
90 
getKind()91   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
setKind(tok::TokenKind K)92   void setKind(tok::TokenKind K) { Kind = K; }
93 
is(tok::TokenKind K)94   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
isNot(tok::TokenKind K)95   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
96 
getLength()97   unsigned getLength() const LLVM_READONLY { return Length; }
setLength(unsigned L)98   void setLength(unsigned L) { Length = L; }
99 
getText()100   StringRef getText() const LLVM_READONLY {
101     assert(is(tok::text));
102     return StringRef(TextPtr, IntVal);
103   }
104 
setText(StringRef Text)105   void setText(StringRef Text) {
106     assert(is(tok::text));
107     TextPtr = Text.data();
108     IntVal = Text.size();
109   }
110 
getUnknownCommandName()111   StringRef getUnknownCommandName() const LLVM_READONLY {
112     assert(is(tok::unknown_command));
113     return StringRef(TextPtr, IntVal);
114   }
115 
setUnknownCommandName(StringRef Name)116   void setUnknownCommandName(StringRef Name) {
117     assert(is(tok::unknown_command));
118     TextPtr = Name.data();
119     IntVal = Name.size();
120   }
121 
getCommandID()122   unsigned getCommandID() const LLVM_READONLY {
123     assert(is(tok::backslash_command) || is(tok::at_command));
124     return IntVal;
125   }
126 
setCommandID(unsigned ID)127   void setCommandID(unsigned ID) {
128     assert(is(tok::backslash_command) || is(tok::at_command));
129     IntVal = ID;
130   }
131 
getVerbatimBlockID()132   unsigned getVerbatimBlockID() const LLVM_READONLY {
133     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
134     return IntVal;
135   }
136 
setVerbatimBlockID(unsigned ID)137   void setVerbatimBlockID(unsigned ID) {
138     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
139     IntVal = ID;
140   }
141 
getVerbatimBlockText()142   StringRef getVerbatimBlockText() const LLVM_READONLY {
143     assert(is(tok::verbatim_block_line));
144     return StringRef(TextPtr, IntVal);
145   }
146 
setVerbatimBlockText(StringRef Text)147   void setVerbatimBlockText(StringRef Text) {
148     assert(is(tok::verbatim_block_line));
149     TextPtr = Text.data();
150     IntVal = Text.size();
151   }
152 
getVerbatimLineID()153   unsigned getVerbatimLineID() const LLVM_READONLY {
154     assert(is(tok::verbatim_line_name));
155     return IntVal;
156   }
157 
setVerbatimLineID(unsigned ID)158   void setVerbatimLineID(unsigned ID) {
159     assert(is(tok::verbatim_line_name));
160     IntVal = ID;
161   }
162 
getVerbatimLineText()163   StringRef getVerbatimLineText() const LLVM_READONLY {
164     assert(is(tok::verbatim_line_text));
165     return StringRef(TextPtr, IntVal);
166   }
167 
setVerbatimLineText(StringRef Text)168   void setVerbatimLineText(StringRef Text) {
169     assert(is(tok::verbatim_line_text));
170     TextPtr = Text.data();
171     IntVal = Text.size();
172   }
173 
getHTMLTagStartName()174   StringRef getHTMLTagStartName() const LLVM_READONLY {
175     assert(is(tok::html_start_tag));
176     return StringRef(TextPtr, IntVal);
177   }
178 
setHTMLTagStartName(StringRef Name)179   void setHTMLTagStartName(StringRef Name) {
180     assert(is(tok::html_start_tag));
181     TextPtr = Name.data();
182     IntVal = Name.size();
183   }
184 
getHTMLIdent()185   StringRef getHTMLIdent() const LLVM_READONLY {
186     assert(is(tok::html_ident));
187     return StringRef(TextPtr, IntVal);
188   }
189 
setHTMLIdent(StringRef Name)190   void setHTMLIdent(StringRef Name) {
191     assert(is(tok::html_ident));
192     TextPtr = Name.data();
193     IntVal = Name.size();
194   }
195 
getHTMLQuotedString()196   StringRef getHTMLQuotedString() const LLVM_READONLY {
197     assert(is(tok::html_quoted_string));
198     return StringRef(TextPtr, IntVal);
199   }
200 
setHTMLQuotedString(StringRef Str)201   void setHTMLQuotedString(StringRef Str) {
202     assert(is(tok::html_quoted_string));
203     TextPtr = Str.data();
204     IntVal = Str.size();
205   }
206 
getHTMLTagEndName()207   StringRef getHTMLTagEndName() const LLVM_READONLY {
208     assert(is(tok::html_end_tag));
209     return StringRef(TextPtr, IntVal);
210   }
211 
setHTMLTagEndName(StringRef Name)212   void setHTMLTagEndName(StringRef Name) {
213     assert(is(tok::html_end_tag));
214     TextPtr = Name.data();
215     IntVal = Name.size();
216   }
217 
218   void dump(const Lexer &L, const SourceManager &SM) const;
219 };
220 
221 /// \brief Comment lexer.
222 class Lexer {
223 private:
224   Lexer(const Lexer &) = delete;
225   void operator=(const Lexer &) = delete;
226 
227   /// Allocator for strings that are semantic values of tokens and have to be
228   /// computed (for example, resolved decimal character references).
229   llvm::BumpPtrAllocator &Allocator;
230 
231   DiagnosticsEngine &Diags;
232 
233   const CommandTraits &Traits;
234 
235   const char *const BufferStart;
236   const char *const BufferEnd;
237   SourceLocation FileLoc;
238 
239   const char *BufferPtr;
240 
241   /// One past end pointer for the current comment.  For BCPL comments points
242   /// to newline or BufferEnd, for C comments points to star in '*/'.
243   const char *CommentEnd;
244 
245   enum LexerCommentState {
246     LCS_BeforeComment,
247     LCS_InsideBCPLComment,
248     LCS_InsideCComment,
249     LCS_BetweenComments
250   };
251 
252   /// Low-level lexer state, track if we are inside or outside of comment.
253   LexerCommentState CommentState;
254 
255   enum LexerState {
256     /// Lexing normal comment text
257     LS_Normal,
258 
259     /// Finished lexing verbatim block beginning command, will lex first body
260     /// line.
261     LS_VerbatimBlockFirstLine,
262 
263     /// Lexing verbatim block body line-by-line, skipping line-starting
264     /// decorations.
265     LS_VerbatimBlockBody,
266 
267     /// Finished lexing verbatim line beginning command, will lex text (one
268     /// line).
269     LS_VerbatimLineText,
270 
271     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
272     LS_HTMLStartTag,
273 
274     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
275     LS_HTMLEndTag
276   };
277 
278   /// Current lexing mode.
279   LexerState State;
280 
281   /// If State is LS_VerbatimBlock, contains the name of verbatim end
282   /// command, including command marker.
283   SmallString<16> VerbatimBlockEndCommandName;
284 
285   /// Given a character reference name (e.g., "lt"), return the character that
286   /// it stands for (e.g., "<").
287   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
288 
289   /// Given a Unicode codepoint as base-10 integer, return the character.
290   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
291 
292   /// Given a Unicode codepoint as base-16 integer, return the character.
293   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
294 
295   void formTokenWithChars(Token &Result, const char *TokEnd,
296                           tok::TokenKind Kind);
297 
formTextToken(Token & Result,const char * TokEnd)298   void formTextToken(Token &Result, const char *TokEnd) {
299     StringRef Text(BufferPtr, TokEnd - BufferPtr);
300     formTokenWithChars(Result, TokEnd, tok::text);
301     Result.setText(Text);
302   }
303 
getSourceLocation(const char * Loc)304   SourceLocation getSourceLocation(const char *Loc) const {
305     assert(Loc >= BufferStart && Loc <= BufferEnd &&
306            "Location out of range for this buffer!");
307 
308     const unsigned CharNo = Loc - BufferStart;
309     return FileLoc.getLocWithOffset(CharNo);
310   }
311 
Diag(SourceLocation Loc,unsigned DiagID)312   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
313     return Diags.Report(Loc, DiagID);
314   }
315 
316   /// Eat string matching regexp \code \s*\* \endcode.
317   void skipLineStartingDecorations();
318 
319   /// Lex stuff inside comments.  CommentEnd should be set correctly.
320   void lexCommentText(Token &T);
321 
322   void setupAndLexVerbatimBlock(Token &T,
323                                 const char *TextBegin,
324                                 char Marker, const CommandInfo *Info);
325 
326   void lexVerbatimBlockFirstLine(Token &T);
327 
328   void lexVerbatimBlockBody(Token &T);
329 
330   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
331                                const CommandInfo *Info);
332 
333   void lexVerbatimLineText(Token &T);
334 
335   void lexHTMLCharacterReference(Token &T);
336 
337   void setupAndLexHTMLStartTag(Token &T);
338 
339   void lexHTMLStartTag(Token &T);
340 
341   void setupAndLexHTMLEndTag(Token &T);
342 
343   void lexHTMLEndTag(Token &T);
344 
345 public:
346   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
347         const CommandTraits &Traits,
348         SourceLocation FileLoc,
349         const char *BufferStart, const char *BufferEnd);
350 
351   void lex(Token &T);
352 
353   StringRef getSpelling(const Token &Tok,
354                         const SourceManager &SourceMgr,
355                         bool *Invalid = nullptr) const;
356 };
357 
358 } // end namespace comments
359 } // end namespace clang
360 
361 #endif
362 
363