1 // -*- mode: C++ -*-
2 
3 // Copyright (c) 2010 Google Inc. All Rights Reserved.
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // CFI reader author: Jim Blandy <jimb@mozilla.com> <jimb@red-bean.com>
32 
// This file contains definitions related to the DWARF2/3 reader and
// its handler interfaces.
// The DWARF2/3 specification can be found at
// http://dwarf.freestandards.org and should be considered required
// reading if you wish to modify the implementation.
// Only a cursory attempt is made to explain terminology that is
// used here, as it is much better explained in the standard documents.
40 #ifndef COMMON_DWARF_DWARF2READER_H__
41 #define COMMON_DWARF_DWARF2READER_H__
42 
43 #include <list>
44 #include <map>
45 #include <string>
46 #include <utility>
47 #include <vector>
48 
49 #include "common/dwarf/bytereader.h"
50 #include "common/dwarf/dwarf2enums.h"
51 #include "common/dwarf/types.h"
52 #include "common/using_std_string.h"
53 
54 namespace dwarf2reader {
// Forward declarations of types that the declarations below refer to
// by pointer before (or without) defining them in this header.
struct LineStateMachine;
class Dwarf2Handler;
class LineInfoHandler;
58 
59 // This maps from a string naming a section to a pair containing a
60 // the data for the section, and the size of the section.
61 typedef std::map<string, std::pair<const char*, uint64> > SectionMap;
62 typedef std::list<std::pair<enum DwarfAttribute, enum DwarfForm> >
63     AttributeList;
64 typedef AttributeList::iterator AttributeIterator;
65 typedef AttributeList::const_iterator ConstAttributeIterator;
66 
67 struct LineInfoHeader {
68   uint64 total_length;
69   uint16 version;
70   uint64 prologue_length;
71   uint8 min_insn_length; // insn stands for instructin
72   bool default_is_stmt; // stmt stands for statement
73   int8 line_base;
74   uint8 line_range;
75   uint8 opcode_base;
76   // Use a pointer so that signalsafe_addr2line is able to use this structure
77   // without heap allocation problem.
78   std::vector<unsigned char> *std_opcode_lengths;
79 };
80 
81 class LineInfo {
82  public:
83 
84   // Initializes a .debug_line reader. Buffer and buffer length point
85   // to the beginning and length of the line information to read.
86   // Reader is a ByteReader class that has the endianness set
87   // properly.
88   LineInfo(const char* buffer_, uint64 buffer_length,
89            ByteReader* reader, LineInfoHandler* handler);
90 
~LineInfo()91   virtual ~LineInfo() {
92     if (header_.std_opcode_lengths) {
93       delete header_.std_opcode_lengths;
94     }
95   }
96 
97   // Start processing line info, and calling callbacks in the handler.
98   // Consumes the line number information for a single compilation unit.
99   // Returns the number of bytes processed.
100   uint64 Start();
101 
102   // Process a single line info opcode at START using the state
103   // machine at LSM.  Return true if we should define a line using the
104   // current state of the line state machine.  Place the length of the
105   // opcode in LEN.
106   // If LSM_PASSES_PC is non-NULL, this function also checks if the lsm
107   // passes the address of PC. In other words, LSM_PASSES_PC will be
108   // set to true, if the following condition is met.
109   //
110   // lsm's old address < PC <= lsm's new address
111   static bool ProcessOneOpcode(ByteReader* reader,
112                                LineInfoHandler* handler,
113                                const struct LineInfoHeader &header,
114                                const char* start,
115                                struct LineStateMachine* lsm,
116                                size_t* len,
117                                uintptr pc,
118                                bool *lsm_passes_pc);
119 
120  private:
121   // Reads the DWARF2/3 header for this line info.
122   void ReadHeader();
123 
124   // Reads the DWARF2/3 line information
125   void ReadLines();
126 
127   // The associated handler to call processing functions in
128   LineInfoHandler* handler_;
129 
130   // The associated ByteReader that handles endianness issues for us
131   ByteReader* reader_;
132 
133   // A DWARF2/3 line info header.  This is not the same size as
134   // in the actual file, as the one in the file may have a 32 bit or
135   // 64 bit lengths
136 
137   struct LineInfoHeader header_;
138 
139   // buffer is the buffer for our line info, starting at exactly where
140   // the line info to read is.  after_header is the place right after
141   // the end of the line information header.
142   const char* buffer_;
143   uint64 buffer_length_;
144   const char* after_header_;
145 };
146 
147 // This class is the main interface between the line info reader and
148 // the client.  The virtual functions inside this get called for
149 // interesting events that happen during line info reading.  The
150 // default implementation does nothing
151 
152 class LineInfoHandler {
153  public:
LineInfoHandler()154   LineInfoHandler() { }
155 
~LineInfoHandler()156   virtual ~LineInfoHandler() { }
157 
158   // Called when we define a directory.  NAME is the directory name,
159   // DIR_NUM is the directory number
DefineDir(const string & name,uint32 dir_num)160   virtual void DefineDir(const string& name, uint32 dir_num) { }
161 
162   // Called when we define a filename. NAME is the filename, FILE_NUM
163   // is the file number which is -1 if the file index is the next
164   // index after the last numbered index (this happens when files are
165   // dynamically defined by the line program), DIR_NUM is the
166   // directory index for the directory name of this file, MOD_TIME is
167   // the modification time of the file, and LENGTH is the length of
168   // the file
DefineFile(const string & name,int32 file_num,uint32 dir_num,uint64 mod_time,uint64 length)169   virtual void DefineFile(const string& name, int32 file_num,
170                           uint32 dir_num, uint64 mod_time,
171                           uint64 length) { }
172 
173   // Called when the line info reader has a new line, address pair
174   // ready for us. ADDRESS is the address of the code, LENGTH is the
175   // length of its machine code in bytes, FILE_NUM is the file number
176   // containing the code, LINE_NUM is the line number in that file for
177   // the code, and COLUMN_NUM is the column number the code starts at,
178   // if we know it (0 otherwise).
AddLine(uint64 address,uint64 length,uint32 file_num,uint32 line_num,uint32 column_num)179   virtual void AddLine(uint64 address, uint64 length,
180                        uint32 file_num, uint32 line_num, uint32 column_num) { }
181 };
182 
// The base of DWARF2/3 debug info is a DIE (Debugging Information
// Entry).
// DWARF groups DIEs into a tree and calls the root of this tree a
// "compilation unit".  Most of the time, there is one compilation
// unit in the .debug_info section for each file that had debug info
// generated.
// Each DIE consists of

// 1. a tag specifying a thing that is being described (e.g.
// DW_TAG_subprogram for functions, DW_TAG_variable for variables, etc.)
// 2. attributes (such as DW_AT_location for location in memory,
// DW_AT_name for name), and data for each attribute.
// 3. A flag saying whether the DIE has children or not
196 
197 // In order to gain some amount of compression, the format of
198 // each DIE (tag name, attributes and data forms for the attributes)
199 // are stored in a separate table called the "abbreviation table".
200 // This is done because a large number of DIEs have the exact same tag
201 // and list of attributes, but different data for those attributes.
202 // As a result, the .debug_info section is just a stream of data, and
203 // requires reading of the .debug_abbrev section to say what the data
204 // means.
205 
// As a warning to the user, it should be noted that the reason for
// using absolute offsets from the beginning of .debug_info is that
// DWARF2/3 supports referencing DIEs from other DIEs by their offset
// from either the current compilation unit start, *or* the beginning
// of the .debug_info section.  This means it is possible to reference
// a DIE in one compilation unit from a DIE in another compilation
// unit.  This style of reference is usually used to eliminate
// duplicated information that occurs across compilation
// units, such as base types, etc.  GCC 3.4+ supports this with
// -feliminate-dwarf2-dups.  Other toolchains will sometimes do
// duplicate elimination in the linker.
217 
218 class CompilationUnit {
219  public:
220 
221   // Initialize a compilation unit.  This requires a map of sections,
222   // the offset of this compilation unit in the .debug_info section, a
223   // ByteReader, and a Dwarf2Handler class to call callbacks in.
224   CompilationUnit(const SectionMap& sections, uint64 offset,
225                   ByteReader* reader, Dwarf2Handler* handler);
~CompilationUnit()226   virtual ~CompilationUnit() {
227     if (abbrevs_) delete abbrevs_;
228   }
229 
230   // Begin reading a Dwarf2 compilation unit, and calling the
231   // callbacks in the Dwarf2Handler
232 
233   // Return the full length of the compilation unit, including
234   // headers. This plus the starting offset passed to the constructor
235   // is the offset of the end of the compilation unit --- and the
236   // start of the next compilation unit, if there is one.
237   uint64 Start();
238 
239  private:
240 
241   // This struct represents a single DWARF2/3 abbreviation
242   // The abbreviation tells how to read a DWARF2/3 DIE, and consist of a
243   // tag and a list of attributes, as well as the data form of each attribute.
244   struct Abbrev {
245     uint64 number;
246     enum DwarfTag tag;
247     bool has_children;
248     AttributeList attributes;
249   };
250 
251   // A DWARF2/3 compilation unit header.  This is not the same size as
252   // in the actual file, as the one in the file may have a 32 bit or
253   // 64 bit length.
254   struct CompilationUnitHeader {
255     uint64 length;
256     uint16 version;
257     uint64 abbrev_offset;
258     uint8 address_size;
259   } header_;
260 
261   // Reads the DWARF2/3 header for this compilation unit.
262   void ReadHeader();
263 
264   // Reads the DWARF2/3 abbreviations for this compilation unit
265   void ReadAbbrevs();
266 
267   // Processes a single DIE for this compilation unit and return a new
268   // pointer just past the end of it
269   const char* ProcessDIE(uint64 dieoffset,
270                                   const char* start,
271                                   const Abbrev& abbrev);
272 
273   // Processes a single attribute and return a new pointer just past the
274   // end of it
275   const char* ProcessAttribute(uint64 dieoffset,
276                                         const char* start,
277                                         enum DwarfAttribute attr,
278                                         enum DwarfForm form);
279 
280   // Processes all DIEs for this compilation unit
281   void ProcessDIEs();
282 
283   // Skips the die with attributes specified in ABBREV starting at
284   // START, and return the new place to position the stream to.
285   const char* SkipDIE(const char* start,
286                                const Abbrev& abbrev);
287 
288   // Skips the attribute starting at START, with FORM, and return the
289   // new place to position the stream to.
290   const char* SkipAttribute(const char* start,
291                                      enum DwarfForm form);
292 
293   // Offset from section start is the offset of this compilation unit
294   // from the beginning of the .debug_info section.
295   uint64 offset_from_section_start_;
296 
297   // buffer is the buffer for our CU, starting at .debug_info + offset
298   // passed in from constructor.
299   // after_header points to right after the compilation unit header.
300   const char* buffer_;
301   uint64 buffer_length_;
302   const char* after_header_;
303 
304   // The associated ByteReader that handles endianness issues for us
305   ByteReader* reader_;
306 
307   // The map of sections in our file to buffers containing their data
308   const SectionMap& sections_;
309 
310   // The associated handler to call processing functions in
311   Dwarf2Handler* handler_;
312 
313   // Set of DWARF2/3 abbreviations for this compilation unit.  Indexed
314   // by abbreviation number, which means that abbrevs_[0] is not
315   // valid.
316   std::vector<Abbrev>* abbrevs_;
317 
318   // String section buffer and length, if we have a string section.
319   // This is here to avoid doing a section lookup for strings in
320   // ProcessAttribute, which is in the hot path for DWARF2 reading.
321   const char* string_buffer_;
322   uint64 string_buffer_length_;
323 };
324 
325 // This class is the main interface between the reader and the
326 // client.  The virtual functions inside this get called for
327 // interesting events that happen during DWARF2 reading.
328 // The default implementation skips everything.
329 
330 class Dwarf2Handler {
331  public:
Dwarf2Handler()332   Dwarf2Handler() { }
333 
~Dwarf2Handler()334   virtual ~Dwarf2Handler() { }
335 
336   // Start to process a compilation unit at OFFSET from the beginning of the
337   // .debug_info section. Return false if you would like to skip this
338   // compilation unit.
StartCompilationUnit(uint64 offset,uint8 address_size,uint8 offset_size,uint64 cu_length,uint8 dwarf_version)339   virtual bool StartCompilationUnit(uint64 offset, uint8 address_size,
340                                     uint8 offset_size, uint64 cu_length,
341                                     uint8 dwarf_version) { return false; }
342 
343   // Start to process a DIE at OFFSET from the beginning of the .debug_info
344   // section. Return false if you would like to skip this DIE.
StartDIE(uint64 offset,enum DwarfTag tag)345   virtual bool StartDIE(uint64 offset, enum DwarfTag tag) { return false; }
346 
347   // Called when we have an attribute with unsigned data to give to our
348   // handler. The attribute is for the DIE at OFFSET from the beginning of the
349   // .debug_info section. Its name is ATTR, its form is FORM, and its value is
350   // DATA.
ProcessAttributeUnsigned(uint64 offset,enum DwarfAttribute attr,enum DwarfForm form,uint64 data)351   virtual void ProcessAttributeUnsigned(uint64 offset,
352                                         enum DwarfAttribute attr,
353                                         enum DwarfForm form,
354                                         uint64 data) { }
355 
356   // Called when we have an attribute with signed data to give to our handler.
357   // The attribute is for the DIE at OFFSET from the beginning of the
358   // .debug_info section. Its name is ATTR, its form is FORM, and its value is
359   // DATA.
ProcessAttributeSigned(uint64 offset,enum DwarfAttribute attr,enum DwarfForm form,int64 data)360   virtual void ProcessAttributeSigned(uint64 offset,
361                                       enum DwarfAttribute attr,
362                                       enum DwarfForm form,
363                                       int64 data) { }
364 
365   // Called when we have an attribute whose value is a reference to
366   // another DIE. The attribute belongs to the DIE at OFFSET from the
367   // beginning of the .debug_info section. Its name is ATTR, its form
368   // is FORM, and the offset of the DIE being referred to from the
369   // beginning of the .debug_info section is DATA.
ProcessAttributeReference(uint64 offset,enum DwarfAttribute attr,enum DwarfForm form,uint64 data)370   virtual void ProcessAttributeReference(uint64 offset,
371                                          enum DwarfAttribute attr,
372                                          enum DwarfForm form,
373                                          uint64 data) { }
374 
375   // Called when we have an attribute with a buffer of data to give to our
376   // handler. The attribute is for the DIE at OFFSET from the beginning of the
377   // .debug_info section. Its name is ATTR, its form is FORM, DATA points to
378   // the buffer's contents, and its length in bytes is LENGTH. The buffer is
379   // owned by the caller, not the callee, and may not persist for very long.
380   // If you want the data to be available later, it needs to be copied.
ProcessAttributeBuffer(uint64 offset,enum DwarfAttribute attr,enum DwarfForm form,const char * data,uint64 len)381   virtual void ProcessAttributeBuffer(uint64 offset,
382                                       enum DwarfAttribute attr,
383                                       enum DwarfForm form,
384                                       const char* data,
385                                       uint64 len) { }
386 
387   // Called when we have an attribute with string data to give to our handler.
388   // The attribute is for the DIE at OFFSET from the beginning of the
389   // .debug_info section. Its name is ATTR, its form is FORM, and its value is
390   // DATA.
ProcessAttributeString(uint64 offset,enum DwarfAttribute attr,enum DwarfForm form,const string & data)391   virtual void ProcessAttributeString(uint64 offset,
392                                       enum DwarfAttribute attr,
393                                       enum DwarfForm form,
394                                       const string& data) { }
395 
396   // Called when we have an attribute whose value is the 64-bit signature
397   // of a type unit in the .debug_types section. OFFSET is the offset of
398   // the DIE whose attribute we're reporting. ATTR and FORM are the
399   // attribute's name and form. SIGNATURE is the type unit's signature.
ProcessAttributeSignature(uint64 offset,enum DwarfAttribute attr,enum DwarfForm form,uint64 signature)400   virtual void ProcessAttributeSignature(uint64 offset,
401                                          enum DwarfAttribute attr,
402                                          enum DwarfForm form,
403                                          uint64 signature) { }
404 
405   // Called when finished processing the DIE at OFFSET.
406   // Because DWARF2/3 specifies a tree of DIEs, you may get starts
407   // before ends of the previous DIE, as we process children before
408   // ending the parent.
EndDIE(uint64 offset)409   virtual void EndDIE(uint64 offset) { }
410 
411 };
412 
413 // This class is a reader for DWARF's Call Frame Information.  CFI
414 // describes how to unwind stack frames --- even for functions that do
415 // not follow fixed conventions for saving registers, whose frame size
416 // varies as they execute, etc.
417 //
418 // CFI describes, at each machine instruction, how to compute the
419 // stack frame's base address, how to find the return address, and
420 // where to find the saved values of the caller's registers (if the
421 // callee has stashed them somewhere to free up the registers for its
422 // own use).
423 //
424 // For example, suppose we have a function whose machine code looks
425 // like this (imagine an assembly language that looks like C, for a
426 // machine with 32-bit registers, and a stack that grows towards lower
427 // addresses):
428 //
429 // func:                                ; entry point; return address at sp
430 // func+0:      sp = sp - 16            ; allocate space for stack frame
431 // func+1:      sp[12] = r0             ; save r0 at sp+12
432 // ...                                  ; other code, not frame-related
433 // func+10:     sp -= 4; *sp = x        ; push some x on the stack
434 // ...                                  ; other code, not frame-related
435 // func+20:     r0 = sp[16]             ; restore saved r0
436 // func+21:     sp += 20                ; pop whole stack frame
437 // func+22:     pc = *sp; sp += 4       ; pop return address and jump to it
438 //
439 // DWARF CFI is (a very compressed representation of) a table with a
440 // row for each machine instruction address and a column for each
441 // register showing how to restore it, if possible.
442 //
443 // A special column named "CFA", for "Canonical Frame Address", tells how
444 // to compute the base address of the frame; registers' entries may
445 // refer to the CFA in describing where the registers are saved.
446 //
447 // Another special column, named "RA", represents the return address.
448 //
449 // For example, here is a complete (uncompressed) table describing the
450 // function above:
451 //
452 //     insn      cfa    r0      r1 ...  ra
453 //     =======================================
454 //     func+0:   sp                     cfa[0]
455 //     func+1:   sp+16                  cfa[0]
456 //     func+2:   sp+16  cfa[-4]         cfa[0]
457 //     func+11:  sp+20  cfa[-4]         cfa[0]
458 //     func+21:  sp+20                  cfa[0]
459 //     func+22:  sp                     cfa[0]
460 //
461 // Some things to note here:
462 //
463 // - Each row describes the state of affairs *before* executing the
464 //   instruction at the given address.  Thus, the row for func+0
465 //   describes the state before we allocate the stack frame.  In the
466 //   next row, the formula for computing the CFA has changed,
467 //   reflecting that allocation.
468 //
469 // - The other entries are written in terms of the CFA; this allows
470 //   them to remain unchanged as the stack pointer gets bumped around.
471 //   For example, the rule for recovering the return address (the "ra"
472 //   column) remains unchanged throughout the function, even as the
473 //   stack pointer takes on three different offsets from the return
474 //   address.
475 //
476 // - Although we haven't shown it, most calling conventions designate
477 //   "callee-saves" and "caller-saves" registers. The callee must
478 //   preserve the values of callee-saves registers; if it uses them,
479 //   it must save their original values somewhere, and restore them
480 //   before it returns. In contrast, the callee is free to trash
481 //   caller-saves registers; if the callee uses these, it will
482 //   probably not bother to save them anywhere, and the CFI will
483 //   probably mark their values as "unrecoverable".
484 //
//   (However, since the caller cannot assume the callee was going to
//   save them, caller-saves registers are probably dead in the caller
//   anyway, so compilers usually don't generate CFI for caller-saves
//   registers.)
489 //
490 // - Exactly where the CFA points is a matter of convention that
491 //   depends on the architecture and ABI in use. In the example, the
492 //   CFA is the value the stack pointer had upon entry to the
493 //   function, pointing at the saved return address. But on the x86,
494 //   the call frame information generated by GCC follows the
495 //   convention that the CFA is the address *after* the saved return
496 //   address.
497 //
//   But by definition, the CFA remains constant throughout the
//   lifetime of the frame. This makes it a useful value for other
//   columns to refer to. It also gives debuggers a useful handle
//   for identifying a frame.
502 //
503 // If you look at the table above, you'll notice that a given entry is
504 // often the same as the one immediately above it: most instructions
505 // change only one or two aspects of the stack frame, if they affect
506 // it at all. The DWARF format takes advantage of this fact, and
507 // reduces the size of the data by mentioning only the addresses and
508 // columns at which changes take place. So for the above, DWARF CFI
509 // data would only actually mention the following:
510 //
511 //     insn      cfa    r0      r1 ...  ra
512 //     =======================================
513 //     func+0:   sp                     cfa[0]
514 //     func+1:   sp+16
515 //     func+2:          cfa[-4]
516 //     func+11:  sp+20
517 //     func+21:         r0
518 //     func+22:  sp
519 //
520 // In fact, this is the way the parser reports CFI to the consumer: as
521 // a series of statements of the form, "At address X, column Y changed
522 // to Z," and related conventions for describing the initial state.
523 //
524 // Naturally, it would be impractical to have to scan the entire
525 // program's CFI, noting changes as we go, just to recover the
526 // unwinding rules in effect at one particular instruction. To avoid
527 // this, CFI data is grouped into "entries", each of which covers a
528 // specified range of addresses and begins with a complete statement
529 // of the rules for all recoverable registers at that starting
530 // address. Each entry typically covers a single function.
531 //
532 // Thus, to compute the contents of a given row of the table --- that
533 // is, rules for recovering the CFA, RA, and registers at a given
534 // instruction --- the consumer should find the entry that covers that
535 // instruction's address, start with the initial state supplied at the
536 // beginning of the entry, and work forward until it has processed all
537 // the changes up to and including those for the present instruction.
538 //
539 // There are seven kinds of rules that can appear in an entry of the
540 // table:
541 //
542 // - "undefined": The given register is not preserved by the callee;
543 //   its value cannot be recovered.
544 //
545 // - "same value": This register has the same value it did in the callee.
546 //
547 // - offset(N): The register is saved at offset N from the CFA.
548 //
549 // - val_offset(N): The value the register had in the caller is the
550 //   CFA plus offset N. (This is usually only useful for describing
551 //   the stack pointer.)
552 //
553 // - register(R): The register's value was saved in another register R.
554 //
555 // - expression(E): Evaluating the DWARF expression E using the
556 //   current frame's registers' values yields the address at which the
557 //   register was saved.
558 //
559 // - val_expression(E): Evaluating the DWARF expression E using the
560 //   current frame's registers' values yields the value the register
561 //   had in the caller.
562 
563 class CallFrameInfo {
564  public:
565   // The different kinds of entries one finds in CFI. Used internally,
566   // and for error reporting.
567   enum EntryKind { kUnknown, kCIE, kFDE, kTerminator };
568 
569   // The handler class to which the parser hands the parsed call frame
570   // information.  Defined below.
571   class Handler;
572 
573   // A reporter class, which CallFrameInfo uses to report errors
574   // encountered while parsing call frame information.  Defined below.
575   class Reporter;
576 
577   // Create a DWARF CFI parser. BUFFER points to the contents of the
578   // .debug_frame section to parse; BUFFER_LENGTH is its length in bytes.
579   // REPORTER is an error reporter the parser should use to report
580   // problems. READER is a ByteReader instance that has the endianness and
581   // address size set properly. Report the data we find to HANDLER.
582   //
583   // This class can also parse Linux C++ exception handling data, as found
584   // in '.eh_frame' sections. This data is a variant of DWARF CFI that is
585   // placed in loadable segments so that it is present in the program's
586   // address space, and is interpreted by the C++ runtime to search the
587   // call stack for a handler interested in the exception being thrown,
588   // actually pop the frames, and find cleanup code to run.
589   //
590   // There are two differences between the call frame information described
591   // in the DWARF standard and the exception handling data Linux places in
592   // the .eh_frame section:
593   //
  // - Exception handling data uses a different format for call frame
  //   information entry headers. The distinguished CIE id, the way FDEs
  //   refer to their CIEs, and the way the end of the series of entries is
  //   determined are all slightly different.
598   //
599   //   If the constructor's EH_FRAME argument is true, then the
600   //   CallFrameInfo parses the entry headers as Linux C++ exception
601   //   handling data. If EH_FRAME is false or omitted, the CallFrameInfo
602   //   parses standard DWARF call frame information.
603   //
604   // - Linux C++ exception handling data uses CIE augmentation strings
605   //   beginning with 'z' to specify the presence of additional data after
606   //   the CIE and FDE headers and special encodings used for addresses in
607   //   frame description entries.
608   //
609   //   CallFrameInfo can handle 'z' augmentations in either DWARF CFI or
610   //   exception handling data if you have supplied READER with the base
611   //   addresses needed to interpret the pointer encodings that 'z'
612   //   augmentations can specify. See the ByteReader interface for details
613   //   about the base addresses. See the CallFrameInfo::Handler interface
614   //   for details about the additional information one might find in
615   //   'z'-augmented data.
616   //
617   // Thus:
618   //
619   // - If you are parsing standard DWARF CFI, as found in a .debug_frame
620   //   section, you should pass false for the EH_FRAME argument, or omit
621   //   it, and you need not worry about providing READER with the
622   //   additional base addresses.
623   //
624   // - If you want to parse Linux C++ exception handling data from a
625   //   .eh_frame section, you should pass EH_FRAME as true, and call
626   //   READER's Set*Base member functions before calling our Start method.
627   //
628   // - If you want to parse DWARF CFI that uses the 'z' augmentations
629   //   (although I don't think any toolchain ever emits such data), you
630   //   could pass false for EH_FRAME, but call READER's Set*Base members.
631   //
632   // The extensions the Linux C++ ABI makes to DWARF for exception
633   // handling are described here, rather poorly:
634   // http://refspecs.linux-foundation.org/LSB_4.0.0/LSB-Core-generic/LSB-Core-generic/dwarfext.html
635   // http://refspecs.linux-foundation.org/LSB_4.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html
636   //
637   // The mechanics of C++ exception handling, personality routines,
638   // and language-specific data areas are described here, rather nicely:
639   // http://www.codesourcery.com/public/cxx-abi/abi-eh.html
640   CallFrameInfo(const char *buffer, size_t buffer_length,
641                 ByteReader *reader, Handler *handler, Reporter *reporter,
642                 bool eh_frame = false)
buffer_(buffer)643       : buffer_(buffer), buffer_length_(buffer_length),
644         reader_(reader), handler_(handler), reporter_(reporter),
645         eh_frame_(eh_frame) { }
646 
~CallFrameInfo() {
  // Empty body: nothing is released or deleted here.
}
648 
649   // Parse the entries in BUFFER, reporting what we find to HANDLER.
650   // Return true if we reach the end of the section successfully, or
651   // false if we encounter an error.
652   bool Start();
653 
654   // Return the textual name of KIND. For error reporting.
655   static const char *KindName(EntryKind kind);
656 
657  private:
658 
659   struct CIE;
660 
661   // A CFI entry, either an FDE or a CIE.
662   struct Entry {
663     // The starting offset of the entry in the section, for error
664     // reporting.
665     size_t offset;
666 
667     // The start of this entry in the buffer.
668     const char *start;
669 
670     // Which kind of entry this is.
671     //
672     // We want to be able to use this for error reporting even while we're
673     // in the midst of parsing. Error reporting code may assume that kind,
674     // offset, and start fields are valid, although kind may be kUnknown.
675     EntryKind kind;
676 
677     // The end of this entry's common prologue (initial length and id), and
678     // the start of this entry's kind-specific fields.
679     const char *fields;
680 
681     // The start of this entry's instructions.
682     const char *instructions;
683 
684     // The address past the entry's last byte in the buffer. (Note that
685     // since offset points to the entry's initial length field, and the
686     // length field is the number of bytes after that field, this is not
687     // simply buffer_ + offset + length.)
688     const char *end;
689 
690     // For both DWARF CFI and .eh_frame sections, this is the CIE id in a
691     // CIE, and the offset of the associated CIE in an FDE.
692     uint64 id;
693 
694     // The CIE that applies to this entry, if we've parsed it. If this is a
695     // CIE, then this field points to this structure.
696     CIE *cie;
697   };
698 
699   // A common information entry (CIE).
700   struct CIE: public Entry {
701     uint8 version;                      // CFI data version number
702     string augmentation;                // vendor format extension markers
703     uint64 code_alignment_factor;       // scale for code address adjustments
704     int data_alignment_factor;          // scale for stack pointer adjustments
705     unsigned return_address_register;   // which register holds the return addr
706 
707     // True if this CIE includes Linux C++ ABI 'z' augmentation data.
708     bool has_z_augmentation;
709 
710     // Parsed 'z' augmentation data. These are meaningful only if
711     // has_z_augmentation is true.
712     bool has_z_lsda;                    // The 'z' augmentation included 'L'.
713     bool has_z_personality;             // The 'z' augmentation included 'P'.
714     bool has_z_signal_frame;            // The 'z' augmentation included 'S'.
715 
716     // If has_z_lsda is true, this is the encoding to be used for language-
717     // specific data area pointers in FDEs.
718     DwarfPointerEncoding lsda_encoding;
719 
720     // If has_z_personality is true, this is the encoding used for the
721     // personality routine pointer in the augmentation data.
722     DwarfPointerEncoding personality_encoding;
723 
724     // If has_z_personality is true, this is the address of the personality
725     // routine --- or, if personality_encoding & DW_EH_PE_indirect, the
726     // address where the personality routine's address is stored.
727     uint64 personality_address;
728 
729     // This is the encoding used for addresses in the FDE header and
730     // in DW_CFA_set_loc instructions. This is always valid, whether
731     // or not we saw a 'z' augmentation string; its default value is
732     // DW_EH_PE_absptr, which is what normal DWARF CFI uses.
733     DwarfPointerEncoding pointer_encoding;
734   };
735 
736   // A frame description entry (FDE).
737   struct FDE: public Entry {
738     uint64 address;                     // start address of described code
739     uint64 size;                        // size of described code, in bytes
740 
741     // If cie->has_z_lsda is true, then this is the language-specific data
742     // area's address --- or its address's address, if cie->lsda_encoding
743     // has the DW_EH_PE_indirect bit set.
744     uint64 lsda_address;
745   };
746 
747   // Internal use.
748   class Rule;
749   class UndefinedRule;
750   class SameValueRule;
751   class OffsetRule;
752   class ValOffsetRule;
753   class RegisterRule;
754   class ExpressionRule;
755   class ValExpressionRule;
756   class RuleMap;
757   class State;
758 
759   // Parse the initial length and id of a CFI entry, either a CIE, an FDE,
760   // or a .eh_frame end-of-data mark. CURSOR points to the beginning of the
761   // data to parse. On success, populate ENTRY as appropriate, and return
762   // true. On failure, report the problem, and return false. Even if we
763   // return false, set ENTRY->end to the first byte after the entry if we
764   // were able to figure that out, or NULL if we weren't.
765   bool ReadEntryPrologue(const char *cursor, Entry *entry);
766 
767   // Parse the fields of a CIE after the entry prologue, including any 'z'
768   // augmentation data. Assume that the 'Entry' fields of CIE are
769   // populated; use CIE->fields and CIE->end as the start and limit for
770   // parsing. On success, populate the rest of *CIE, and return true; on
771   // failure, report the problem and return false.
772   bool ReadCIEFields(CIE *cie);
773 
774   // Parse the fields of an FDE after the entry prologue, including any 'z'
775   // augmentation data. Assume that the 'Entry' fields of *FDE are
776   // initialized; use FDE->fields and FDE->end as the start and limit for
777   // parsing. Assume that FDE->cie is fully initialized. On success,
778   // populate the rest of *FDE, and return true; on failure, report the
779   // problem and return false.
780   bool ReadFDEFields(FDE *fde);
781 
782   // Report that ENTRY is incomplete, and return false. This is just a
783   // trivial wrapper for invoking reporter_->Incomplete; it provides a
784   // little brevity.
785   bool ReportIncomplete(Entry *entry);
786 
787   // Return true if ENCODING has the DW_EH_PE_indirect bit set.
IsIndirectEncoding(DwarfPointerEncoding encoding)788   static bool IsIndirectEncoding(DwarfPointerEncoding encoding) {
789     return encoding & DW_EH_PE_indirect;
790   }
791 
  // The contents of the CFI section (.debug_frame or .eh_frame) we're
  // parsing, and its length.
793   const char *buffer_;
794   size_t buffer_length_;
795 
796   // For reading multi-byte values with the appropriate endianness.
797   ByteReader *reader_;
798 
799   // The handler to which we should report the data we find.
800   Handler *handler_;
801 
802   // For reporting problems in the info we're parsing.
803   Reporter *reporter_;
804 
805   // True if we are processing .eh_frame-format data.
806   bool eh_frame_;
807 };
808 
// The handler class for CallFrameInfo.  A CFI parser calls the
// member functions of a handler object to report the data it finds.
811 class CallFrameInfo::Handler {
812  public:
813   // The pseudo-register number for the canonical frame address.
814   enum { kCFARegister = -1 };
815 
Handler()816   Handler() { }
~Handler()817   virtual ~Handler() { }
818 
819   // The parser has found CFI for the machine code at ADDRESS,
820   // extending for LENGTH bytes. OFFSET is the offset of the frame
821   // description entry in the section, for use in error messages.
822   // VERSION is the version number of the CFI format. AUGMENTATION is
823   // a string describing any producer-specific extensions present in
824   // the data. RETURN_ADDRESS is the number of the register that holds
825   // the address to which the function should return.
826   //
827   // Entry should return true to process this CFI, or false to skip to
828   // the next entry.
829   //
830   // The parser invokes Entry for each Frame Description Entry (FDE)
831   // it finds.  The parser doesn't report Common Information Entries
832   // to the handler explicitly; instead, if the handler elects to
833   // process a given FDE, the parser reiterates the appropriate CIE's
834   // contents at the beginning of the FDE's rules.
835   virtual bool Entry(size_t offset, uint64 address, uint64 length,
836                      uint8 version, const string &augmentation,
837                      unsigned return_address) = 0;
838 
839   // When the Entry function returns true, the parser calls these
840   // handler functions repeatedly to describe the rules for recovering
841   // registers at each instruction in the given range of machine code.
842   // Immediately after a call to Entry, the handler should assume that
843   // the rule for each callee-saves register is "unchanged" --- that
844   // is, that the register still has the value it had in the caller.
845   //
846   // If a *Rule function returns true, we continue processing this entry's
847   // instructions. If a *Rule function returns false, we stop evaluating
848   // instructions, and skip to the next entry. Either way, we call End
849   // before going on to the next entry.
850   //
851   // In all of these functions, if the REG parameter is kCFARegister, then
852   // the rule describes how to find the canonical frame address.
853   // kCFARegister may be passed as a BASE_REGISTER argument, meaning that
854   // the canonical frame address should be used as the base address for the
855   // computation. All other REG values will be positive.
856 
857   // At ADDRESS, register REG's value is not recoverable.
858   virtual bool UndefinedRule(uint64 address, int reg) = 0;
859 
860   // At ADDRESS, register REG's value is the same as that it had in
861   // the caller.
862   virtual bool SameValueRule(uint64 address, int reg) = 0;
863 
864   // At ADDRESS, register REG has been saved at offset OFFSET from
865   // BASE_REGISTER.
866   virtual bool OffsetRule(uint64 address, int reg,
867                           int base_register, long offset) = 0;
868 
869   // At ADDRESS, the caller's value of register REG is the current
870   // value of BASE_REGISTER plus OFFSET. (This rule doesn't provide an
871   // address at which the register's value is saved.)
872   virtual bool ValOffsetRule(uint64 address, int reg,
873                              int base_register, long offset) = 0;
874 
875   // At ADDRESS, register REG has been saved in BASE_REGISTER. This differs
876   // from ValOffsetRule(ADDRESS, REG, BASE_REGISTER, 0), in that
877   // BASE_REGISTER is the "home" for REG's saved value: if you want to
878   // assign to a variable whose home is REG in the calling frame, you
879   // should put the value in BASE_REGISTER.
880   virtual bool RegisterRule(uint64 address, int reg, int base_register) = 0;
881 
882   // At ADDRESS, the DWARF expression EXPRESSION yields the address at
883   // which REG was saved.
884   virtual bool ExpressionRule(uint64 address, int reg,
885                               const string &expression) = 0;
886 
887   // At ADDRESS, the DWARF expression EXPRESSION yields the caller's
888   // value for REG. (This rule doesn't provide an address at which the
889   // register's value is saved.)
890   virtual bool ValExpressionRule(uint64 address, int reg,
891                                  const string &expression) = 0;
892 
893   // Indicate that the rules for the address range reported by the
894   // last call to Entry are complete.  End should return true if
895   // everything is okay, or false if an error has occurred and parsing
896   // should stop.
897   virtual bool End() = 0;
898 
899   // Handler functions for Linux C++ exception handling data. These are
900   // only called if the data includes 'z' augmentation strings.
901 
902   // The Linux C++ ABI uses an extension of the DWARF CFI format to
903   // walk the stack to propagate exceptions from the throw to the
904   // appropriate catch, and do the appropriate cleanups along the way.
905   // CFI entries used for exception handling have two additional data
906   // associated with them:
907   //
908   // - The "language-specific data area" describes which exception
909   //   types the function has 'catch' clauses for, and indicates how
910   //   to go about re-entering the function at the appropriate catch
911   //   clause. If the exception is not caught, it describes the
912   //   destructors that must run before the frame is popped.
913   //
914   // - The "personality routine" is responsible for interpreting the
915   //   language-specific data area's contents, and deciding whether
916   //   the exception should continue to propagate down the stack,
917   //   perhaps after doing some cleanup for this frame, or whether the
918   //   exception will be caught here.
919   //
920   // In principle, the language-specific data area is opaque to
921   // everybody but the personality routine. In practice, these values
922   // may be useful or interesting to readers with extra context, and
923   // we have to at least skip them anyway, so we might as well report
924   // them to the handler.
925 
926   // This entry's exception handling personality routine's address is
927   // ADDRESS. If INDIRECT is true, then ADDRESS is the address at
928   // which the routine's address is stored. The default definition for
929   // this handler function simply returns true, allowing parsing of
930   // the entry to continue.
PersonalityRoutine(uint64 address,bool indirect)931   virtual bool PersonalityRoutine(uint64 address, bool indirect) {
932     return true;
933   }
934 
935   // This entry's language-specific data area (LSDA) is located at
936   // ADDRESS. If INDIRECT is true, then ADDRESS is the address at
937   // which the area's address is stored. The default definition for
938   // this handler function simply returns true, allowing parsing of
939   // the entry to continue.
LanguageSpecificDataArea(uint64 address,bool indirect)940   virtual bool LanguageSpecificDataArea(uint64 address, bool indirect) {
941     return true;
942   }
943 
944   // This entry describes a signal trampoline --- this frame is the
945   // caller of a signal handler. The default definition for this
946   // handler function simply returns true, allowing parsing of the
947   // entry to continue.
948   //
949   // The best description of the rationale for and meaning of signal
950   // trampoline CFI entries seems to be in the GCC bug database:
951   // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26208
SignalHandler()952   virtual bool SignalHandler() { return true; }
953 };
954 
955 // The CallFrameInfo class makes calls on an instance of this class to
956 // report errors or warn about problems in the data it is parsing. The
957 // default definitions of these methods print a message to stderr, but
958 // you can make a derived class that overrides them.
959 class CallFrameInfo::Reporter {
960  public:
961   // Create an error reporter which attributes troubles to the section
962   // named SECTION in FILENAME.
963   //
964   // Normally SECTION would be .debug_frame, but the Mac puts CFI data
965   // in a Mach-O section named __debug_frame. If we support
966   // Linux-style exception handling data, we could be reading an
967   // .eh_frame section.
968   Reporter(const string &filename,
969            const string &section = ".debug_frame")
filename_(filename)970       : filename_(filename), section_(section) { }
~Reporter()971   virtual ~Reporter() { }
972 
973   // The CFI entry at OFFSET ends too early to be well-formed. KIND
974   // indicates what kind of entry it is; KIND can be kUnknown if we
975   // haven't parsed enough of the entry to tell yet.
976   virtual void Incomplete(uint64 offset, CallFrameInfo::EntryKind kind);
977 
978   // The .eh_frame data has a four-byte zero at OFFSET where the next
979   // entry's length would be; this is a terminator. However, the buffer
980   // length as given to the CallFrameInfo constructor says there should be
981   // more data.
982   virtual void EarlyEHTerminator(uint64 offset);
983 
984   // The FDE at OFFSET refers to the CIE at CIE_OFFSET, but the
985   // section is not that large.
986   virtual void CIEPointerOutOfRange(uint64 offset, uint64 cie_offset);
987 
988   // The FDE at OFFSET refers to the CIE at CIE_OFFSET, but the entry
989   // there is not a CIE.
990   virtual void BadCIEId(uint64 offset, uint64 cie_offset);
991 
992   // The FDE at OFFSET refers to a CIE with version number VERSION,
993   // which we don't recognize. We cannot parse DWARF CFI if it uses
994   // a version number we don't recognize.
995   virtual void UnrecognizedVersion(uint64 offset, int version);
996 
997   // The FDE at OFFSET refers to a CIE with augmentation AUGMENTATION,
998   // which we don't recognize. We cannot parse DWARF CFI if it uses
999   // augmentations we don't recognize.
1000   virtual void UnrecognizedAugmentation(uint64 offset,
1001                                         const string &augmentation);
1002 
1003   // The pointer encoding ENCODING, specified by the CIE at OFFSET, is not
1004   // a valid encoding.
1005   virtual void InvalidPointerEncoding(uint64 offset, uint8 encoding);
1006 
1007   // The pointer encoding ENCODING, specified by the CIE at OFFSET, depends
1008   // on a base address which has not been supplied.
1009   virtual void UnusablePointerEncoding(uint64 offset, uint8 encoding);
1010 
1011   // The CIE at OFFSET contains a DW_CFA_restore instruction at
1012   // INSN_OFFSET, which may not appear in a CIE.
1013   virtual void RestoreInCIE(uint64 offset, uint64 insn_offset);
1014 
1015   // The entry at OFFSET, of kind KIND, has an unrecognized
1016   // instruction at INSN_OFFSET.
1017   virtual void BadInstruction(uint64 offset, CallFrameInfo::EntryKind kind,
1018                               uint64 insn_offset);
1019 
1020   // The instruction at INSN_OFFSET in the entry at OFFSET, of kind
1021   // KIND, establishes a rule that cites the CFA, but we have not
1022   // established a CFA rule yet.
1023   virtual void NoCFARule(uint64 offset, CallFrameInfo::EntryKind kind,
1024                          uint64 insn_offset);
1025 
1026   // The instruction at INSN_OFFSET in the entry at OFFSET, of kind
1027   // KIND, is a DW_CFA_restore_state instruction, but the stack of
1028   // saved states is empty.
1029   virtual void EmptyStateStack(uint64 offset, CallFrameInfo::EntryKind kind,
1030                                uint64 insn_offset);
1031 
1032   // The DW_CFA_remember_state instruction at INSN_OFFSET in the entry
1033   // at OFFSET, of kind KIND, would restore a state that has no CFA
1034   // rule, whereas the current state does have a CFA rule. This is
1035   // bogus input, which the CallFrameInfo::Handler interface doesn't
1036   // (and shouldn't) have any way to report.
1037   virtual void ClearingCFARule(uint64 offset, CallFrameInfo::EntryKind kind,
1038                                uint64 insn_offset);
1039 
1040  protected:
1041   // The name of the file whose CFI we're reading.
1042   string filename_;
1043 
1044   // The name of the CFI section in that file.
1045   string section_;
1046 };
1047 
1048 }  // namespace dwarf2reader
1049 
1050 #endif  // UTIL_DEBUGINFO_DWARF2READER_H__
1051