1 // Copyright 2008 Google Inc. All Rights Reserved.
2 
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // error_diag.h: Ambiguous error diagnosis class
16 
17 #ifndef STRESSAPPTEST_ERROR_DIAG_H_
18 #define STRESSAPPTEST_ERROR_DIAG_H_
19 
20 #include <pthread.h>
21 #include <list>
22 #include <map>
23 #include <set>
24 #include <string>
25 
26 // This file must work with autoconf on its public version,
27 // so these includes are correct.
28 #include "sattypes.h"
29 #include "os.h"
30 
31 class ErrorInstance;
32 
33 // This describes the components of the system.
34 class DeviceTree {
35  public:
36   explicit DeviceTree(string name);
37   ~DeviceTree();
38 
39   // Atomically find arbitrary device in subtree.
40   DeviceTree *FindInSubTree(string name);
41   // Find or add named device.
42   DeviceTree *FindOrAddDevice(string name);
43   // Atomically add sub device.
44   void InsertSubDevice(string name);
45   // Returns parent device.
GetParent()46   DeviceTree *GetParent() { return parent_; }
47   // Pretty prints device tree.
48   void PrettyPrint(string spacer = " ");
49   // Atomically add error instance to device.
50   void AddErrorInstance(ErrorInstance *error_instance);
51   // Returns true of device is known to be bad.
52   bool KnownBad();
53   // Returns number of direct sub devices.
NumDirectSubDevices()54   int NumDirectSubDevices() { return subdevices_.size(); }
55 
56  private:
57   // Unlocked version of FindInSubTree.
58   DeviceTree *UnlockedFindInSubTree(string name);
59 
60   std::map<string, DeviceTree*> subdevices_;    // Map of sub-devices.
61   std::list<ErrorInstance*> errors_;            // Log of errors.
62   DeviceTree *parent_;                          // Pointer to parent device.
63   string name_;                                 // Device name.
64   pthread_mutex_t device_tree_mutex_;           // Mutex protecting device tree.
65 };
66 
67 
// Category of a collected error. Values are spelled out explicitly and match
// the implicit numbering (0, 1, 2, 3).
enum SATErrorType {
  SAT_ERROR_NONE = 0,        // No category; ErrorInstance's default.
  SAT_ERROR_ECC = 1,         // Set by ECCErrorInstance.
  SAT_ERROR_MISCOMPARE = 2,  // Set by MiscompareErrorInstance.
  SAT_ERROR_SECTOR_TAG = 3   // Set by HDDSectorTagErrorInstance.
};
75 
// Severity of a collected error. Values are spelled out explicitly and match
// the implicit numbering (0, 1).
enum SATErrorSeverity {
  SAT_ERROR_CORRECTABLE = 0,  // ErrorInstance's default severity.
  SAT_ERROR_FATAL = 1
};
81 
82 // This describes an error and it's likely causes.
83 class ErrorInstance {
84  public:
ErrorInstance()85   ErrorInstance(): type_(SAT_ERROR_NONE), severity_(SAT_ERROR_CORRECTABLE) {}
86 
87   SATErrorType type_;             // Type of error: ECC, miscompare, sector.
88   SATErrorSeverity severity_;     // Correctable, or fatal.
89   std::set<DeviceTree*> causes_;  // Devices that can cause this type of error.
90 };
91 
92 // This describes ECC errors.
93 class ECCErrorInstance: public ErrorInstance {
94  public:
ECCErrorInstance()95   ECCErrorInstance() { type_ = SAT_ERROR_ECC; }
96 
97   uint64 addr_;               // Address where error occured.
98 };
99 
100 // This describes miscompare errors.
101 class MiscompareErrorInstance: public ErrorInstance {
102  public:
MiscompareErrorInstance()103   MiscompareErrorInstance() { type_ = SAT_ERROR_MISCOMPARE; }
104 
105   uint64 addr_;               // Address where miscompare occured.
106 };
107 
108 // This describes HDD miscompare errors.
109 class HDDMiscompareErrorInstance: public MiscompareErrorInstance {
110  public:
111   uint64 addr2_;             // addr_ and addr2_ are src and dst memory addr.
112   int offset_;               // offset.
113   int block_;                // error block.
114 };
115 
116 // This describes HDD miscompare errors.
117 class HDDSectorTagErrorInstance: public ErrorInstance {
118  public:
HDDSectorTagErrorInstance()119   HDDSectorTagErrorInstance() { type_ = SAT_ERROR_SECTOR_TAG; }
120 
121   uint64 addr_;
122   uint64 addr2_;             // addr_ and addr2_ are src and dst memory addr.
123   int sector_;               // error sector.
124   int block_;                // error block.
125 };
126 
127 // Generic error storage and sorting class.
128 class ErrorDiag {
129  public:
130   ErrorDiag();
131   virtual ~ErrorDiag();
132 
133   // Add info about a CECC.
134   virtual int AddCeccError(string dimm_string);
135 
136   // Add info about a UECC.
137   virtual int AddUeccError(string dimm_string);
138 
139   // Add info about a miscompare.
140   virtual int AddMiscompareError(string dimm_string, uint64 addr, int count);
141 
142   // Add info about a miscompare from a drive.
143   virtual int AddHDDMiscompareError(string devicename, int block, int offset,
144                             void *src_addr, void *dst_addr);
145 
146   // Add info about a sector tag miscompare from a drive.
147   virtual int AddHDDSectorTagError(string devicename, int block, int offset,
148                            int sector, void *src_addr, void *dst_addr);
149 
150   // Set platform specific handle and initialize device tree.
151   bool set_os(OsLayer *os);
152 
153  protected:
154   // Create and initialize system device tree.
155   virtual bool InitializeDeviceTree();
156 
157   // Utility Function to translate a virtual address to DIMM number.
158   string AddressToDimmString(OsLayer *os, void *addr, int offset);
159 
160   DeviceTree *system_tree_root_;  // System device tree.
161   OsLayer *os_;                   // Platform handle.
162 
163  private:
164   DISALLOW_COPY_AND_ASSIGN(ErrorDiag);
165 };
166 
167 #endif  // STRESSAPPTEST_ERROR_DIAG_H_
168