1 // Copyright 2008 Google Inc. All Rights Reserved.
2 
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // error_diag.cc: Collects device errors for analysis to more accurately
16 //                pin-point failed component.
17 
#include <list>
#include <map>
#include <new>
#include <set>
21 
22 // This file must work with autoconf on its public version,
23 // so these includes are correct.
24 #include "error_diag.h"
25 #include "sattypes.h"
26 
27 
28 // DeviceTree constructor.
DeviceTree(string name)29 DeviceTree::DeviceTree(string name)
30   : parent_(0), name_(name) {
31   pthread_mutex_init(&device_tree_mutex_, NULL);
32 }
33 
34 // DeviceTree destructor.
~DeviceTree()35 DeviceTree::~DeviceTree() {
36   // Deallocate subtree devices.
37   for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
38       itr != subdevices_.end();
39       ++itr) {
40     delete itr->second;
41   }
42   // Deallocate device errors.
43   for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
44       itr != errors_.end();
45       ++itr) {
46     delete (*itr);
47   }
48   pthread_mutex_destroy(&device_tree_mutex_);
49 }
50 
51 // Atomically find named device in sub device tree.
52 // Returns 0 if not found
FindInSubTree(string name)53 DeviceTree *DeviceTree::FindInSubTree(string name) {
54   DeviceTree *ret;
55   pthread_mutex_lock(&device_tree_mutex_);
56   ret = UnlockedFindInSubTree(name);
57   pthread_mutex_unlock(&device_tree_mutex_);
58   return ret;
59 }
60 
61 // Find named device in sub device tree (Non-atomic).
62 // Returns 0 if not found
UnlockedFindInSubTree(string name)63 DeviceTree *DeviceTree::UnlockedFindInSubTree(string name) {
64   std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name);
65   if (itr != subdevices_.end()) {
66     return itr->second;
67   } else {
68     // Search sub-tree.
69     for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
70         itr != subdevices_.end();
71         ++itr) {
72       DeviceTree *result = itr->second->UnlockedFindInSubTree(name);
73       if (result != 0)
74         return result;
75     }
76     return 0;
77   }
78 }
79 
80 // Atomically add error instance to device.
AddErrorInstance(ErrorInstance * error_instance)81 void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) {
82   pthread_mutex_lock(&device_tree_mutex_);
83   errors_.push_back(error_instance);
84   pthread_mutex_unlock(&device_tree_mutex_);
85 }
86 
87 // Find or add queried device as necessary.
FindOrAddDevice(string name)88 DeviceTree *DeviceTree::FindOrAddDevice(string name) {
89   // Assume named device does not exist and try to insert the device anyway.
90   // No-op if named device already exists.
91   InsertSubDevice(name);
92   // Find and return sub device pointer.
93   return FindInSubTree(name);
94 }
95 
96 // Pretty prints device tree.
PrettyPrint(string spacer)97 void DeviceTree::PrettyPrint(string spacer) {
98   for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
99       itr != subdevices_.end();
100       ++itr) {
101     printf("%s%s\n", spacer.c_str(), itr->first.c_str());
102     itr->second->PrettyPrint(spacer+spacer);
103   }
104 }
105 
106 // Atomically add sub device.
107 // No-op if named device already exists.
InsertSubDevice(string name)108 void DeviceTree::InsertSubDevice(string name) {
109   pthread_mutex_lock(&device_tree_mutex_);
110   if (UnlockedFindInSubTree(name) != 0) {
111     pthread_mutex_unlock(&device_tree_mutex_);
112     return;
113   }
114   subdevices_[name] = new DeviceTree(name);
115   subdevices_[name]->parent_ = this;
116   pthread_mutex_unlock(&device_tree_mutex_);
117 }
118 
119 
120 // Returns true of any error associated with this device is fatal.
KnownBad()121 bool DeviceTree::KnownBad() {
122   pthread_mutex_lock(&device_tree_mutex_);
123   for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
124       itr != errors_.end();
125       ++itr) {
126     if ((*itr)->severity_ == SAT_ERROR_FATAL) {
127       pthread_mutex_unlock(&device_tree_mutex_);
128       return true;
129     }
130   }
131   pthread_mutex_unlock(&device_tree_mutex_);
132   return false;
133 }
134 
135 
136 // ErrorDiag constructor.
ErrorDiag()137 ErrorDiag::ErrorDiag() {
138   os_ = 0;
139   system_tree_root_ = 0;
140 }
141 
142 // ErrorDiag destructor.
~ErrorDiag()143 ErrorDiag::~ErrorDiag() {
144   if (system_tree_root_)
145     delete system_tree_root_;
146 }
147 
148 // Set platform specific handle and initialize device tree.
149 // Returns false on error. true otherwise.
set_os(OsLayer * os)150 bool ErrorDiag::set_os(OsLayer *os) {
151   os_ = os;
152   return(InitializeDeviceTree());
153 }
154 
155 // Create and initialize system device tree.
156 // Returns false on error. true otherwise.
InitializeDeviceTree()157 bool ErrorDiag::InitializeDeviceTree() {
158   system_tree_root_ = new DeviceTree("system_root");
159   if (!system_tree_root_)
160     return false;
161   return true;
162 }
163 
164 // Logs info about a CECC.
165 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddCeccError(string dimm_string)166 int ErrorDiag::AddCeccError(string dimm_string) {
167   DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
168   ECCErrorInstance *error = new ECCErrorInstance;
169   if (!error)
170     return -1;
171   error->severity_ = SAT_ERROR_CORRECTABLE;
172   dimm_device->AddErrorInstance(error);
173   return 0;
174 }
175 
176 // Logs info about a UECC.
177 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddUeccError(string dimm_string)178 int ErrorDiag::AddUeccError(string dimm_string) {
179   DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
180   ECCErrorInstance *error = new ECCErrorInstance;
181   if (!error)
182     return -1;
183   error->severity_ = SAT_ERROR_FATAL;
184   dimm_device->AddErrorInstance(error);
185   return 0;
186 }
187 
188 // Logs info about a miscompare.
189 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddMiscompareError(string dimm_string,uint64 addr,int count)190 int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) {
191   DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
192   MiscompareErrorInstance *error = new MiscompareErrorInstance;
193   if (!error)
194     return -1;
195   error->severity_ = SAT_ERROR_FATAL;
196   error->addr_ = addr;
197   dimm_device->AddErrorInstance(error);
198   os_->ErrorReport(dimm_string.c_str(), "miscompare", count);
199   return 1;
200 }
201 
202 // Utility Function to translate a virtual address to DIMM number.
203 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddressToDimmString(OsLayer * os,void * addr,int offset)204 string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) {
205   char dimm_string[256] = "";
206   char *vbyteaddr = reinterpret_cast<char*>(addr) + offset;
207   uint64 paddr = os->VirtualToPhysical(vbyteaddr);
208   os->FindDimm(paddr, dimm_string, sizeof(dimm_string));
209   return string(dimm_string);
210 }
211 
212 // Info about a miscompare from a drive.
213 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddHDDMiscompareError(string devicename,int block,int offset,void * src_addr,void * dst_addr)214 int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset,
215                                      void *src_addr, void *dst_addr) {
216   bool mask_hdd_error = false;
217 
218   HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance;
219   if (!error)
220     return -1;
221 
222   error->addr_ = reinterpret_cast<uint64>(src_addr);
223   error->addr2_ = reinterpret_cast<uint64>(dst_addr);
224   error->offset_ = offset;
225   error->block_ = block;
226 
227   string src_dimm = AddressToDimmString(os_, src_addr, offset);
228   string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
229 
230   // DIMM name look up success
231   if (src_dimm.compare("DIMM Unknown")) {
232     // Add src DIMM as possible miscompare cause.
233     DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
234     error->causes_.insert(src_dimm_dev);
235     if (src_dimm_dev->KnownBad()) {
236       mask_hdd_error = true;
237       logprintf(5, "Log: supressed %s miscompare report: "
238                 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
239     }
240   }
241   if (dst_dimm.compare("DIMM Unknown")) {
242     // Add dst DIMM as possible miscompare cause.
243     DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
244     error->causes_.insert(dst_dimm_dev);
245     if (dst_dimm_dev->KnownBad()) {
246       mask_hdd_error = true;
247       logprintf(5, "Log: supressed %s miscompare report: "
248                 "known bad destination: %s\n", devicename.c_str(),
249                 dst_dimm.c_str());
250     }
251   }
252 
253   DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
254   hdd_dev->AddErrorInstance(error);
255 
256   // HDD error was not masked by bad DIMMs: report bad HDD.
257   if (!mask_hdd_error) {
258     os_->ErrorReport(devicename.c_str(), "miscompare", 1);
259     error->severity_ = SAT_ERROR_FATAL;
260     return 1;
261   }
262   return 0;
263 }
264 
265 // Info about a sector tag miscompare from a drive.
266 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddHDDSectorTagError(string devicename,int block,int offset,int sector,void * src_addr,void * dst_addr)267 int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset,
268                                     int sector, void *src_addr,
269                                     void *dst_addr) {
270   bool mask_hdd_error = false;
271 
272   HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance;
273   if (!error)
274     return -1;
275 
276   error->addr_ = reinterpret_cast<uint64>(src_addr);
277   error->addr2_ = reinterpret_cast<uint64>(dst_addr);
278   error->sector_ = sector;
279   error->block_ = block;
280 
281   string src_dimm = AddressToDimmString(os_, src_addr, offset);
282   string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
283 
284   // DIMM name look up success
285   if (src_dimm.compare("DIMM Unknown")) {
286     // Add src DIMM as possible miscompare cause.
287     DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
288     error->causes_.insert(src_dimm_dev);
289     if (src_dimm_dev->KnownBad()) {
290       mask_hdd_error = true;
291       logprintf(5, "Log: supressed %s sector tag error report: "
292                 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
293     }
294   }
295   if (dst_dimm.compare("DIMM Unknown")) {
296     // Add dst DIMM as possible miscompare cause.
297     DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
298     error->causes_.insert(dst_dimm_dev);
299     if (dst_dimm_dev->KnownBad()) {
300       mask_hdd_error = true;
301       logprintf(5, "Log: supressed %s sector tag error report: "
302                 "known bad destination: %s\n", devicename.c_str(),
303                 dst_dimm.c_str());
304     }
305   }
306 
307   DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
308   hdd_dev->AddErrorInstance(error);
309 
310   // HDD error was not masked by bad DIMMs: report bad HDD.
311   if (!mask_hdd_error) {
312     os_->ErrorReport(devicename.c_str(), "sector", 1);
313     error->severity_ = SAT_ERROR_FATAL;
314     return 1;
315   }
316   return 0;
317 }
318