// Copyright 2008 Google Inc. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//      http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// error_diag.cc: Collects device errors for analysis to more accurately
// pin-point failed component.

#include <set>
#include <list>
#include <map>

// This file must work with autoconf on its public version,
// so these includes are correct.
2491037db265ecdd914a26e056cf69207b4f50924ehkuang#include "error_diag.h" 2591037db265ecdd914a26e056cf69207b4f50924ehkuang#include "sattypes.h" 2691037db265ecdd914a26e056cf69207b4f50924ehkuang 2791037db265ecdd914a26e056cf69207b4f50924ehkuang 2891037db265ecdd914a26e056cf69207b4f50924ehkuang// DeviceTree constructor. 2991037db265ecdd914a26e056cf69207b4f50924ehkuangDeviceTree::DeviceTree(string name) 3091037db265ecdd914a26e056cf69207b4f50924ehkuang : parent_(0), name_(name) { 3191037db265ecdd914a26e056cf69207b4f50924ehkuang pthread_mutex_init(&device_tree_mutex_, NULL); 3291037db265ecdd914a26e056cf69207b4f50924ehkuang} 3391037db265ecdd914a26e056cf69207b4f50924ehkuang 3491037db265ecdd914a26e056cf69207b4f50924ehkuang// DeviceTree destructor. 3591037db265ecdd914a26e056cf69207b4f50924ehkuangDeviceTree::~DeviceTree() { 3691037db265ecdd914a26e056cf69207b4f50924ehkuang // Deallocate subtree devices. 3791037db265ecdd914a26e056cf69207b4f50924ehkuang for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin(); 3891037db265ecdd914a26e056cf69207b4f50924ehkuang itr != subdevices_.end(); 3991037db265ecdd914a26e056cf69207b4f50924ehkuang ++itr) { 4091037db265ecdd914a26e056cf69207b4f50924ehkuang delete itr->second; 4191037db265ecdd914a26e056cf69207b4f50924ehkuang } 4291037db265ecdd914a26e056cf69207b4f50924ehkuang // Deallocate device errors. 
4391037db265ecdd914a26e056cf69207b4f50924ehkuang for (std::list<ErrorInstance*>::iterator itr = errors_.begin(); 4491037db265ecdd914a26e056cf69207b4f50924ehkuang itr != errors_.end(); 4591037db265ecdd914a26e056cf69207b4f50924ehkuang ++itr) { 4691037db265ecdd914a26e056cf69207b4f50924ehkuang delete (*itr); 4791037db265ecdd914a26e056cf69207b4f50924ehkuang } 4891037db265ecdd914a26e056cf69207b4f50924ehkuang pthread_mutex_destroy(&device_tree_mutex_); 4991037db265ecdd914a26e056cf69207b4f50924ehkuang} 5091037db265ecdd914a26e056cf69207b4f50924ehkuang 5191037db265ecdd914a26e056cf69207b4f50924ehkuang// Atomically find named device in sub device tree. 5291037db265ecdd914a26e056cf69207b4f50924ehkuang// Returns 0 if not found 5391037db265ecdd914a26e056cf69207b4f50924ehkuangDeviceTree *DeviceTree::FindInSubTree(string name) { 5491037db265ecdd914a26e056cf69207b4f50924ehkuang DeviceTree *ret; 5591037db265ecdd914a26e056cf69207b4f50924ehkuang pthread_mutex_lock(&device_tree_mutex_); 5691037db265ecdd914a26e056cf69207b4f50924ehkuang ret = UnlockedFindInSubTree(name); 573df0563f1b24dac6c0bd122fc922a48211269061hkuang pthread_mutex_unlock(&device_tree_mutex_); 5891037db265ecdd914a26e056cf69207b4f50924ehkuang return ret; 5991037db265ecdd914a26e056cf69207b4f50924ehkuang} 6091037db265ecdd914a26e056cf69207b4f50924ehkuang 6191037db265ecdd914a26e056cf69207b4f50924ehkuang// Find named device in sub device tree (Non-atomic). 6291037db265ecdd914a26e056cf69207b4f50924ehkuang// Returns 0 if not found 6391037db265ecdd914a26e056cf69207b4f50924ehkuangDeviceTree *DeviceTree::UnlockedFindInSubTree(string name) { 6491037db265ecdd914a26e056cf69207b4f50924ehkuang std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name); 6591037db265ecdd914a26e056cf69207b4f50924ehkuang if (itr != subdevices_.end()) { 6691037db265ecdd914a26e056cf69207b4f50924ehkuang return itr->second; 6791037db265ecdd914a26e056cf69207b4f50924ehkuang } else { 6891037db265ecdd914a26e056cf69207b4f50924ehkuang // Search sub-tree. 
6991037db265ecdd914a26e056cf69207b4f50924ehkuang for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin(); 7091037db265ecdd914a26e056cf69207b4f50924ehkuang itr != subdevices_.end(); 7191037db265ecdd914a26e056cf69207b4f50924ehkuang ++itr) { 7291037db265ecdd914a26e056cf69207b4f50924ehkuang DeviceTree *result = itr->second->UnlockedFindInSubTree(name); 7391037db265ecdd914a26e056cf69207b4f50924ehkuang if (result != 0) 7491037db265ecdd914a26e056cf69207b4f50924ehkuang return result; 7591037db265ecdd914a26e056cf69207b4f50924ehkuang } 7691037db265ecdd914a26e056cf69207b4f50924ehkuang return 0; 7791037db265ecdd914a26e056cf69207b4f50924ehkuang } 7891037db265ecdd914a26e056cf69207b4f50924ehkuang} 79 80// Atomically add error instance to device. 81void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) { 82 pthread_mutex_lock(&device_tree_mutex_); 83 errors_.push_back(error_instance); 84 pthread_mutex_unlock(&device_tree_mutex_); 85} 86 87// Find or add queried device as necessary. 88DeviceTree *DeviceTree::FindOrAddDevice(string name) { 89 // Assume named device does not exist and try to insert the device anyway. 90 // No-op if named device already exists. 91 InsertSubDevice(name); 92 // Find and return sub device pointer. 93 return FindInSubTree(name); 94} 95 96// Pretty prints device tree. 97void DeviceTree::PrettyPrint(string spacer) { 98 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin(); 99 itr != subdevices_.end(); 100 ++itr) { 101 printf("%s%s\n", spacer.c_str(), itr->first.c_str()); 102 itr->second->PrettyPrint(spacer+spacer); 103 } 104} 105 106// Atomically add sub device. 107// No-op if named device already exists. 
108void DeviceTree::InsertSubDevice(string name) { 109 pthread_mutex_lock(&device_tree_mutex_); 110 if (UnlockedFindInSubTree(name) != 0) { 111 pthread_mutex_unlock(&device_tree_mutex_); 112 return; 113 } 114 subdevices_[name] = new DeviceTree(name); 115 subdevices_[name]->parent_ = this; 116 pthread_mutex_unlock(&device_tree_mutex_); 117} 118 119 120// Returns true of any error associated with this device is fatal. 121bool DeviceTree::KnownBad() { 122 pthread_mutex_lock(&device_tree_mutex_); 123 for (std::list<ErrorInstance*>::iterator itr = errors_.begin(); 124 itr != errors_.end(); 125 ++itr) { 126 if ((*itr)->severity_ == SAT_ERROR_FATAL) { 127 pthread_mutex_unlock(&device_tree_mutex_); 128 return true; 129 } 130 } 131 pthread_mutex_unlock(&device_tree_mutex_); 132 return false; 133} 134 135 136// ErrorDiag constructor. 137ErrorDiag::ErrorDiag() { 138 os_ = 0; 139 system_tree_root_ = 0; 140} 141 142// ErrorDiag destructor. 143ErrorDiag::~ErrorDiag() { 144 if (system_tree_root_) 145 delete system_tree_root_; 146} 147 148// Set platform specific handle and initialize device tree. 149// Returns false on error. true otherwise. 150bool ErrorDiag::set_os(OsLayer *os) { 151 os_ = os; 152 return(InitializeDeviceTree()); 153} 154 155// Create and initialize system device tree. 156// Returns false on error. true otherwise. 157bool ErrorDiag::InitializeDeviceTree() { 158 system_tree_root_ = new DeviceTree("system_root"); 159 if (!system_tree_root_) 160 return false; 161 return true; 162} 163 164// Logs info about a CECC. 165// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 166int ErrorDiag::AddCeccError(string dimm_string) { 167 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string); 168 ECCErrorInstance *error = new ECCErrorInstance; 169 if (!error) 170 return -1; 171 error->severity_ = SAT_ERROR_CORRECTABLE; 172 dimm_device->AddErrorInstance(error); 173 return 0; 174} 175 176// Logs info about a UECC. 
177// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 178int ErrorDiag::AddUeccError(string dimm_string) { 179 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string); 180 ECCErrorInstance *error = new ECCErrorInstance; 181 if (!error) 182 return -1; 183 error->severity_ = SAT_ERROR_FATAL; 184 dimm_device->AddErrorInstance(error); 185 return 0; 186} 187 188// Logs info about a miscompare. 189// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 190int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) { 191 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string); 192 MiscompareErrorInstance *error = new MiscompareErrorInstance; 193 if (!error) 194 return -1; 195 error->severity_ = SAT_ERROR_FATAL; 196 error->addr_ = addr; 197 dimm_device->AddErrorInstance(error); 198 os_->ErrorReport(dimm_string.c_str(), "miscompare", count); 199 return 1; 200} 201 202// Utility Function to translate a virtual address to DIMM number. 203// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 204string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) { 205 char dimm_string[256] = ""; 206 char *vbyteaddr = reinterpret_cast<char*>(addr) + offset; 207 uint64 paddr = os->VirtualToPhysical(vbyteaddr); 208 os->FindDimm(paddr, dimm_string, sizeof(dimm_string)); 209 return string(dimm_string); 210} 211 212// Info about a miscompare from a drive. 213// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 
214int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset, 215 void *src_addr, void *dst_addr) { 216 bool mask_hdd_error = false; 217 218 HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance; 219 if (!error) 220 return -1; 221 222 error->addr_ = reinterpret_cast<uint64>(src_addr); 223 error->addr2_ = reinterpret_cast<uint64>(dst_addr); 224 error->offset_ = offset; 225 error->block_ = block; 226 227 string src_dimm = AddressToDimmString(os_, src_addr, offset); 228 string dst_dimm = AddressToDimmString(os_, dst_addr, offset); 229 230 // DIMM name look up success 231 if (src_dimm.compare("DIMM Unknown")) { 232 // Add src DIMM as possible miscompare cause. 233 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm); 234 error->causes_.insert(src_dimm_dev); 235 if (src_dimm_dev->KnownBad()) { 236 mask_hdd_error = true; 237 logprintf(5, "Log: supressed %s miscompare report: " 238 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str()); 239 } 240 } 241 if (dst_dimm.compare("DIMM Unknown")) { 242 // Add dst DIMM as possible miscompare cause. 243 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm); 244 error->causes_.insert(dst_dimm_dev); 245 if (dst_dimm_dev->KnownBad()) { 246 mask_hdd_error = true; 247 logprintf(5, "Log: supressed %s miscompare report: " 248 "known bad destination: %s\n", devicename.c_str(), 249 dst_dimm.c_str()); 250 } 251 } 252 253 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename); 254 hdd_dev->AddErrorInstance(error); 255 256 // HDD error was not masked by bad DIMMs: report bad HDD. 257 if (!mask_hdd_error) { 258 os_->ErrorReport(devicename.c_str(), "miscompare", 1); 259 error->severity_ = SAT_ERROR_FATAL; 260 return 1; 261 } 262 return 0; 263} 264 265// Info about a sector tag miscompare from a drive. 266// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 
267int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset, 268 int sector, void *src_addr, 269 void *dst_addr) { 270 bool mask_hdd_error = false; 271 272 HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance; 273 if (!error) 274 return -1; 275 276 error->addr_ = reinterpret_cast<uint64>(src_addr); 277 error->addr2_ = reinterpret_cast<uint64>(dst_addr); 278 error->sector_ = sector; 279 error->block_ = block; 280 281 string src_dimm = AddressToDimmString(os_, src_addr, offset); 282 string dst_dimm = AddressToDimmString(os_, dst_addr, offset); 283 284 // DIMM name look up success 285 if (src_dimm.compare("DIMM Unknown")) { 286 // Add src DIMM as possible miscompare cause. 287 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm); 288 error->causes_.insert(src_dimm_dev); 289 if (src_dimm_dev->KnownBad()) { 290 mask_hdd_error = true; 291 logprintf(5, "Log: supressed %s sector tag error report: " 292 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str()); 293 } 294 } 295 if (dst_dimm.compare("DIMM Unknown")) { 296 // Add dst DIMM as possible miscompare cause. 297 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm); 298 error->causes_.insert(dst_dimm_dev); 299 if (dst_dimm_dev->KnownBad()) { 300 mask_hdd_error = true; 301 logprintf(5, "Log: supressed %s sector tag error report: " 302 "known bad destination: %s\n", devicename.c_str(), 303 dst_dimm.c_str()); 304 } 305 } 306 307 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename); 308 hdd_dev->AddErrorInstance(error); 309 310 // HDD error was not masked by bad DIMMs: report bad HDD. 311 if (!mask_hdd_error) { 312 os_->ErrorReport(devicename.c_str(), "sector", 1); 313 error->severity_ = SAT_ERROR_FATAL; 314 return 1; 315 } 316 return 0; 317} 318