// Copyright 2008 Google Inc. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//      http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
1491037db265ecdd914a26e056cf69207b4f50924ehkuang
// error_diag.cc: Collects device errors for analysis, to more accurately
//                pinpoint the failed component.
1791037db265ecdd914a26e056cf69207b4f50924ehkuang
1891037db265ecdd914a26e056cf69207b4f50924ehkuang#include <set>
1991037db265ecdd914a26e056cf69207b4f50924ehkuang#include <list>
2091037db265ecdd914a26e056cf69207b4f50924ehkuang#include <map>
2191037db265ecdd914a26e056cf69207b4f50924ehkuang
// This file must work with autoconf on its public version,
// so these includes are correct.
2491037db265ecdd914a26e056cf69207b4f50924ehkuang#include "error_diag.h"
2591037db265ecdd914a26e056cf69207b4f50924ehkuang#include "sattypes.h"
2691037db265ecdd914a26e056cf69207b4f50924ehkuang
2791037db265ecdd914a26e056cf69207b4f50924ehkuang
2891037db265ecdd914a26e056cf69207b4f50924ehkuang// DeviceTree constructor.
2991037db265ecdd914a26e056cf69207b4f50924ehkuangDeviceTree::DeviceTree(string name)
3091037db265ecdd914a26e056cf69207b4f50924ehkuang  : parent_(0), name_(name) {
3191037db265ecdd914a26e056cf69207b4f50924ehkuang  pthread_mutex_init(&device_tree_mutex_, NULL);
3291037db265ecdd914a26e056cf69207b4f50924ehkuang}
3391037db265ecdd914a26e056cf69207b4f50924ehkuang
3491037db265ecdd914a26e056cf69207b4f50924ehkuang// DeviceTree destructor.
3591037db265ecdd914a26e056cf69207b4f50924ehkuangDeviceTree::~DeviceTree() {
3691037db265ecdd914a26e056cf69207b4f50924ehkuang  // Deallocate subtree devices.
3791037db265ecdd914a26e056cf69207b4f50924ehkuang  for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
3891037db265ecdd914a26e056cf69207b4f50924ehkuang      itr != subdevices_.end();
3991037db265ecdd914a26e056cf69207b4f50924ehkuang      ++itr) {
4091037db265ecdd914a26e056cf69207b4f50924ehkuang    delete itr->second;
4191037db265ecdd914a26e056cf69207b4f50924ehkuang  }
4291037db265ecdd914a26e056cf69207b4f50924ehkuang  // Deallocate device errors.
4391037db265ecdd914a26e056cf69207b4f50924ehkuang  for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
4491037db265ecdd914a26e056cf69207b4f50924ehkuang      itr != errors_.end();
4591037db265ecdd914a26e056cf69207b4f50924ehkuang      ++itr) {
4691037db265ecdd914a26e056cf69207b4f50924ehkuang    delete (*itr);
4791037db265ecdd914a26e056cf69207b4f50924ehkuang  }
4891037db265ecdd914a26e056cf69207b4f50924ehkuang  pthread_mutex_destroy(&device_tree_mutex_);
4991037db265ecdd914a26e056cf69207b4f50924ehkuang}
5091037db265ecdd914a26e056cf69207b4f50924ehkuang
5191037db265ecdd914a26e056cf69207b4f50924ehkuang// Atomically find named device in sub device tree.
5291037db265ecdd914a26e056cf69207b4f50924ehkuang// Returns 0 if not found
5391037db265ecdd914a26e056cf69207b4f50924ehkuangDeviceTree *DeviceTree::FindInSubTree(string name) {
5491037db265ecdd914a26e056cf69207b4f50924ehkuang  DeviceTree *ret;
5591037db265ecdd914a26e056cf69207b4f50924ehkuang  pthread_mutex_lock(&device_tree_mutex_);
5691037db265ecdd914a26e056cf69207b4f50924ehkuang  ret = UnlockedFindInSubTree(name);
573df0563f1b24dac6c0bd122fc922a48211269061hkuang  pthread_mutex_unlock(&device_tree_mutex_);
5891037db265ecdd914a26e056cf69207b4f50924ehkuang  return ret;
5991037db265ecdd914a26e056cf69207b4f50924ehkuang}
6091037db265ecdd914a26e056cf69207b4f50924ehkuang
6191037db265ecdd914a26e056cf69207b4f50924ehkuang// Find named device in sub device tree (Non-atomic).
6291037db265ecdd914a26e056cf69207b4f50924ehkuang// Returns 0 if not found
6391037db265ecdd914a26e056cf69207b4f50924ehkuangDeviceTree *DeviceTree::UnlockedFindInSubTree(string name) {
6491037db265ecdd914a26e056cf69207b4f50924ehkuang  std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name);
6591037db265ecdd914a26e056cf69207b4f50924ehkuang  if (itr != subdevices_.end()) {
6691037db265ecdd914a26e056cf69207b4f50924ehkuang    return itr->second;
6791037db265ecdd914a26e056cf69207b4f50924ehkuang  } else {
6891037db265ecdd914a26e056cf69207b4f50924ehkuang    // Search sub-tree.
6991037db265ecdd914a26e056cf69207b4f50924ehkuang    for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
7091037db265ecdd914a26e056cf69207b4f50924ehkuang        itr != subdevices_.end();
7191037db265ecdd914a26e056cf69207b4f50924ehkuang        ++itr) {
7291037db265ecdd914a26e056cf69207b4f50924ehkuang      DeviceTree *result = itr->second->UnlockedFindInSubTree(name);
7391037db265ecdd914a26e056cf69207b4f50924ehkuang      if (result != 0)
7491037db265ecdd914a26e056cf69207b4f50924ehkuang        return result;
7591037db265ecdd914a26e056cf69207b4f50924ehkuang    }
7691037db265ecdd914a26e056cf69207b4f50924ehkuang    return 0;
7791037db265ecdd914a26e056cf69207b4f50924ehkuang  }
7891037db265ecdd914a26e056cf69207b4f50924ehkuang}
79
80// Atomically add error instance to device.
81void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) {
82  pthread_mutex_lock(&device_tree_mutex_);
83  errors_.push_back(error_instance);
84  pthread_mutex_unlock(&device_tree_mutex_);
85}
86
87// Find or add queried device as necessary.
88DeviceTree *DeviceTree::FindOrAddDevice(string name) {
89  // Assume named device does not exist and try to insert the device anyway.
90  // No-op if named device already exists.
91  InsertSubDevice(name);
92  // Find and return sub device pointer.
93  return FindInSubTree(name);
94}
95
96// Pretty prints device tree.
97void DeviceTree::PrettyPrint(string spacer) {
98  for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
99      itr != subdevices_.end();
100      ++itr) {
101    printf("%s%s\n", spacer.c_str(), itr->first.c_str());
102    itr->second->PrettyPrint(spacer+spacer);
103  }
104}
105
106// Atomically add sub device.
107// No-op if named device already exists.
108void DeviceTree::InsertSubDevice(string name) {
109  pthread_mutex_lock(&device_tree_mutex_);
110  if (UnlockedFindInSubTree(name) != 0) {
111    pthread_mutex_unlock(&device_tree_mutex_);
112    return;
113  }
114  subdevices_[name] = new DeviceTree(name);
115  subdevices_[name]->parent_ = this;
116  pthread_mutex_unlock(&device_tree_mutex_);
117}
118
119
120// Returns true of any error associated with this device is fatal.
121bool DeviceTree::KnownBad() {
122  pthread_mutex_lock(&device_tree_mutex_);
123  for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
124      itr != errors_.end();
125      ++itr) {
126    if ((*itr)->severity_ == SAT_ERROR_FATAL) {
127      pthread_mutex_unlock(&device_tree_mutex_);
128      return true;
129    }
130  }
131  pthread_mutex_unlock(&device_tree_mutex_);
132  return false;
133}
134
135
136// ErrorDiag constructor.
137ErrorDiag::ErrorDiag() {
138  os_ = 0;
139  system_tree_root_ = 0;
140}
141
142// ErrorDiag destructor.
143ErrorDiag::~ErrorDiag() {
144  if (system_tree_root_)
145    delete system_tree_root_;
146}
147
148// Set platform specific handle and initialize device tree.
149// Returns false on error. true otherwise.
150bool ErrorDiag::set_os(OsLayer *os) {
151  os_ = os;
152  return(InitializeDeviceTree());
153}
154
155// Create and initialize system device tree.
156// Returns false on error. true otherwise.
157bool ErrorDiag::InitializeDeviceTree() {
158  system_tree_root_ = new DeviceTree("system_root");
159  if (!system_tree_root_)
160    return false;
161  return true;
162}
163
164// Logs info about a CECC.
165// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
166int ErrorDiag::AddCeccError(string dimm_string) {
167  DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
168  ECCErrorInstance *error = new ECCErrorInstance;
169  if (!error)
170    return -1;
171  error->severity_ = SAT_ERROR_CORRECTABLE;
172  dimm_device->AddErrorInstance(error);
173  return 0;
174}
175
176// Logs info about a UECC.
177// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
178int ErrorDiag::AddUeccError(string dimm_string) {
179  DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
180  ECCErrorInstance *error = new ECCErrorInstance;
181  if (!error)
182    return -1;
183  error->severity_ = SAT_ERROR_FATAL;
184  dimm_device->AddErrorInstance(error);
185  return 0;
186}
187
188// Logs info about a miscompare.
189// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
190int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) {
191  DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
192  MiscompareErrorInstance *error = new MiscompareErrorInstance;
193  if (!error)
194    return -1;
195  error->severity_ = SAT_ERROR_FATAL;
196  error->addr_ = addr;
197  dimm_device->AddErrorInstance(error);
198  os_->ErrorReport(dimm_string.c_str(), "miscompare", count);
199  return 1;
200}
201
202// Utility Function to translate a virtual address to DIMM number.
203// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
204string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) {
205  char dimm_string[256] = "";
206  char *vbyteaddr = reinterpret_cast<char*>(addr) + offset;
207  uint64 paddr = os->VirtualToPhysical(vbyteaddr);
208  os->FindDimm(paddr, dimm_string, sizeof(dimm_string));
209  return string(dimm_string);
210}
211
212// Info about a miscompare from a drive.
213// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
214int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset,
215                                     void *src_addr, void *dst_addr) {
216  bool mask_hdd_error = false;
217
218  HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance;
219  if (!error)
220    return -1;
221
222  error->addr_ = reinterpret_cast<uint64>(src_addr);
223  error->addr2_ = reinterpret_cast<uint64>(dst_addr);
224  error->offset_ = offset;
225  error->block_ = block;
226
227  string src_dimm = AddressToDimmString(os_, src_addr, offset);
228  string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
229
230  // DIMM name look up success
231  if (src_dimm.compare("DIMM Unknown")) {
232    // Add src DIMM as possible miscompare cause.
233    DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
234    error->causes_.insert(src_dimm_dev);
235    if (src_dimm_dev->KnownBad()) {
236      mask_hdd_error = true;
237      logprintf(5, "Log: supressed %s miscompare report: "
238                "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
239    }
240  }
241  if (dst_dimm.compare("DIMM Unknown")) {
242    // Add dst DIMM as possible miscompare cause.
243    DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
244    error->causes_.insert(dst_dimm_dev);
245    if (dst_dimm_dev->KnownBad()) {
246      mask_hdd_error = true;
247      logprintf(5, "Log: supressed %s miscompare report: "
248                "known bad destination: %s\n", devicename.c_str(),
249                dst_dimm.c_str());
250    }
251  }
252
253  DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
254  hdd_dev->AddErrorInstance(error);
255
256  // HDD error was not masked by bad DIMMs: report bad HDD.
257  if (!mask_hdd_error) {
258    os_->ErrorReport(devicename.c_str(), "miscompare", 1);
259    error->severity_ = SAT_ERROR_FATAL;
260    return 1;
261  }
262  return 0;
263}
264
265// Info about a sector tag miscompare from a drive.
266// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
267int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset,
268                                    int sector, void *src_addr,
269                                    void *dst_addr) {
270  bool mask_hdd_error = false;
271
272  HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance;
273  if (!error)
274    return -1;
275
276  error->addr_ = reinterpret_cast<uint64>(src_addr);
277  error->addr2_ = reinterpret_cast<uint64>(dst_addr);
278  error->sector_ = sector;
279  error->block_ = block;
280
281  string src_dimm = AddressToDimmString(os_, src_addr, offset);
282  string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
283
284  // DIMM name look up success
285  if (src_dimm.compare("DIMM Unknown")) {
286    // Add src DIMM as possible miscompare cause.
287    DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
288    error->causes_.insert(src_dimm_dev);
289    if (src_dimm_dev->KnownBad()) {
290      mask_hdd_error = true;
291      logprintf(5, "Log: supressed %s sector tag error report: "
292                "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
293    }
294  }
295  if (dst_dimm.compare("DIMM Unknown")) {
296    // Add dst DIMM as possible miscompare cause.
297    DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
298    error->causes_.insert(dst_dimm_dev);
299    if (dst_dimm_dev->KnownBad()) {
300      mask_hdd_error = true;
301      logprintf(5, "Log: supressed %s sector tag error report: "
302                "known bad destination: %s\n", devicename.c_str(),
303                dst_dimm.c_str());
304    }
305  }
306
307  DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
308  hdd_dev->AddErrorInstance(error);
309
310  // HDD error was not masked by bad DIMMs: report bad HDD.
311  if (!mask_hdd_error) {
312    os_->ErrorReport(devicename.c_str(), "sector", 1);
313    error->severity_ = SAT_ERROR_FATAL;
314    return 1;
315  }
316  return 0;
317}
318