1// Copyright 2008 Google Inc. All Rights Reserved. 2 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6 7// http://www.apache.org/licenses/LICENSE-2.0 8 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// error_diag.h: Ambiguous error diagnosis class 16 17#ifndef STRESSAPPTEST_ERROR_DIAG_H_ 18#define STRESSAPPTEST_ERROR_DIAG_H_ 19 20#include <pthread.h> 21#include <list> 22#include <map> 23#include <set> 24#include <string> 25 26// This file must work with autoconf on its public version, 27// so these includes are correct. 28#include "sattypes.h" 29#include "os.h" 30 31class ErrorInstance; 32 33// This describes the components of the system. 34class DeviceTree { 35 public: 36 explicit DeviceTree(string name); 37 ~DeviceTree(); 38 39 // Atomically find arbitrary device in subtree. 40 DeviceTree *FindInSubTree(string name); 41 // Find or add named device. 42 DeviceTree *FindOrAddDevice(string name); 43 // Atomically add sub device. 44 void InsertSubDevice(string name); 45 // Returns parent device. 46 DeviceTree *GetParent() { return parent_; } 47 // Pretty prints device tree. 48 void PrettyPrint(string spacer = " "); 49 // Atomically add error instance to device. 50 void AddErrorInstance(ErrorInstance *error_instance); 51 // Returns true of device is known to be bad. 52 bool KnownBad(); 53 // Returns number of direct sub devices. 54 int NumDirectSubDevices() { return subdevices_.size(); } 55 56 private: 57 // Unlocked version of FindInSubTree. 58 DeviceTree *UnlockedFindInSubTree(string name); 59 60 std::map<string, DeviceTree*> subdevices_; // Map of sub-devices. 61 std::list<ErrorInstance*> errors_; // Log of errors. 62 DeviceTree *parent_; // Pointer to parent device. 63 string name_; // Device name. 64 pthread_mutex_t device_tree_mutex_; // Mutex protecting device tree. 65}; 66 67 68// enum type for collected errors. 69enum SATErrorType { 70 SAT_ERROR_NONE = 0, 71 SAT_ERROR_ECC, 72 SAT_ERROR_MISCOMPARE, 73 SAT_ERROR_SECTOR_TAG, 74}; 75 76// enum type for error severity. 77enum SATErrorSeverity { 78 SAT_ERROR_CORRECTABLE = 0, 79 SAT_ERROR_FATAL, 80}; 81 82// This describes an error and it's likely causes. 83class ErrorInstance { 84 public: 85 ErrorInstance(): type_(SAT_ERROR_NONE), severity_(SAT_ERROR_CORRECTABLE) {} 86 87 SATErrorType type_; // Type of error: ECC, miscompare, sector. 88 SATErrorSeverity severity_; // Correctable, or fatal. 89 std::set<DeviceTree*> causes_; // Devices that can cause this type of error. 90}; 91 92// This describes ECC errors. 93class ECCErrorInstance: public ErrorInstance { 94 public: 95 ECCErrorInstance() { type_ = SAT_ERROR_ECC; } 96 97 uint64 addr_; // Address where error occured. 98}; 99 100// This describes miscompare errors. 101class MiscompareErrorInstance: public ErrorInstance { 102 public: 103 MiscompareErrorInstance() { type_ = SAT_ERROR_MISCOMPARE; } 104 105 uint64 addr_; // Address where miscompare occured. 106}; 107 108// This describes HDD miscompare errors. 109class HDDMiscompareErrorInstance: public MiscompareErrorInstance { 110 public: 111 uint64 addr2_; // addr_ and addr2_ are src and dst memory addr. 112 int offset_; // offset. 113 int block_; // error block. 114}; 115 116// This describes HDD miscompare errors. 117class HDDSectorTagErrorInstance: public ErrorInstance { 118 public: 119 HDDSectorTagErrorInstance() { type_ = SAT_ERROR_SECTOR_TAG; } 120 121 uint64 addr_; 122 uint64 addr2_; // addr_ and addr2_ are src and dst memory addr. 123 int sector_; // error sector. 124 int block_; // error block. 125}; 126 127// Generic error storage and sorting class. 128class ErrorDiag { 129 public: 130 ErrorDiag(); 131 virtual ~ErrorDiag(); 132 133 // Add info about a CECC. 134 virtual int AddCeccError(string dimm_string); 135 136 // Add info about a UECC. 137 virtual int AddUeccError(string dimm_string); 138 139 // Add info about a miscompare. 140 virtual int AddMiscompareError(string dimm_string, uint64 addr, int count); 141 142 // Add info about a miscompare from a drive. 143 virtual int AddHDDMiscompareError(string devicename, int block, int offset, 144 void *src_addr, void *dst_addr); 145 146 // Add info about a sector tag miscompare from a drive. 147 virtual int AddHDDSectorTagError(string devicename, int block, int offset, 148 int sector, void *src_addr, void *dst_addr); 149 150 // Set platform specific handle and initialize device tree. 151 bool set_os(OsLayer *os); 152 153 protected: 154 // Create and initialize system device tree. 155 virtual bool InitializeDeviceTree(); 156 157 // Utility Function to translate a virtual address to DIMM number. 158 string AddressToDimmString(OsLayer *os, void *addr, int offset); 159 160 DeviceTree *system_tree_root_; // System device tree. 161 OsLayer *os_; // Platform handle. 162 163 private: 164 DISALLOW_COPY_AND_ASSIGN(ErrorDiag); 165}; 166 167#endif // STRESSAPPTEST_ERROR_DIAG_H_ 168