1// Copyright 2008 Google Inc. All Rights Reserved.
2
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6
7//      http://www.apache.org/licenses/LICENSE-2.0
8
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// error_diag.h: Ambiguous error diagnosis class
16
17#ifndef STRESSAPPTEST_ERROR_DIAG_H_
18#define STRESSAPPTEST_ERROR_DIAG_H_
19
20#include <pthread.h>
21#include <list>
22#include <map>
23#include <set>
24#include <string>
25
26// This file must work with autoconf on its public version,
27// so these includes are correct.
28#include "sattypes.h"
29#include "os.h"
30
31class ErrorInstance;
32
33// This describes the components of the system.
34class DeviceTree {
35 public:
36  explicit DeviceTree(string name);
37  ~DeviceTree();
38
39  // Atomically find arbitrary device in subtree.
40  DeviceTree *FindInSubTree(string name);
41  // Find or add named device.
42  DeviceTree *FindOrAddDevice(string name);
43  // Atomically add sub device.
44  void InsertSubDevice(string name);
45  // Returns parent device.
46  DeviceTree *GetParent() { return parent_; }
47  // Pretty prints device tree.
48  void PrettyPrint(string spacer = " ");
49  // Atomically add error instance to device.
50  void AddErrorInstance(ErrorInstance *error_instance);
51  // Returns true of device is known to be bad.
52  bool KnownBad();
53  // Returns number of direct sub devices.
54  int NumDirectSubDevices() { return subdevices_.size(); }
55
56 private:
57  // Unlocked version of FindInSubTree.
58  DeviceTree *UnlockedFindInSubTree(string name);
59
60  std::map<string, DeviceTree*> subdevices_;    // Map of sub-devices.
61  std::list<ErrorInstance*> errors_;            // Log of errors.
62  DeviceTree *parent_;                          // Pointer to parent device.
63  string name_;                                 // Device name.
64  pthread_mutex_t device_tree_mutex_;           // Mutex protecting device tree.
65};
66
67
// Kinds of error that can be collected.
// Values are consecutive, starting at zero for "no error".
enum SATErrorType {
  SAT_ERROR_NONE       = 0,  // No error.
  SAT_ERROR_ECC        = 1,  // ECC error.
  SAT_ERROR_MISCOMPARE = 2,  // Miscompare.
  SAT_ERROR_SECTOR_TAG = 3   // Sector tag error.
};
75
// Severity attached to a collected error.
enum SATErrorSeverity {
  SAT_ERROR_CORRECTABLE = 0,  // Correctable error.
  SAT_ERROR_FATAL       = 1   // Fatal error.
};
81
82// This describes an error and it's likely causes.
83class ErrorInstance {
84 public:
85  ErrorInstance(): type_(SAT_ERROR_NONE), severity_(SAT_ERROR_CORRECTABLE) {}
86
87  SATErrorType type_;             // Type of error: ECC, miscompare, sector.
88  SATErrorSeverity severity_;     // Correctable, or fatal.
89  std::set<DeviceTree*> causes_;  // Devices that can cause this type of error.
90};
91
92// This describes ECC errors.
93class ECCErrorInstance: public ErrorInstance {
94 public:
95  ECCErrorInstance() { type_ = SAT_ERROR_ECC; }
96
97  uint64 addr_;               // Address where error occured.
98};
99
100// This describes miscompare errors.
101class MiscompareErrorInstance: public ErrorInstance {
102 public:
103  MiscompareErrorInstance() { type_ = SAT_ERROR_MISCOMPARE; }
104
105  uint64 addr_;               // Address where miscompare occured.
106};
107
108// This describes HDD miscompare errors.
109class HDDMiscompareErrorInstance: public MiscompareErrorInstance {
110 public:
111  uint64 addr2_;             // addr_ and addr2_ are src and dst memory addr.
112  int offset_;               // offset.
113  int block_;                // error block.
114};
115
116// This describes HDD miscompare errors.
117class HDDSectorTagErrorInstance: public ErrorInstance {
118 public:
119  HDDSectorTagErrorInstance() { type_ = SAT_ERROR_SECTOR_TAG; }
120
121  uint64 addr_;
122  uint64 addr2_;             // addr_ and addr2_ are src and dst memory addr.
123  int sector_;               // error sector.
124  int block_;                // error block.
125};
126
127// Generic error storage and sorting class.
128class ErrorDiag {
129 public:
130  ErrorDiag();
131  virtual ~ErrorDiag();
132
133  // Add info about a CECC.
134  virtual int AddCeccError(string dimm_string);
135
136  // Add info about a UECC.
137  virtual int AddUeccError(string dimm_string);
138
139  // Add info about a miscompare.
140  virtual int AddMiscompareError(string dimm_string, uint64 addr, int count);
141
142  // Add info about a miscompare from a drive.
143  virtual int AddHDDMiscompareError(string devicename, int block, int offset,
144                            void *src_addr, void *dst_addr);
145
146  // Add info about a sector tag miscompare from a drive.
147  virtual int AddHDDSectorTagError(string devicename, int block, int offset,
148                           int sector, void *src_addr, void *dst_addr);
149
150  // Set platform specific handle and initialize device tree.
151  bool set_os(OsLayer *os);
152
153 protected:
154  // Create and initialize system device tree.
155  virtual bool InitializeDeviceTree();
156
157  // Utility Function to translate a virtual address to DIMM number.
158  string AddressToDimmString(OsLayer *os, void *addr, int offset);
159
160  DeviceTree *system_tree_root_;  // System device tree.
161  OsLayer *os_;                   // Platform handle.
162
163 private:
164  DISALLOW_COPY_AND_ASSIGN(ErrorDiag);
165};
166
167#endif  // STRESSAPPTEST_ERROR_DIAG_H_
168