1/* 2 * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33#include <linux/kernel.h> 34#include <linux/module.h> 35#include <linux/random.h> 36#include <linux/vmalloc.h> 37#include <linux/mlx5/driver.h> 38#include <linux/mlx5/cmd.h> 39#include "mlx5_core.h" 40 41enum { 42 MLX5_HEALTH_POLL_INTERVAL = 2 * HZ, 43 MAX_MISSES = 3, 44}; 45 46enum { 47 MLX5_HEALTH_SYNDR_FW_ERR = 0x1, 48 MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7, 49 MLX5_HEALTH_SYNDR_CRC_ERR = 0x9, 50 MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa, 51 MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb, 52 MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc, 53 MLX5_HEALTH_SYNDR_EQ_ERR = 0xd, 54 MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf, 55}; 56 57static DEFINE_SPINLOCK(health_lock); 58static LIST_HEAD(health_list); 59static struct work_struct health_work; 60 61static void health_care(struct work_struct *work) 62{ 63 struct mlx5_core_health *health, *n; 64 struct mlx5_core_dev *dev; 65 struct mlx5_priv *priv; 66 LIST_HEAD(tlist); 67 68 spin_lock_irq(&health_lock); 69 list_splice_init(&health_list, &tlist); 70 71 spin_unlock_irq(&health_lock); 72 73 list_for_each_entry_safe(health, n, &tlist, list) { 74 priv = container_of(health, struct mlx5_priv, health); 75 dev = container_of(priv, struct mlx5_core_dev, priv); 76 mlx5_core_warn(dev, "handling bad device here\n"); 77 /* nothing yet */ 78 spin_lock_irq(&health_lock); 79 list_del_init(&health->list); 80 spin_unlock_irq(&health_lock); 81 } 82} 83 84static const char *hsynd_str(u8 synd) 85{ 86 switch (synd) { 87 case MLX5_HEALTH_SYNDR_FW_ERR: 88 return "firmware internal error"; 89 case MLX5_HEALTH_SYNDR_IRISC_ERR: 90 return "irisc not responding"; 91 case MLX5_HEALTH_SYNDR_CRC_ERR: 92 return "firmware CRC error"; 93 case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: 94 return "ICM fetch PCI error"; 95 case MLX5_HEALTH_SYNDR_HW_FTL_ERR: 96 return "HW fatal error\n"; 97 case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: 98 return "async EQ buffer overrun"; 99 case MLX5_HEALTH_SYNDR_EQ_ERR: 100 return "EQ error"; 101 case MLX5_HEALTH_SYNDR_FFSER_ERR: 102 return "FFSER error"; 103 default: 104 return "unrecognized error"; 105 } 106} 107 108static u16 read_be16(__be16 __iomem *p) 109{ 110 return swab16(readl((__force u16 __iomem *) p)); 111} 112 113static u32 read_be32(__be32 __iomem *p) 114{ 115 return swab32(readl((__force u32 __iomem *) p)); 116} 117 118static void print_health_info(struct mlx5_core_dev *dev) 119{ 120 struct mlx5_core_health *health = &dev->priv.health; 121 struct health_buffer __iomem *h = health->health; 122 int i; 123 124 for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) 125 pr_info("assert_var[%d] 0x%08x\n", i, read_be32(h->assert_var + i)); 126 127 pr_info("assert_exit_ptr 0x%08x\n", read_be32(&h->assert_exit_ptr)); 128 pr_info("assert_callra 0x%08x\n", read_be32(&h->assert_callra)); 129 pr_info("fw_ver 0x%08x\n", read_be32(&h->fw_ver)); 130 pr_info("hw_id 0x%08x\n", read_be32(&h->hw_id)); 131 pr_info("irisc_index %d\n", readb(&h->irisc_index)); 132 pr_info("synd 0x%x: %s\n", readb(&h->synd), hsynd_str(readb(&h->synd))); 133 pr_info("ext_sync 0x%04x\n", read_be16(&h->ext_sync)); 134} 135 136static void poll_health(unsigned long data) 137{ 138 struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; 139 struct mlx5_core_health *health = &dev->priv.health; 140 unsigned long next; 141 u32 count; 142 143 count = ioread32be(health->health_counter); 144 if (count == health->prev) 145 ++health->miss_counter; 146 else 147 health->miss_counter = 0; 148 149 health->prev = count; 150 if (health->miss_counter == MAX_MISSES) { 151 mlx5_core_err(dev, "device's health compromised\n"); 152 print_health_info(dev); 153 spin_lock_irq(&health_lock); 154 list_add_tail(&health->list, &health_list); 155 spin_unlock_irq(&health_lock); 156 157 queue_work(mlx5_core_wq, &health_work); 158 } else { 159 get_random_bytes(&next, sizeof(next)); 160 next %= HZ; 161 next += jiffies + MLX5_HEALTH_POLL_INTERVAL; 162 mod_timer(&health->timer, next); 163 } 164} 165 166void mlx5_start_health_poll(struct mlx5_core_dev *dev) 167{ 168 struct mlx5_core_health *health = &dev->priv.health; 169 170 INIT_LIST_HEAD(&health->list); 171 init_timer(&health->timer); 172 health->health = &dev->iseg->health; 173 health->health_counter = &dev->iseg->health_counter; 174 175 health->timer.data = (unsigned long)dev; 176 health->timer.function = poll_health; 177 health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL); 178 add_timer(&health->timer); 179} 180 181void mlx5_stop_health_poll(struct mlx5_core_dev *dev) 182{ 183 struct mlx5_core_health *health = &dev->priv.health; 184 185 del_timer_sync(&health->timer); 186 187 spin_lock_irq(&health_lock); 188 if (!list_empty(&health->list)) 189 list_del_init(&health->list); 190 spin_unlock_irq(&health_lock); 191} 192 193void mlx5_health_cleanup(void) 194{ 195} 196 197void __init mlx5_health_init(void) 198{ 199 INIT_WORK(&health_work, health_care); 200} 201