summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Shay Drory <shayd@mellanox.com> 2020-05-07 09:32:53 +0300
committer: Saeed Mahameed <saeedm@mellanox.com> 2020-06-11 15:37:51 -0700
commit: b6e0b6bebe0732d5cac51f0791f269d2413b8980 (patch)
tree: 384313d7b2a6d341e52a5f933454c2118d79afe1
parent: 42ea9f1b5c625fad225d4ac96a7e757dd4199d9c (diff)
downloadlinux-b6e0b6bebe0732d5cac51f0791f269d2413b8980.tar.gz
linux-b6e0b6bebe0732d5cac51f0791f269d2413b8980.tar.xz
net/mlx5: Fix fatal error handling during device load
Currently, in case of a fatal error during mlx5_load_one(), we cannot enter the error state until mlx5_load_one() has finished, which can take several minutes: the commands it issues cannot be processed because of the fatal error, so each one has to time out first. Fix it by setting dev->state to MLX5_DEVICE_STATE_INTERNAL_ERROR before requesting the lock.

Fixes: c1d4d2e92ad6 ("net/mlx5: Avoid calling sleeping function by the health poll thread")
Signed-off-by: Shay Drory <shayd@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/health.c | 14
1 files changed, 11 insertions, 3 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index c0cfbab15fe9..b31f769d2df9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -192,15 +192,23 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
{
+ bool err_detected = false;
+
+ /* Mark the device as fatal in order to abort FW commands */
+ if ((check_fatal_sensors(dev) || force) &&
+ dev->state == MLX5_DEVICE_STATE_UP) {
+ dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+ err_detected = true;
+ }
mutex_lock(&dev->intf_state_mutex);
- if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
- goto unlock;
+ if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+ goto unlock;/* a previous error is still being handled */
if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) {
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
goto unlock;
}
- if (check_fatal_sensors(dev) || force) {
+ if (check_fatal_sensors(dev) || force) { /* protected state setting */
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
mlx5_cmd_flush(dev);
}