#!/bin/sh
###
# Set SCSI and SATA SCT error timeout.
#
# SPDX-License-Identifier: CC0-1.0
###
###
# Description
#
# Configures SCTERC timeout for all disks, setting it to 7 seconds (70
# deciseconds) for improved error handling. If SCTERC is unsupported,
# then Linux I/O timeout and the SCSI error handler (EH) timeouts are
# increased.
#
# SCTERC (SCT Error Recovery Control) allows the disk to limit its
# internal recovery time, ensuring that it returns an error promptly if
# the operation cannot be completed, for example due to a bad sector.
# This prevents the SCSI layer from triggering high-level resets (e.g.,
# LUN, bus, or host resets) that could lead to data loss or filesystem
# corruption. If SCTERC is not supported, setting a long "timeout"
# helps prevent premature EH invocation.
#
# In Linux, the "timeout" value is how long the kernel waits for an
# individual I/O command to complete before declaring it as failed and
# invoking the SCSI Error Handler (EH). Once EH takes over, its
# behaviour is governed by the driver implementation. The "eh_timeout"
# parameter defines how long the EH is allowed to try recovery
# operations before escalating further or offlining the device.
#
# It is not uncommon to see I/O on normal HDDs taking more than
# 30 seconds, the Linux default. To accommodate this, the script sets
# the default "timeout" to 60 seconds for devices with SCTERC support.
# For devices without SCTERC, the fallback timeout is set to 300 seconds.
#
# Some SMR (Shingled Magnetic Recording) type HDDs are especially
# prone to triggering Linux I/O timeouts, as their internal garbage
# collection can take several minutes to complete. You may need to
# increase the "sct_device_timeout" and "fallback_device_timeout" for
# such devices.
#
# See the Linux documentation for SCSI error handling at:
# https://www.kernel.org/doc/Documentation/scsi/scsi_eh.rst
###
# --- Tunables ---------------------------------------------------------

# SCTERC limit requested from each drive, in deciseconds (70 = 7 seconds).
scterc_value=70

# Drives that accept SCTERC: kernel I/O command timeout and SCSI error
# handler (EH) recovery timeout, both in seconds.
sct_device_timeout=60
sct_eh_recovery_timeout=10

# Drives without SCTERC: longer I/O command timeout and EH recovery
# timeout, both in seconds.
fallback_device_timeout=300
fallback_eh_recovery_timeout=30
# Print header
# Report heading: three left-aligned columns (10, 40 and 30 characters
# wide) followed by a dashed rule.
printf '%-10s %-40s %-30s\n' Device Model Status
printf '%s\n' '------------------------------------------------------------------'
# Succeed only when smartctl's output ($1) confirms that the SCTERC timers
# were actually programmed. smartctl prints "SCT Error Recovery Control
# set to:" on success; every other outcome (SCT unsupported, ERC command
# unsupported, set command failed, smartctl missing or not permitted)
# lacks that line and must be treated as "not set".
scterc_applied() {
  case $1 in
    *"SCT Error Recovery Control set to"*) return 0 ;;
    *) return 1 ;;
  esac
}

# Read "smartctl -i" output on stdin and print the drive model: the text
# after the first "Device Model:" (ATA) or "Product:" (SCSI) label, with
# leading, trailing and repeated whitespace squeezed. Prints nothing if
# neither label is present.
extract_model() {
  awk '/(Device Model|Product):/ { sub(/^[^:]*:/, ""); $1 = $1; print; exit }'
}

# Write the I/O timeout ($2) and the EH timeout ($3), both in seconds, to
# the sysfs attributes of kernel device $1 (e.g. "sda"). Both writes are
# always attempted; returns non-zero if either of them fails.
write_timeouts() {
  wt_rc=0
  echo "$2" > "/sys/block/$1/device/timeout" || wt_rc=1
  echo "$3" > "/sys/block/$1/device/eh_timeout" || wt_rc=1
  return "$wt_rc"
}

# Iterate over /dev/sd[a-z] to target standard SCSI/SATA devices. If you
# need to include other block devices (e.g., virtio disks: /dev/vd[a-z],
# Xen disks: /dev/xvd[a-z]), adjust the glob pattern accordingly. For
# systems with more than 26 SCSI devices, add additional patterns, such
# as:
# for i in /dev/sd[a-z] /dev/sda[a-z] ; do
for i in /dev/sd[a-z] ; do
  # An unmatched glob is left as the literal pattern; skip it (and
  # anything else that is not an existing block device).
  [ -b "$i" ] || continue

  device=${i##*/}

  # Attempt to set both the read and the write SCTERC timer.
  output=$(smartctl -l "scterc,${scterc_value},${scterc_value}" "$i" 2>&1)

  # Get the device model
  model=$(smartctl -i "$i" | extract_model)

  # Use the short timeouts only when the drive confirmed the setting;
  # in every other case fall back to the long ones.
  if scterc_applied "$output" ; then
    status="SCTERC set ok"
    io_timeout=$sct_device_timeout
    eh_timeout=$sct_eh_recovery_timeout
  else
    status="No SCTERC support, using fallback"
    io_timeout=$fallback_device_timeout
    eh_timeout=$fallback_eh_recovery_timeout
  fi

  # Surface sysfs write failures (e.g. not running as root) in the report
  # instead of claiming success.
  write_timeouts "$device" "$io_timeout" "$eh_timeout" ||
    status="$status (timeout update failed)"

  # Print the results
  printf "%-10s %-40s %-30s\n" "$i" "$model" "$status"
done