agregar utilidad para ssd

This commit is contained in:
Luis Guzmán 2026-01-06 06:49:04 -06:00
parent 23fc456614
commit 74e778345e
1 changed files with 573 additions and 0 deletions

573
ssd-health.sh Normal file
View File

@ -0,0 +1,573 @@
#!/usr/bin/env bash
# ssd-health.sh — SSD/NVMe health + wear + filesystem usage summary
# Exit codes: 0=OK, 1=WARNING/INCONCLUSIVE, 2=CRITICAL

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -e -u -o pipefail

# Runtime options; the command-line parser below overrides these defaults.
MODE=text        # output format: text|json
DETAIL=0         # 1 = also dump the (trimmed) raw smartctl output in text mode
ONLY_DEV=        # user requested device (may be /dev/sda, sda, /dev/sda1, etc.)
TIMEOUT_SECS=8   # per-device time limit handed to timeout(1) around smartctl
# Print the command-line help to stdout.
# The text lists every option the argument parser accepts, including the
# --disk alias and -h/--help, plus the script's exit codes.
usage() {
  cat <<'EOF'
Usage:
ssd-health.sh [--json] [--detail] [--dev <blockdev>] [--timeout <secs>]
ssd-health.sh -h | --help
Notes:
--dev accepts: sda | /dev/sda | nvme0n1 | /dev/nvme0n1 | /dev/sda1 (partition -> parent disk)
--disk is accepted as an alias for --dev
--timeout defaults to 8 seconds per device
Exit codes:
0=OK, 1=WARNING/INCONCLUSIVE, 2=CRITICAL (also used for usage/setup errors)
Examples:
sudo ./ssd-health.sh
sudo ./ssd-health.sh --json
sudo ./ssd-health.sh --dev /dev/sda
sudo ./ssd-health.sh --dev /dev/nvme0n1
EOF
}
# Succeed (status 0) when the command named by $1 can be resolved by the
# shell, fail otherwise. Produces no output.
need_cmd() {
  if command -v "$1" &>/dev/null; then
    return 0
  fi
  return 1
}
# Parse command-line options into the MODE / DETAIL / ONLY_DEV / TIMEOUT_SECS
# globals.
#
# Arguments: the script's positional parameters.
# Exits:     0 after printing help (-h/--help);
#            2 on an unknown option, on an option whose value is missing, or
#              on a --timeout value that is not a number of seconds.
#
# A missing value is reported explicitly: a bare `shift 2` with only one
# argument left fails, which under `set -e` would end the script silently.
parse_args() {
  local num_re='^[0-9]+([.][0-9]+)?$'
  while (( $# > 0 )); do
    case "$1" in
      --json)
        MODE="json"
        shift
        ;;
      --detail)
        DETAIL=1
        shift
        ;;
      --dev | --disk)
        if (( $# < 2 )); then
          echo "Missing value for $1" >&2
          exit 2
        fi
        ONLY_DEV="$2"
        shift 2
        ;;
      --timeout)
        if (( $# < 2 )); then
          echo "Missing value for $1" >&2
          exit 2
        fi
        # An explicitly empty value keeps the default of 8 seconds.
        TIMEOUT_SECS="${2:-8}"
        if [[ ! "$TIMEOUT_SECS" =~ $num_re ]]; then
          echo "Invalid --timeout value: $TIMEOUT_SECS (expected a number of seconds)" >&2
          exit 2
        fi
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown arg: $1" >&2
        # Diagnostics go to stderr so stdout stays clean for --json consumers.
        usage >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# Hard requirements: refuse to run (exit 2) when a tool the report relies on
# is not installed. The base tools are checked in order and the first missing
# one triggers the error.
for required_tool in lsblk awk sed grep; do
  if ! need_cmd "$required_tool"; then
    echo "Missing required base commands (lsblk/awk/sed/grep)." >&2
    exit 2
  fi
done
unset required_tool

# smartctl is reported separately so the message can carry an install hint.
if ! need_cmd smartctl; then
  {
    echo "Missing smartctl. Install smartmontools:"
    echo " sudo apt-get update && sudo apt-get install -y smartmontools"
  } >&2
  exit 2
fi
# Normalize user input into a disk name (e.g. sda, nvme0n1).
# Accepts /dev/sda, sda, /dev/sda1 (partition), etc.
#
# Arguments: $1 - device name or path as typed by the user
# Outputs:   the whole-disk kernel name on stdout
# Returns:   0 when $1 resolves to a disk (directly, or as the parent of a
#            partition); 1 for empty input, unknown devices and any other
#            device type.
#
# lsblk is queried with -d so that only the named device is inspected.
# Without it the children are listed too, and a whole disk that has
# partitions would also report "part" rows.
normalize_to_disk_name() {
  local arg="${1-}"
  local dev_type parent

  [[ -n "$arg" ]] || return 1

  # Strip /dev/ prefix if present.
  arg="${arg#/dev/}"

  dev_type="$(lsblk -dno TYPE "/dev/$arg" 2>/dev/null | head -n1)" || return 1
  dev_type="${dev_type//[[:space:]]/}"

  case "$dev_type" in
    part)
      # Map the partition to its parent disk: sda1 -> sda, nvme0n1p2 -> nvme0n1
      parent="$(lsblk -dno PKNAME "/dev/$arg" 2>/dev/null | head -n1)" || return 1
      parent="${parent//[[:space:]]/}"
      [[ -n "$parent" ]] || return 1
      printf '%s\n' "$parent"
      ;;
    disk)
      printf '%s\n' "$arg"
      ;;
    *)
      # Unknown device or unsupported type.
      return 1
      ;;
  esac
}
# Look up the smartctl device type ("-d" value) that `smartctl --scan-open`
# reports for a disk.
#
# Arguments: $1 - disk name without the /dev/ prefix
# Outputs:   the token following "-d" on the scan line whose first field is
#            /dev/$1; an empty line when the scan has no such line.
smart_scan_type_for() {
  local target="/dev/$1"
  local match

  # A failing scan is tolerated: it simply yields no match.
  match="$(smartctl --scan-open 2>/dev/null | awk -v d="$target" '$1==d {print; exit}')" || true

  if [[ -z "$match" ]]; then
    echo ""
    return
  fi

  awk '{
    for (i = 1; i <= NF; i++) {
      if ($i == "-d") {
        print $(i + 1)
        exit
      }
    }
  }' <<<"$match"
}
# Run smartctl against one disk and print whatever it writes to stdout.
#
# Arguments: $1 - disk name without the /dev/ prefix
#            $2 - smartctl device type for "-d" (empty = let smartctl decide)
# Globals:   TIMEOUT_SECS (read) - limit applied when timeout(1) is available
# Returns:   always 0; smartctl's exit status and stderr are discarded so the
#            caller can judge the result from the captured text alone.
smart_run() {
  local disk_name="$1"
  local dev_type="$2"
  local -a cmd=()

  # Wrap the call in timeout(1) when it is installed.
  if need_cmd timeout; then
    cmd+=(timeout "${TIMEOUT_SECS}s")
  fi

  cmd+=(smartctl -a -H -i -A -n standby)

  if [[ -n "$dev_type" ]]; then
    cmd+=(-d "$dev_type")
  fi

  cmd+=("/dev/$disk_name")

  "${cmd[@]}" 2>/dev/null || true
}
# Classify smartctl output (read from stdin) by how usable it looks.
# The first line that matches a known marker decides the result:
#   OK          - an information-section header or an NVMe SMART/health header
#   UNSUPPORTED - the device or its USB bridge does not expose SMART
#   UNAVAILABLE - no marker found at all (blocked bridge, timeout, ...)
# Heuristic only: the markers are plain substrings of smartctl's text.
smart_access_state() {
  awk '
    {
      if ($0 ~ /=== START OF INFORMATION SECTION ===/ ||
          $0 ~ /SMART\/Health Information/ ||
          $0 ~ /NVMe Log 0x02/) {
        state = "OK"
      } else if ($0 ~ /Device does not support SMART/ ||
                 $0 ~ /Unknown USB bridge/) {
        state = "UNSUPPORTED"
      }
      if (state != "") exit
    }
    END {
      if (state == "") state = "UNAVAILABLE"
      print state
    }
  '
}
# Extract the overall SMART verdict from smartctl output on stdin.
# Prints exactly one of:
#   PASSED  - the first verdict line contains PASSED/OK (ATA/NVMe wording)
#             or OK (SCSI "SMART Health Status" wording)
#   FAILED  - a verdict line was found but carries any other text
#   UNKNOWN - the output has no verdict line
parse_health() {
  awk '
    /overall-health self-assessment test result:/ {
      verdict = ($0 ~ /PASSED|OK/) ? "PASSED" : "FAILED"
      exit
    }
    /SMART Health Status:/ {
      verdict = ($0 ~ /OK/) ? "PASSED" : "FAILED"
      exit
    }
    END {
      if (verdict == "") verdict = "UNKNOWN"
      print verdict
    }
  '
}
# Extract the drive temperature from smartctl output on stdin.
# Recognized shapes, first hit wins:
#   - a "Temperature:" line that mentions Celsius, or a
#     "Current Drive Temperature:" line: the first purely numeric field
#   - an ATA attribute row named Temperature_Celsius or Temperature_Internal:
#     its 10th column
# Prints nothing when no such line is present.
parse_temp_c() {
  awk '
    # First field of the current record made only of digits, or "" if none.
    function first_int(   k) {
      for (k = 1; k <= NF; k++)
        if ($k ~ /^[0-9]+$/) return $k
      return ""
    }
    (/Temperature:/ && /Celsius/) || /Current Drive Temperature:/ {
      t = first_int()
      if (length(t) > 0) {
        print t
        exit
      }
    }
    /Temperature_Celsius/ || /Temperature_Internal/ {
      print $10
      exit
    }
  '
}
# Print the 10th column of the first row on stdin whose 2nd column equals
# the attribute name in $1 (the callers pass ATA attribute names).
# Prints nothing when the attribute is absent.
parse_ata_attr_raw() {
  local wanted="$1"
  awk -v a="$wanted" '
    {
      if ($2 == a) {
        print $10
        exit
      }
    }
  '
}
# Extract the integer value of a "Label: value" line from smartctl output on
# stdin.
#
# Arguments: $1 - label text without the trailing colon; the line must start
#                 with "<label>:"
# Outputs:   the value as a plain decimal integer:
#              - hexadecimal values (e.g. "0x08") are converted to decimal
#              - otherwise every non-digit is dropped from the value
#                ("1,234" -> 1234, "5%" -> 5) and leading zeros are removed
#            an empty line when the value holds no digits; nothing when the
#            label is absent.
#
# Only the text after the label is examined. Stripping non-digits from the
# whole line would turn "0x08" into "008" (an invalid octal literal for bash
# arithmetic) and "0x0a" into "00", hiding a non-zero critical warning.
parse_nvme_int() {
  local label="$1"
  awk -v l="$label" '
    index($0, l ":") == 1 {
      val = substr($0, length(l) + 2)
      sub(/^[[:space:]]+/, "", val)

      if (val ~ /^0[xX][0-9a-fA-F]+/) {
        # Portable hex-to-decimal conversion (strtonum is gawk-only).
        hex = tolower(substr(val, 3))
        sub(/[^0-9a-f].*$/, "", hex)
        num = 0
        for (i = 1; i <= length(hex); i++)
          num = num * 16 + index("0123456789abcdef", substr(hex, i, 1)) - 1
        printf "%d\n", num
        exit
      }

      gsub(/[^0-9]/, "", val)
      if (val != "") {
        # Normalize as a string so large counters are never reformatted.
        sub(/^0+/, "", val)
        if (val == "") val = "0"
      }
      print val
      exit
    }
  '
}
# From smartctl output on stdin, take the first line that starts with
# "Data Units Written:" and print the text inside its bracketed
# "[<number> <unit>]" part (e.g. "632 GB"). Prints an empty line when that
# line has no such part, and nothing when the line is absent.
parse_nvme_bytes_units_written() {
  awk '
    /^Data Units Written:/ {
      if (match($0, /\[[0-9.,]+[[:space:]]*[A-Za-z]+\]/))
        print substr($0, RSTART + 1, RLENGTH - 2)
      else
        print ""
      exit
    }
  '
}
# From smartctl output on stdin, take the first line that starts with
# "Data Units Read:" and print the text inside its bracketed
# "[<number> <unit>]" part (e.g. "1.20 TB"). Prints an empty line when that
# line has no such part, and nothing when the line is absent.
parse_nvme_bytes_units_read() {
  awk '
    /^Data Units Read:/ {
      if (match($0, /\[[0-9.,]+[[:space:]]*[A-Za-z]+\]/))
        print substr($0, RSTART + 1, RLENGTH - 2)
      else
        print ""
      exit
    }
  '
}
# Encode a string as a JSON string literal, surrounding quotes included.
#
# Arguments: $1 - text to encode (a missing argument is treated as empty)
# Outputs:   the JSON literal followed by a newline
#
# Implemented with bash parameter expansion only, so JSON mode needs no
# external interpreter and does not fork once per emitted field.
# Backslash, double quote and control characters are escaped; every other
# character (including non-ASCII text) is passed through unchanged, which
# JSON permits.
json_escape() {
  local s="${1-}"
  local bs='\' dq='"'

  # Backslash must be handled first so later escapes are not doubled.
  s=${s//"$bs"/"$bs$bs"}
  s=${s//"$dq"/"$bs$dq"}
  s=${s//$'\n'/"${bs}n"}
  s=${s//$'\r'/"${bs}r"}
  s=${s//$'\t'/"${bs}t"}
  s=${s//$'\b'/"${bs}b"}
  s=${s//$'\f'/"${bs}f"}

  # Any control character left over gets the generic \uXXXX form.
  if [[ "$s" == *[[:cntrl:]]* ]]; then
    local out="" ch code i
    for (( i = 0; i < ${#s}; i++ )); do
      ch=${s:i:1}
      if [[ "$ch" == [[:cntrl:]] ]]; then
        printf -v code '\\u%04x' "'$ch"
        out+=$code
      else
        out+=$ch
      fi
    done
    s=$out
  fi

  printf '"%s"\n' "$s"
}
# Determine wear percentage if available.
#
# Arguments: $1 - complete smartctl output for one device
# Outputs:   one line "used_pct|remaining_pct|source|confidence"
#            - NVMe: used_pct is "Percentage Used" -> confidence "real";
#              remaining_pct is 100 - used, clamped at 0
#            - ATA: the first of Percent_Lifetime_Remain, SSD_Life_Left,
#              Media_Wearout_Indicator whose value lies in 0..100 is taken as
#              "remaining" -> confidence "best_effort"
#            - nothing usable: "||unknown|none"
# Returns:   0 when a metric was found, 1 otherwise.
#
# For the ATA attributes the normalized VALUE column ($4) is read, not
# RAW_VALUE ($10). NOTE(review): this assumes the normalized value is the one
# that counts down from 100 as the drive wears, while the raw value is
# vendor-defined (often 0, or the percentage already used). Reading the raw
# column as "remaining" makes a new drive look fully worn. Confirm against
# the drive models in use; the result stays best effort.
# Values are forced to base 10 because the column is zero-padded ("095").
wear_probe() {
  local smart_out="$1"

  # NVMe (reliable)
  local nvme_used
  nvme_used="$(printf '%s\n' "$smart_out" | parse_nvme_int "Percentage Used")"
  if [[ "$nvme_used" =~ ^[0-9]+$ ]]; then
    nvme_used=$(( 10#$nvme_used ))
    local rem=$(( 100 - nvme_used ))
    (( rem >= 0 )) || rem=0
    echo "${nvme_used}|${rem}|nvme_percentage_used|real"
    return 0
  fi

  # ATA (best effort; vendor-specific). Attributes with explicit "remaining"
  # semantics are tried first; Media_Wearout_Indicator is only assumed to
  # mean "remaining".
  local attr src v
  for attr in Percent_Lifetime_Remain SSD_Life_Left Media_Wearout_Indicator; do
    v="$(printf '%s\n' "$smart_out" | awk -v a="$attr" '$2 == a { print $4; exit }')"
    [[ "$v" =~ ^[0-9]+$ ]] || continue
    v=$(( 10#$v ))
    (( v <= 100 )) || continue
    case "$attr" in
      Percent_Lifetime_Remain) src="ata_percent_lifetime_remain" ;;
      SSD_Life_Left) src="ata_ssd_life_left" ;;
      *) src="ata_media_wearout_indicator_assumed_remaining" ;;
    esac
    echo "$(( 100 - v ))|${v}|${src}|best_effort"
    return 0
  done

  # Exactly four fields, matching the documented format, so that a caller
  # splitting on "|" gets source=unknown and confidence=none.
  echo "||unknown|none"
  return 1
}
# Print the disks to examine, one kernel name per line.
#   - Without --dev: every lsblk entry with TYPE=disk and ROTA=0 (SSD-ish).
#   - With --dev:    the single disk that ONLY_DEV resolves to.
# Globals: ONLY_DEV (read)
# Exits:   2 with an error on stderr when ONLY_DEV cannot be resolved.
#          Note that `exit` only ends the current shell, so a caller running
#          this function in a subshell has to check the status itself.
list_disks() {
  if [[ -z "$ONLY_DEV" ]]; then
    lsblk -dn -o NAME,TYPE,ROTA 2>/dev/null \
      | awk '$2 == "disk" && $3 == 0 { print $1 }'
    return
  fi

  local resolved
  if ! resolved="$(normalize_to_disk_name "$ONLY_DEV")"; then
    echo "ERROR: --dev '$ONLY_DEV' is not a valid disk/partition path on this host." >&2
    exit 2
  fi
  echo "$resolved"
}
# Filesystem usage for mounted partitions belonging to a disk.
#
# Arguments: $1 - disk name without the /dev/ prefix
# Outputs:   one "name|mountpoint|fstype|size" line per mounted filesystem
#            found on the disk or on the devices layered on top of it
#
# Only rows whose mountpoint is an absolute path are reported. lsblk also
# shows pseudo mountpoints such as "[SWAP]"; those are not directories, so a
# later `df` on them fails, and under `set -e`/`pipefail` that failure would
# abort the whole report.
fs_usage_for_disk() {
  local disk="$1"
  lsblk -nr "/dev/$disk" -o NAME,MOUNTPOINT,FSTYPE,SIZE 2>/dev/null \
    | awk 'NF >= 4 && $2 ~ /^\// { print $1 "|" $2 "|" $3 "|" $4 }'
}
# Severity logic: fold the collected metrics into one level.
#
# Arguments: $1 health   - PASSED | FAILED | UNKNOWN
#            $2 temp     - temperature in Celsius (optional)
#            $3 pct_used - wear used in percent, higher is worse (optional)
#            $4 realloc, $5 pending, $6 offline - ATA sector counters
#            $7 nvme_crit_warn, $8 media_err    - NVMe counters
#            Empty or non-numeric values are ignored.
# Outputs:   0 (ok), 1 (warning) or 2 (critical)
#
# Rules: FAILED health or any non-zero counter is critical; wear >= 90 or
# temperature >= 70 is critical; wear >= 70 or temperature >= 60 is a warning.
#
# Every number is evaluated as 10#value. A zero-padded value such as "008" or
# "090" is otherwise read by bash as an invalid octal literal, the (( )) test
# errors out, and the critical condition is silently skipped.
severity_from_metrics() {
  local health="$1"
  local temp="${2:-}"
  local pct_used="${3:-}" # wear used (higher is worse) if known
  local realloc="${4:-0}"
  local pending="${5:-0}"
  local offline="${6:-0}"
  local nvme_crit_warn="${7:-0}"
  local media_err="${8:-0}"
  local sev=0
  local counter

  if [[ "$health" == "FAILED" ]]; then
    sev=2
  fi

  # Any of these being non-zero is critical on its own.
  for counter in "$nvme_crit_warn" "$realloc" "$pending" "$offline" "$media_err"; do
    if [[ "$counter" =~ ^[0-9]+$ ]] && (( 10#$counter > 0 )); then
      sev=2
    fi
  done

  if [[ "$pct_used" =~ ^[0-9]+$ ]]; then
    if (( 10#$pct_used >= 90 )); then
      sev=2
    elif (( 10#$pct_used >= 70 )) && (( sev < 1 )); then
      sev=1
    fi
  fi

  if [[ "$temp" =~ ^[0-9]+$ ]]; then
    if (( 10#$temp >= 70 )); then
      sev=2
    elif (( 10#$temp >= 60 )) && (( sev < 1 )); then
      sev=1
    fi
  fi

  echo "$sev"
}
# Produce the human-readable report for every selected disk and exit with the
# worst severity seen.
#
# Globals: DETAIL (read) - 1 adds the first 140 lines of raw smartctl output
# Exits:   0 = OK, 1 = WARNING/INCONCLUSIVE, 2 = CRITICAL or the disk list
#          could not be produced.
#
# Two failure modes are handled explicitly:
#   - The disk list is captured before anything is printed and its status is
#     checked. Reading it from a process substitution hides a failing
#     list_disks (its `exit 2` only ends the subshell), so an invalid --dev
#     would end in "Overall: OK".
#   - Informational lookups (lsblk columns, df) are allowed to fail. Under
#     `set -e`/`pipefail` a single failing df would otherwise abort the
#     whole report.
main_text() {
  local overall_sev=0

  local disk_list
  if ! disk_list="$(list_disks)"; then
    echo "ERROR: could not enumerate disks; aborting." >&2
    exit 2
  fi

  echo "=== SSD/NVMe Health Report ($(date -Is)) ==="
  echo

  local d
  while read -r d; do
    [[ -z "$d" ]] && continue

    local dtype model serial size tran rota smart_out
    dtype="$(smart_scan_type_for "$d")"
    model="$(lsblk -dn -o MODEL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" || true
    serial="$(lsblk -dn -o SERIAL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" || true
    size="$(lsblk -dn -o SIZE "/dev/$d" 2>/dev/null | head -n1)" || true
    tran="$(lsblk -dn -o TRAN "/dev/$d" 2>/dev/null | head -n1)" || true
    rota="$(lsblk -dn -o ROTA "/dev/$d" 2>/dev/null | head -n1)" || true
    smart_out="$(smart_run "$d" "$dtype")"

    local smart_access
    smart_access="$(printf "%s\n" "$smart_out" | smart_access_state)"

    local health temp
    health="$(printf "%s\n" "$smart_out" | parse_health)"
    temp="$(printf "%s\n" "$smart_out" | parse_temp_c)"

    # NVMe fields (if present)
    local nvme_crit_warn media_err unsafe_shutdowns poh duw dur
    nvme_crit_warn="$(printf "%s\n" "$smart_out" | parse_nvme_int "Critical Warning")"
    media_err="$(printf "%s\n" "$smart_out" | parse_nvme_int "Media and Data Integrity Errors")"
    unsafe_shutdowns="$(printf "%s\n" "$smart_out" | parse_nvme_int "Unsafe Shutdowns")"
    poh="$(printf "%s\n" "$smart_out" | parse_nvme_int "Power On Hours")"
    duw="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_written)"
    dur="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_read)"

    # ATA attributes (if present)
    local realloc pending offline crc poh_ata
    realloc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Reallocated_Sector_Ct")"
    pending="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Current_Pending_Sector")"
    offline="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Offline_Uncorrectable")"
    crc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "UDMA_CRC_Error_Count")"
    poh_ata="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Power_On_Hours")"

    # Wear probe (the preferred signal for "is the disk wearing out")
    local wear_used wear_rem wear_source wear_conf
    IFS="|" read -r wear_used wear_rem wear_source wear_conf < <(wear_probe "$smart_out" || true)

    # Can we diagnose wear?
    local wear_known=0
    [[ "$wear_used" =~ ^[0-9]+$ ]] && wear_known=1

    # If SMART is not accessible, we cannot diagnose properly.
    local diagnosable=1
    if [[ "$smart_access" != "OK" ]]; then
      diagnosable=0
    elif [[ "$wear_known" != "1" && "$health" == "UNKNOWN" ]]; then
      # We got some SMART text but not enough to assess wear nor health.
      diagnosable=0
    fi

    local sev
    sev="$(severity_from_metrics "$health" "$temp" "${wear_used:-}" "${realloc:-0}" "${pending:-0}" "${offline:-0}" "${nvme_crit_warn:-0}" "${media_err:-0}")"

    # If not diagnosable, force WARNING (so it gets noticed).
    if [[ "$diagnosable" == "0" && "$sev" -lt 1 ]]; then
      sev=1
    fi

    local status
    if [[ "$sev" == "2" ]]; then
      status="CRITICAL"
    elif [[ "$diagnosable" == "0" ]]; then
      status="INCONCLUSIVE"
    elif [[ "$sev" == "1" ]]; then
      status="WARNING"
    else
      status="OK"
    fi

    if (( sev > overall_sev )); then
      overall_sev="$sev"
    fi

    echo "Device: /dev/$d ($status)"
    echo " Model: ${model:-?} Serial: ${serial:-?} Size: ${size:-?} TRAN: ${tran:-?} ROTA: ${rota:-?} smartctl -d: ${dtype:-auto}"
    echo " SMART access: $smart_access SMART health: $health Temp(C): ${temp:-?}"

    if [[ "$wear_known" == "1" ]]; then
      echo " Wear: used=${wear_used}% remaining=${wear_rem}% source=${wear_source} confidence=${wear_conf}"
    else
      echo " Wear: (cannot determine) — no usable wear metric exposed (common on some USB bridges / drives)"
    fi

    if [[ -n "${nvme_crit_warn:-}" || -n "${media_err:-}" || -n "${unsafe_shutdowns:-}" || -n "${poh:-}" ]]; then
      echo " NVMe: crit_warn=${nvme_crit_warn:-?} media_err=${media_err:-?} unsafe_shutdowns=${unsafe_shutdowns:-?} power_on_hours=${poh:-?}"
      [[ -n "${duw:-}" ]] && echo " NVMe: data_units_written~${duw}"
      [[ -n "${dur:-}" ]] && echo " NVMe: data_units_read~${dur}"
    fi

    if [[ -n "${realloc:-}" || -n "${pending:-}" || -n "${offline:-}" || -n "${crc:-}" ]]; then
      echo " ATA: realloc=${realloc:-?} pending=${pending:-?} offline_uncorrectable=${offline:-?} crc_errors=${crc:-?} power_on_hours=${poh_ata:-?}"
    fi

    if [[ "$diagnosable" == "0" ]]; then
      echo " DIAGNOSTIC NOTE: Insufficient SMART data to give a reliable wear/health diagnosis for this device."
    fi

    echo " Filesystems:"
    local fsline foundfs=0
    while IFS= read -r fsline; do
      [[ -z "$fsline" ]] && continue
      foundfs=1
      # Split "name|mountpoint|fstype|size" from the left.
      local pname mnt fstype psize
      pname="${fsline%%|*}"; fsline="${fsline#*|}"
      mnt="${fsline%%|*}"; fsline="${fsline#*|}"
      fstype="${fsline%%|*}"; psize="${fsline#*|}"
      local dfline
      # df may fail for this mountpoint; report empty figures instead of dying.
      dfline="$(df -P "$mnt" 2>/dev/null | awk 'NR==2{print $2,$3,$4,$5}')" || true
      echo " /dev/$pname mnt=$mnt fstype=$fstype part_size=$psize df(total used avail use%)=($dfline)"
    done < <(fs_usage_for_disk "$d" || true)
    [[ "$foundfs" == "0" ]] && echo " (no mounted partitions found)"

    if [[ "$DETAIL" == "1" ]]; then
      echo " --- smartctl raw (trimmed) ---"
      printf "%s\n" "$smart_out" | sed -n '1,140p'
      echo " --- end ---"
    fi
    echo
  done <<<"$disk_list"

  if [[ "$overall_sev" == "2" ]]; then
    echo "Overall: CRITICAL"
    exit 2
  elif [[ "$overall_sev" == "1" ]]; then
    echo "Overall: WARNING/INCONCLUSIVE"
    exit 1
  else
    echo "Overall: OK"
    exit 0
  fi
}
# Produce the JSON report for every selected disk and exit with the worst
# severity seen (0 = OK, 1 = WARNING/INCONCLUSIVE, 2 = CRITICAL).
#
# Output shape: an object with "timestamp", a "devices" array (one object per
# disk), "overall" and "overall_severity". Metric values are emitted as JSON
# strings; "known", "diagnosable" and "severity" are bare numbers.
#
# Two failure modes are handled explicitly:
#   - The disk list is captured and checked before the opening brace is
#     printed. Reading it from a process substitution hides a failing
#     list_disks (its `exit 2` only ends the subshell), so an invalid --dev
#     would produce an empty but "OK" document.
#   - Informational lookups (lsblk columns, df) are allowed to fail. In
#     particular `read` returns non-zero when df prints nothing, which under
#     `set -e` would stop the script in the middle of the JSON document.
main_json() {
  local overall_sev=0
  local first=1

  local disk_list
  if ! disk_list="$(list_disks)"; then
    echo "ERROR: could not enumerate disks; aborting." >&2
    exit 2
  fi

  echo "{"
  echo " \"timestamp\": \"$(date -Is)\","
  echo " \"devices\": ["

  local d
  while read -r d; do
    [[ -z "$d" ]] && continue

    local dtype model serial size tran rota smart_out
    dtype="$(smart_scan_type_for "$d")"
    model="$(lsblk -dn -o MODEL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" || true
    serial="$(lsblk -dn -o SERIAL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" || true
    size="$(lsblk -dn -o SIZE "/dev/$d" 2>/dev/null | head -n1)" || true
    tran="$(lsblk -dn -o TRAN "/dev/$d" 2>/dev/null | head -n1)" || true
    rota="$(lsblk -dn -o ROTA "/dev/$d" 2>/dev/null | head -n1)" || true
    smart_out="$(smart_run "$d" "$dtype")"

    local smart_access
    smart_access="$(printf "%s\n" "$smart_out" | smart_access_state)"

    local health temp
    health="$(printf "%s\n" "$smart_out" | parse_health)"
    temp="$(printf "%s\n" "$smart_out" | parse_temp_c)"

    # NVMe fields (if present)
    local nvme_crit_warn media_err unsafe_shutdowns poh duw dur
    nvme_crit_warn="$(printf "%s\n" "$smart_out" | parse_nvme_int "Critical Warning")"
    media_err="$(printf "%s\n" "$smart_out" | parse_nvme_int "Media and Data Integrity Errors")"
    unsafe_shutdowns="$(printf "%s\n" "$smart_out" | parse_nvme_int "Unsafe Shutdowns")"
    poh="$(printf "%s\n" "$smart_out" | parse_nvme_int "Power On Hours")"
    duw="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_written)"
    dur="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_read)"

    # ATA attributes (if present)
    local realloc pending offline crc poh_ata
    realloc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Reallocated_Sector_Ct")"
    pending="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Current_Pending_Sector")"
    offline="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Offline_Uncorrectable")"
    crc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "UDMA_CRC_Error_Count")"
    poh_ata="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Power_On_Hours")"

    local wear_used wear_rem wear_source wear_conf
    IFS="|" read -r wear_used wear_rem wear_source wear_conf < <(wear_probe "$smart_out" || true)

    local wear_known=0
    [[ "$wear_used" =~ ^[0-9]+$ ]] && wear_known=1

    # Not diagnosable when SMART is inaccessible, or when neither wear nor
    # the health verdict could be read.
    local diagnosable=1
    if [[ "$smart_access" != "OK" ]]; then
      diagnosable=0
    elif [[ "$wear_known" != "1" && "$health" == "UNKNOWN" ]]; then
      diagnosable=0
    fi

    local sev
    sev="$(severity_from_metrics "$health" "$temp" "${wear_used:-}" "${realloc:-0}" "${pending:-0}" "${offline:-0}" "${nvme_crit_warn:-0}" "${media_err:-0}")"

    # If not diagnosable, force WARNING (so it gets noticed).
    if [[ "$diagnosable" == "0" && "$sev" -lt 1 ]]; then
      sev=1
    fi

    if (( sev > overall_sev )); then
      overall_sev="$sev"
    fi

    # Separator between device objects (none before the first one).
    [[ "$first" == "1" ]] || echo " ,"
    first=0

    # Filesystems
    local fs_items=""
    local fsline
    while IFS= read -r fsline; do
      [[ -z "$fsline" ]] && continue
      # Split "name|mountpoint|fstype|size" from the left.
      local pname mnt fstype psize
      pname="${fsline%%|*}"; fsline="${fsline#*|}"
      mnt="${fsline%%|*}"; fsline="${fsline#*|}"
      fstype="${fsline%%|*}"; psize="${fsline#*|}"
      local df_total df_used df_avail df_usep
      # read fails at EOF when df printed nothing; keep going with empty figures.
      read -r df_total df_used df_avail df_usep < <(df -P "$mnt" 2>/dev/null | awk 'NR==2{print $2,$3,$4,$5}') || true
      fs_items+="{\"partition\":\"/dev/$pname\",\"mount\":$(json_escape "$mnt"),\"fstype\":$(json_escape "$fstype"),\"part_size\":$(json_escape "$psize"),\"df\":{\"total\":$(json_escape "${df_total:-}"),\"used\":$(json_escape "${df_used:-}"),\"avail\":$(json_escape "${df_avail:-}"),\"use_pct\":$(json_escape "${df_usep:-}")}},"
    done < <(fs_usage_for_disk "$d" || true)

    local fs_json="[]"
    [[ -n "$fs_items" ]] && fs_json="[${fs_items%,}]"

    cat <<EOF
{
"device": "/dev/$d",
"model": $(json_escape "${model:-}"),
"serial": $(json_escape "${serial:-}"),
"size": $(json_escape "${size:-}"),
"tran": $(json_escape "${tran:-}"),
"rota": $(json_escape "${rota:-}"),
"smartctl_device_type": $(json_escape "${dtype:-auto}"),
"smart_access": $(json_escape "$smart_access"),
"health": $(json_escape "$health"),
"temp_c": $(json_escape "${temp:-}"),
"wear": {
"used_pct": $(json_escape "${wear_used:-}"),
"remaining_pct": $(json_escape "${wear_rem:-}"),
"source": $(json_escape "${wear_source:-}"),
"confidence": $(json_escape "${wear_conf:-}"),
"known": $wear_known
},
"nvme": {
"critical_warning": $(json_escape "${nvme_crit_warn:-}"),
"media_errors": $(json_escape "${media_err:-}"),
"unsafe_shutdowns": $(json_escape "${unsafe_shutdowns:-}"),
"power_on_hours": $(json_escape "${poh:-}"),
"data_units_written": $(json_escape "${duw:-}"),
"data_units_read": $(json_escape "${dur:-}")
},
"ata": {
"reallocated_sectors": $(json_escape "${realloc:-}"),
"pending_sectors": $(json_escape "${pending:-}"),
"offline_uncorrectable": $(json_escape "${offline:-}"),
"crc_errors": $(json_escape "${crc:-}"),
"power_on_hours": $(json_escape "${poh_ata:-}")
},
"diagnosable": $diagnosable,
"severity": $sev,
"filesystems": $fs_json
}
EOF
  done <<<"$disk_list"

  echo " ],"

  local overall="OK"
  [[ "$overall_sev" == "1" ]] && overall="WARNING/INCONCLUSIVE"
  [[ "$overall_sev" == "2" ]] && overall="CRITICAL"

  echo " \"overall\": \"$overall\","
  echo " \"overall_severity\": $overall_sev"
  echo "}"
  exit "$overall_sev"
}
# Entry point: validate the requested device, then run the selected reporter.
#
# Globals: ONLY_DEV, MODE (read)
# Exits:   2 when --dev does not name a disk or partition on this host;
#          otherwise the reporter's own exit status.
#
# The device is validated here, in the main shell and before any report
# output. The reporters enumerate disks inside a subshell, where an `exit`
# cannot stop the script, so an invalid --dev would otherwise still yield an
# empty report with status 0.
main() {
  if [[ -n "${ONLY_DEV:-}" ]] && ! normalize_to_disk_name "$ONLY_DEV" >/dev/null; then
    echo "ERROR: --dev '$ONLY_DEV' is not a valid disk/partition path on this host." >&2
    exit 2
  fi

  if [[ "${MODE:-text}" == "json" ]]; then
    main_json
  else
    main_text
  fi
}
main "$@"