#!/usr/bin/env bash
# ssd-health.sh — SSD/NVMe health + wear + filesystem usage summary
# Exit codes: 0=OK, 1=WARNING/INCONCLUSIVE, 2=CRITICAL
# (exit code 2 is also used for usage errors and missing dependencies)

set -euo pipefail

MODE="text"       # text|json
DETAIL=0
ONLY_DEV=""       # user requested device (may be /dev/sda, sda, /dev/sda1, etc.)
TIMEOUT_SECS=8

# Print the command-line help to stdout.
usage() {
  cat <<'EOF'
Usage:
  ssd-health.sh [--json] [--detail] [--dev <device>] [--timeout <seconds>]

Notes:
  --dev accepts: sda | /dev/sda | nvme0n1 | /dev/nvme0n1 | /dev/sda1 (partition -> parent disk)

Examples:
  sudo ./ssd-health.sh
  sudo ./ssd-health.sh --json
  sudo ./ssd-health.sh --dev /dev/sda
  sudo ./ssd-health.sh --dev /dev/nvme0n1
EOF
}

# Return 0 when command $1 is available on PATH.
need_cmd() { command -v "$1" >/dev/null 2>&1; }

# Report a command-line error ($1) plus the usage text on stderr, then exit 2.
die_usage() {
  printf '%s\n' "$1" >&2
  usage >&2
  exit 2
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --json)
      MODE="json"
      shift
      ;;
    --detail)
      DETAIL=1
      shift
      ;;
    --dev|--disk)
      # Without this guard "shift 2" fails under set -e and the script dies
      # silently when the option is the last word on the command line.
      [[ $# -ge 2 ]] || die_usage "Option $1 requires a device argument."
      ONLY_DEV="$2"
      shift 2
      ;;
    --timeout)
      [[ $# -ge 2 ]] || die_usage "Option $1 requires a number of seconds."
      TIMEOUT_SECS="${2:-8}"
      # The value is later passed to timeout(1) as "<n>s", so it must be a
      # plain number; anything else would make every smartctl call fail.
      if ! [[ "$TIMEOUT_SECS" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        die_usage "Invalid --timeout value: $TIMEOUT_SECS (expected a number of seconds)."
      fi
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      die_usage "Unknown arg: $1"
      ;;
  esac
done

if ! need_cmd lsblk || ! need_cmd awk || ! need_cmd sed || ! need_cmd grep; then
  echo "Missing required base commands (lsblk/awk/sed/grep)." >&2
  exit 2
fi
if ! need_cmd smartctl; then
  echo "Missing smartctl. Install smartmontools:" >&2
  echo "  sudo apt-get update && sudo apt-get install -y smartmontools" >&2
  exit 2
fi
# json_escape shells out to python3; fail early instead of emitting broken JSON.
if [[ "$MODE" == "json" ]] && ! need_cmd python3; then
  echo "Missing python3 (required by --json for string escaping)." >&2
  exit 2
fi

# Normalize user input into a disk name (e.g. sda, nvme0n1).
# Accepts /dev/sda, sda, /dev/sda1 (partition), etc.
# Arguments: $1 - device name or path
# Outputs:   kernel name of the disk on stdout
# Returns:   0 on success, 1 if $1 is empty or is neither a disk nor a partition
normalize_to_disk_name() {
  local arg="${1:-}"
  [[ -n "$arg" ]] || return 1

  # Strip /dev/ prefix if present, then rebuild the full path.
  local dev="/dev/${arg#/dev/}"

  # -d (nodeps) restricts lsblk to the device itself; without it a whole disk
  # also lists its partitions and would look like TYPE=part.
  local kind
  kind="$(lsblk -dno TYPE "$dev" 2>/dev/null)" || return 1
  kind="${kind//[[:space:]]/}"

  local name
  case "$kind" in
    part)
      # Example: sda1 -> sda, nvme0n1p2 -> nvme0n1
      name="$(lsblk -dno PKNAME "$dev" 2>/dev/null)" || return 1
      ;;
    disk)
      name="$(lsblk -dno KNAME "$dev" 2>/dev/null)" || return 1
      ;;
    *)
      # Unknown.
      return 1
      ;;
  esac

  name="${name//[[:space:]]/}"
  [[ -n "$name" ]] || return 1
  printf '%s\n' "$name"
}

# Determine SMART device type (if any) via smartctl scan-open.
# Arguments: $1 - disk name without /dev/ (e.g. sda)
# Outputs:   the word following "-d" on the matching scan line; nothing if the
#            device is not listed or the line carries no "-d"
smart_scan_type_for() {
  local dev="/dev/$1"
  # "|| true": a failing scan (or SIGPIPE after awk exits early) must not
  # abort the script under pipefail/errexit.
  smartctl --scan-open 2>/dev/null \
    | awk -v d="$dev" '
        $1 == d {
          for (i = 2; i < NF; i++) {
            if ($i == "-d") { print $(i + 1); exit }
          }
          exit
        }
      ' || true
}

# Run smartctl against a disk and print its output (best effort).
# Globals:   TIMEOUT_SECS (read)
# Arguments: $1 - disk name without /dev/
#            $2 - smartctl device type for -d (empty = let smartctl decide)
# Outputs:   smartctl stdout; stderr is discarded
# Returns:   always 0 — smartctl/timeout failures are deliberately swallowed so
#            callers can classify the (possibly empty) output themselves
smart_run() {
  local name="$1"
  local dtype="${2:-}"
  local -a cmd=()

  # Bound the runtime when timeout(1) exists.
  if need_cmd timeout; then
    cmd+=(timeout "${TIMEOUT_SECS}s")
  fi
  cmd+=(smartctl -a -H -i -A -n standby)
  if [[ -n "$dtype" ]]; then
    cmd+=(-d "$dtype")
  fi
  cmd+=("/dev/$name")

  "${cmd[@]}" 2>/dev/null || true
}

# Detect if smartctl output is usable (bridge might block SMART, timeout, etc.)
# Reads smartctl output on stdin; prints OK, UNSUPPORTED or UNAVAILABLE.
smart_access_state() {
  # Heuristics: if we see the START OF INFORMATION SECTION or NVMe SMART section, it's likely usable.
  awk '
    /=== START OF INFORMATION SECTION ===/ { print "OK"; found = 1; exit }
    /SMART\/Health Information/            { print "OK"; found = 1; exit }
    /NVMe Log 0x02/                        { print "OK"; found = 1; exit }
    /Device does not support SMART/        { print "UNSUPPORTED"; found = 1; exit }
    /Unknown USB bridge/                   { print "UNSUPPORTED"; found = 1; exit }
    END { if (!found) print "UNAVAILABLE" }
  '
}

# Reads smartctl output on stdin; prints PASSED, FAILED or UNKNOWN based on
# the first overall-health / "SMART Health Status" line found.
parse_health() {
  awk '
    /overall-health self-assessment test result:/ {
      if ($0 ~ /PASSED|OK/) print "PASSED"; else print "FAILED"
      found = 1
      exit
    }
    /SMART Health Status:/ {
      if ($0 ~ /OK/) print "PASSED"; else print "FAILED"
      found = 1
      exit
    }
    END { if (!found) print "UNKNOWN" }
  '
}

# Reads smartctl output on stdin; prints the temperature from the first
# matching line, or nothing when no temperature line is present.
parse_temp_c() {
  awk '
    /Temperature:/ && $0 ~ /Celsius/ { for (i = 1; i <= NF; i++) if ($i ~ /^[0-9]+$/) { print $i; exit } }
    /Current Drive Temperature:/     { for (i = 1; i <= NF; i++) if ($i ~ /^[0-9]+$/) { print $i; exit } }
    /Temperature_Celsius/            { print $10; exit }
    /Temperature_Internal/           { print $10; exit }
  '
}

# Reads smartctl output on stdin; prints field 10 of the first line whose
# second field equals the attribute name $1.
parse_ata_attr_raw() {
  local attr="$1"
  awk -v a="$attr" '$2==a {print $10; exit}'
}

# Reads smartctl output on stdin; prints the integer value of the first line
# that starts with "<label>:" ($1 = label), or an empty line if it has no digits.
#   - hexadecimal values (e.g. "Critical Warning: 0x04") are printed in decimal
#   - thousands separators and unit suffixes ("1,234", "3%") are stripped
#   - a trailing "[...]" annotation is ignored
# Only the text after the label is examined, so digits inside the label
# never leak into the result.
parse_nvme_int() {
  local label="$1"
  awk -v l="$label" '
    index($0, l ":") == 1 {
      v = substr($0, length(l) + 2)
      sub(/^[ \t]+/, "", v)

      if (v ~ /^0[xX][0-9A-Fa-f]+/) {
        # Manual hex decode: strtonum() is gawk-only.
        hex = tolower(substr(v, 3))
        match(hex, /^[0-9a-f]+/)
        hex = substr(hex, 1, RLENGTH)
        n = 0
        for (i = 1; i <= length(hex); i++)
          n = n * 16 + index("0123456789abcdef", substr(hex, i, 1)) - 1
        printf "%d\n", n
        exit
      }

      sub(/[ \t]*\[.*$/, "", v)
      gsub(/[^0-9]/, "", v)
      if (v != "") {
        # Drop leading zeros so bash arithmetic never treats the value as octal.
        sub(/^0+/, "", v)
        if (v == "") v = "0"
      }
      print v
      exit
    }
  '
}

# Shared helper: reads smartctl output on stdin and prints the contents of the
# "[<number> <unit>]" annotation on the first line starting with "<label>:"
# ($1 = label); prints an empty line when that line has no such annotation.
_parse_nvme_bracketed() {
  local label="$1"
  awk -v l="$label" '
    index($0, l ":") == 1 {
      match($0, /\[[0-9.,]+[[:space:]]*[A-Za-z]+\]/)
      if (RSTART > 0) print substr($0, RSTART + 1, RLENGTH - 2); else print ""
      exit
    }
  '
}

# Bracketed size annotation of the "Data Units Written:" line.
parse_nvme_bytes_units_written() {
  _parse_nvme_bracketed "Data Units Written"
}

# Bracketed size annotation of the "Data Units Read:" line.
parse_nvme_bytes_units_read() {
  _parse_nvme_bracketed "Data Units Read"
}

# Print $1 as a JSON string literal (including the surrounding quotes).
# Requires python3.
json_escape() {
  python3 -c 'import json, sys; print(json.dumps(sys.argv[1]))' "$1"
}

# Determine wear percentage if available.
+# Returns: "used_pct|remaining_pct|source|confidence" +# - NVMe: used_pct is Percentage Used (0..100) -> REAL +# - ATA: tries known life attributes; if found and looks like 0..100, treated as "remaining" (BEST EFFORT) +wear_probe() { + local smart_out="$1" + + # NVMe (reliable) + local nvme_used + nvme_used="$(printf "%s\n" "$smart_out" | parse_nvme_int "Percentage Used")" + if [[ "$nvme_used" =~ ^[0-9]+$ ]]; then + local rem=$(( 100 - nvme_used )) + if (( rem < 0 )); then rem=0; fi + echo "${nvme_used}|${rem}|nvme_percentage_used|real" + return 0 + fi + + # ATA (best effort; vendor-specific) + # Prefer explicit "remain" semantics first. + local v + v="$(printf "%s\n" "$smart_out" | awk '$2=="Percent_Lifetime_Remain" {print $10; exit}')" + if [[ "$v" =~ ^[0-9]+$ ]] && (( v>=0 && v<=100 )); then + echo "$((100 - v))|$v|ata_percent_lifetime_remain|best_effort" + return 0 + fi + + v="$(printf "%s\n" "$smart_out" | awk '$2=="SSD_Life_Left" {print $10; exit}')" + if [[ "$v" =~ ^[0-9]+$ ]] && (( v>=0 && v<=100 )); then + echo "$((100 - v))|$v|ata_ssd_life_left|best_effort" + return 0 + fi + + # Media_Wearout_Indicator is often "remaining", but not guaranteed → best_effort. + v="$(printf "%s\n" "$smart_out" | awk '$2=="Media_Wearout_Indicator" {print $10; exit}')" + if [[ "$v" =~ ^[0-9]+$ ]] && (( v>=0 && v<=100 )); then + echo "$((100 - v))|$v|ata_media_wearout_indicator_assumed_remaining|best_effort" + return 0 + fi + + echo "|||unknown|none" + return 1 +} + +# Enumerate candidate disks: TYPE=disk and ROTA=0 (SSD-ish) +list_disks() { + if [[ -n "$ONLY_DEV" ]]; then + local dn + if dn="$(normalize_to_disk_name "$ONLY_DEV")"; then + echo "$dn" + else + echo "ERROR: --dev '$ONLY_DEV' is not a valid disk/partition path on this host." 
>&2 + exit 2 + fi + return + fi + lsblk -dn -o NAME,TYPE,ROTA 2>/dev/null \ + | awk '$2=="disk" {print $1, $3}' \ + | awk '$2==0 {print $1}' +} + +# Filesystem usage for mounted partitions belonging to a disk +fs_usage_for_disk() { + local disk="$1" + lsblk -nr "/dev/$disk" -o NAME,MOUNTPOINT,FSTYPE,SIZE 2>/dev/null \ + | awk 'NF>=4 && $2!="-" && $2!="" {print $1 "|" $2 "|" $3 "|" $4}' +} + +# Severity logic +severity_from_metrics() { + local health="$1" + local temp="${2:-}" + local pct_used="${3:-}" # wear used (higher is worse) if known + local realloc="${4:-0}" + local pending="${5:-0}" + local offline="${6:-0}" + local nvme_crit_warn="${7:-0}" + local media_err="${8:-0}" + + local sev=0 + + if [[ "$health" == "FAILED" ]]; then sev=2; fi + if [[ "$nvme_crit_warn" =~ ^[0-9]+$ ]] && (( nvme_crit_warn > 0 )); then sev=2; fi + if [[ "$realloc" =~ ^[0-9]+$ ]] && (( realloc > 0 )); then sev=2; fi + if [[ "$pending" =~ ^[0-9]+$ ]] && (( pending > 0 )); then sev=2; fi + if [[ "$offline" =~ ^[0-9]+$ ]] && (( offline > 0 )); then sev=2; fi + if [[ "$media_err" =~ ^[0-9]+$ ]] && (( media_err > 0 )); then sev=2; fi + + if [[ "$pct_used" =~ ^[0-9]+$ ]]; then + if (( pct_used >= 90 )); then sev=2 + elif (( pct_used >= 70 )) && (( sev < 1 )); then sev=1 + fi + fi + + if [[ "$temp" =~ ^[0-9]+$ ]]; then + if (( temp >= 70 )); then sev=2 + elif (( temp >= 60 )) && (( sev < 1 )); then sev=1 + fi + fi + + echo "$sev" +} + +main_text() { + local overall_sev=0 + + echo "=== SSD/NVMe Health Report ($(date -Is)) ===" + echo + + while read -r d; do + [[ -z "$d" ]] && continue + + local dtype model serial size tran rota smart_out + dtype="$(smart_scan_type_for "$d")" + model="$(lsblk -dn -o MODEL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" + serial="$(lsblk -dn -o SERIAL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" + size="$(lsblk -dn -o SIZE "/dev/$d" 2>/dev/null | head -n1)" + tran="$(lsblk -dn -o TRAN "/dev/$d" 2>/dev/null | 
head -n1)" + rota="$(lsblk -dn -o ROTA "/dev/$d" 2>/dev/null | head -n1)" + + smart_out="$(smart_run "$d" "$dtype")" + local smart_access + smart_access="$(printf "%s\n" "$smart_out" | smart_access_state)" + + local health temp + health="$(printf "%s\n" "$smart_out" | parse_health)" + temp="$(printf "%s\n" "$smart_out" | parse_temp_c)" + + # NVMe fields (if present) + local nvme_crit_warn media_err unsafe_shutdowns poh duw dur + nvme_crit_warn="$(printf "%s\n" "$smart_out" | parse_nvme_int "Critical Warning")" + media_err="$(printf "%s\n" "$smart_out" | parse_nvme_int "Media and Data Integrity Errors")" + unsafe_shutdowns="$(printf "%s\n" "$smart_out" | parse_nvme_int "Unsafe Shutdowns")" + poh="$(printf "%s\n" "$smart_out" | parse_nvme_int "Power On Hours")" + duw="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_written)" + dur="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_read)" + + # ATA attributes (if present) + local realloc pending offline crc poh_ata + realloc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Reallocated_Sector_Ct")" + pending="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Current_Pending_Sector")" + offline="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Offline_Uncorrectable")" + crc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "UDMA_CRC_Error_Count")" + poh_ata="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Power_On_Hours")" + + # Wear probe (preferred for "disk agotarse") + local wear_used wear_rem wear_source wear_conf + IFS="|" read -r wear_used wear_rem wear_source wear_conf < <(wear_probe "$smart_out" || true) + + # Can we diagnose wear? + local wear_known=0 + [[ "$wear_used" =~ ^[0-9]+$ ]] && wear_known=1 + + # If SMART is not accessible, we cannot diagnose properly. + local diagnosable=1 + if [[ "$smart_access" != "OK" ]]; then + diagnosable=0 + elif [[ "$wear_known" != "1" && "$health" == "UNKNOWN" ]]; then + # We got some SMART text but not enough to assess wear nor health. 
+ diagnosable=0 + fi + + local sev + sev="$(severity_from_metrics "$health" "$temp" "${wear_used:-}" "${realloc:-0}" "${pending:-0}" "${offline:-0}" "${nvme_crit_warn:-0}" "${media_err:-0}")" + + # If not diagnosable, force WARNING (so it gets noticed). + if [[ "$diagnosable" == "0" && "$sev" -lt 1 ]]; then + sev=1 + fi + + local status + if [[ "$sev" == "2" ]]; then + status="CRITICAL" + elif [[ "$diagnosable" == "0" ]]; then + status="INCONCLUSIVE" + elif [[ "$sev" == "1" ]]; then + status="WARNING" + else + status="OK" + fi + + (( sev > overall_sev )) && overall_sev="$sev" + + echo "Device: /dev/$d ($status)" + echo " Model: ${model:-?} Serial: ${serial:-?} Size: ${size:-?} TRAN: ${tran:-?} ROTA: ${rota:-?} smartctl -d: ${dtype:-auto}" + echo " SMART access: $smart_access SMART health: $health Temp(C): ${temp:-?}" + + if [[ "$wear_known" == "1" ]]; then + echo " Wear: used=${wear_used}% remaining=${wear_rem}% source=${wear_source} confidence=${wear_conf}" + else + echo " Wear: (cannot determine) — no usable wear metric exposed (common on some USB bridges / drives)" + fi + + if [[ -n "${nvme_crit_warn:-}" || -n "${media_err:-}" || -n "${unsafe_shutdowns:-}" || -n "${poh:-}" ]]; then + echo " NVMe: crit_warn=${nvme_crit_warn:-?} media_err=${media_err:-?} unsafe_shutdowns=${unsafe_shutdowns:-?} power_on_hours=${poh:-?}" + [[ -n "${duw:-}" ]] && echo " NVMe: data_units_written~${duw}" + [[ -n "${dur:-}" ]] && echo " NVMe: data_units_read~${dur}" + fi + + if [[ -n "${realloc:-}" || -n "${pending:-}" || -n "${offline:-}" || -n "${crc:-}" ]]; then + echo " ATA: realloc=${realloc:-?} pending=${pending:-?} offline_uncorrectable=${offline:-?} crc_errors=${crc:-?} power_on_hours=${poh_ata:-?}" + fi + + if [[ "$diagnosable" == "0" ]]; then + echo " DIAGNOSTIC NOTE: Insufficient SMART data to give a reliable wear/health diagnosis for this device." 
+ fi + + echo " Filesystems:" + local fsline foundfs=0 + while IFS= read -r fsline; do + [[ -z "$fsline" ]] && continue + foundfs=1 + local pname mnt fstype psize + pname="${fsline%%|*}"; fsline="${fsline#*|}" + mnt="${fsline%%|*}"; fsline="${fsline#*|}" + fstype="${fsline%%|*}"; psize="${fsline#*|}" + local dfline + dfline="$(df -P "$mnt" 2>/dev/null | awk 'NR==2{print $2,$3,$4,$5}')" + echo " /dev/$pname mnt=$mnt fstype=$fstype part_size=$psize df(total used avail use%)=($dfline)" + done < <(fs_usage_for_disk "$d" || true) + [[ "$foundfs" == "0" ]] && echo " (no mounted partitions found)" + + if [[ "$DETAIL" == "1" ]]; then + echo " --- smartctl raw (trimmed) ---" + printf "%s\n" "$smart_out" | sed -n '1,140p' + echo " --- end ---" + fi + + echo + done < <(list_disks) + + if [[ "$overall_sev" == "2" ]]; then + echo "Overall: CRITICAL" + exit 2 + elif [[ "$overall_sev" == "1" ]]; then + echo "Overall: WARNING/INCONCLUSIVE" + exit 1 + else + echo "Overall: OK" + exit 0 + fi +} + +main_json() { + local overall_sev=0 + local first=1 + + echo "{" + echo " \"timestamp\": \"$(date -Is)\"," + echo " \"devices\": [" + + while read -r d; do + [[ -z "$d" ]] && continue + + local dtype model serial size tran rota smart_out + dtype="$(smart_scan_type_for "$d")" + model="$(lsblk -dn -o MODEL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" + serial="$(lsblk -dn -o SERIAL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" + size="$(lsblk -dn -o SIZE "/dev/$d" 2>/dev/null | head -n1)" + tran="$(lsblk -dn -o TRAN "/dev/$d" 2>/dev/null | head -n1)" + rota="$(lsblk -dn -o ROTA "/dev/$d" 2>/dev/null | head -n1)" + + smart_out="$(smart_run "$d" "$dtype")" + local smart_access + smart_access="$(printf "%s\n" "$smart_out" | smart_access_state)" + + local health temp + health="$(printf "%s\n" "$smart_out" | parse_health)" + temp="$(printf "%s\n" "$smart_out" | parse_temp_c)" + + local nvme_crit_warn media_err unsafe_shutdowns poh duw dur + 
nvme_crit_warn="$(printf "%s\n" "$smart_out" | parse_nvme_int "Critical Warning")" + media_err="$(printf "%s\n" "$smart_out" | parse_nvme_int "Media and Data Integrity Errors")" + unsafe_shutdowns="$(printf "%s\n" "$smart_out" | parse_nvme_int "Unsafe Shutdowns")" + poh="$(printf "%s\n" "$smart_out" | parse_nvme_int "Power On Hours")" + duw="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_written)" + dur="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_read)" + + local realloc pending offline crc poh_ata + realloc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Reallocated_Sector_Ct")" + pending="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Current_Pending_Sector")" + offline="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Offline_Uncorrectable")" + crc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "UDMA_CRC_Error_Count")" + poh_ata="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Power_On_Hours")" + + local wear_used wear_rem wear_source wear_conf + IFS="|" read -r wear_used wear_rem wear_source wear_conf < <(wear_probe "$smart_out" || true) + + local wear_known=0 + [[ "$wear_used" =~ ^[0-9]+$ ]] && wear_known=1 + + local diagnosable=1 + if [[ "$smart_access" != "OK" ]]; then + diagnosable=0 + elif [[ "$wear_known" != "1" && "$health" == "UNKNOWN" ]]; then + diagnosable=0 + fi + + local sev + sev="$(severity_from_metrics "$health" "$temp" "${wear_used:-}" "${realloc:-0}" "${pending:-0}" "${offline:-0}" "${nvme_crit_warn:-0}" "${media_err:-0}")" + if [[ "$diagnosable" == "0" && "$sev" -lt 1 ]]; then + sev=1 + fi + + (( sev > overall_sev )) && overall_sev="$sev" + + [[ "$first" == "1" ]] || echo " ," + first=0 + + # Filesystems + local fs_items="" + local fsline + while IFS= read -r fsline; do + [[ -z "$fsline" ]] && continue + local pname mnt fstype psize + pname="${fsline%%|*}"; fsline="${fsline#*|}" + mnt="${fsline%%|*}"; fsline="${fsline#*|}" + fstype="${fsline%%|*}"; psize="${fsline#*|}" + local df_total df_used df_avail 
df_usep + read -r df_total df_used df_avail df_usep < <(df -P "$mnt" 2>/dev/null | awk 'NR==2{print $2,$3,$4,$5}') + fs_items+="{\"partition\":\"/dev/$pname\",\"mount\":$(json_escape "$mnt"),\"fstype\":$(json_escape "$fstype"),\"part_size\":$(json_escape "$psize"),\"df\":{\"total\":$(json_escape "${df_total:-}"),\"used\":$(json_escape "${df_used:-}"),\"avail\":$(json_escape "${df_avail:-}"),\"use_pct\":$(json_escape "${df_usep:-}")}}," + done < <(fs_usage_for_disk "$d" || true) + local fs_json="[]" + [[ -n "$fs_items" ]] && fs_json="[${fs_items%,}]" + + cat <