#!/usr/bin/env bash
# ssd-health.sh — SSD/NVMe health + wear + filesystem usage summary
# Exit codes: 0=OK, 1=WARNING/INCONCLUSIVE, 2=CRITICAL
# (2 is also used for usage errors and missing dependencies)
set -euo pipefail

MODE="text"      # text|json
DETAIL=0
ONLY_DEV=""      # user requested device (may be /dev/sda, sda, /dev/sda1, etc.)
TIMEOUT_SECS=8   # per-device smartctl timeout, in seconds

# Print the command-line help text on stdout.
usage() {
  cat <<'EOF'
Usage:
  ssd-health.sh [--json] [--detail] [--dev <device>] [--timeout <seconds>]

Notes:
  --dev accepts: sda | /dev/sda | nvme0n1 | /dev/nvme0n1 | /dev/sda1 (partition -> parent disk)

Examples:
  sudo ./ssd-health.sh
  sudo ./ssd-health.sh --json
  sudo ./ssd-health.sh --dev /dev/sda
  sudo ./ssd-health.sh --dev /dev/nvme0n1
EOF
}

# Return 0 when command $1 can be found on PATH, non-zero otherwise.
need_cmd() { command -v "$1" >/dev/null 2>&1; }

# --- Argument parsing -------------------------------------------------------
while [[ $# -gt 0 ]]; do
  case "$1" in
    --json)
      MODE="json"
      shift
      ;;
    --detail)
      DETAIL=1
      shift
      ;;
    --dev | --disk)
      # Without this guard `shift 2` fails and `set -e` ends the script
      # silently with status 1.
      if [[ $# -lt 2 ]]; then
        echo "Option $1 requires a device argument." >&2
        usage >&2
        exit 2
      fi
      ONLY_DEV="$2"
      shift 2
      ;;
    --timeout)
      if [[ $# -lt 2 ]]; then
        echo "Option $1 requires a number of seconds." >&2
        usage >&2
        exit 2
      fi
      TIMEOUT_SECS="${2:-8}"
      # The value is later handed to timeout(1) as "<value>s".
      if [[ ! "$TIMEOUT_SECS" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        echo "Invalid --timeout value: $2 (expected a number of seconds)." >&2
        exit 2
      fi
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 2
      ;;
  esac
done

# --- Dependency checks ------------------------------------------------------
if ! need_cmd lsblk || ! need_cmd awk || ! need_cmd sed || ! need_cmd grep; then
  echo "Missing required base commands (lsblk/awk/sed/grep)." >&2
  exit 2
fi

if ! need_cmd smartctl; then
  echo "Missing smartctl. Install smartmontools:" >&2
  echo " sudo apt-get update && sudo apt-get install -y smartmontools" >&2
  exit 2
fi

# Normalize user input into a disk name (e.g. sda, nvme0n1).
# Accepts /dev/sda, sda, /dev/sda1 (partition), etc.
# Arguments: $1 - device name or path
# Outputs:   the disk name on stdout
# Returns:   0 on success, 1 when the input is empty or is neither a disk
#            nor a partition with a known parent
normalize_to_disk_name() {
  local arg="${1:-}"
  [[ -n "$arg" ]] || return 1

  # Strip /dev/ prefix if present.
  arg="${arg#/dev/}"

  # -d restricts lsblk to the named device itself, so child partitions of a
  # disk cannot be mistaken for the device's own type. The result is captured
  # instead of piped into `grep -q`, which could fail under pipefail.
  local dev_type parent
  dev_type="$(lsblk -dn -o TYPE "/dev/$arg" 2>/dev/null | head -n1)" || dev_type=""

  case "$dev_type" in
    part)
      # Example: sda1 -> sda, nvme0n1p2 -> nvme0n1
      parent="$(lsblk -dn -o PKNAME "/dev/$arg" 2>/dev/null | head -n1)" || parent=""
      [[ -n "$parent" ]] || return 1
      printf '%s\n' "$parent"
      ;;
    disk)
      printf '%s\n' "$arg"
      ;;
    *)
      # Unknown.
      return 1
      ;;
  esac
}

# Determine SMART device type (if any) via smartctl scan-open.
# Arguments: $1 - disk name without /dev/
# Outputs:   the value following "-d" on the matching scan line, or nothing
# Returns:   always 0; a failed scan just means "no explicit type"
smart_scan_type_for() {
  local dev="/dev/$1"
  smartctl --scan-open 2>/dev/null \
    | awk -v d="$dev" '
        $1 == d {
          for (i = 1; i <= NF; i++) {
            if ($i == "-d") { print $(i + 1); exit }
          }
          exit
        }
      ' || true
}

# Run smartctl against one disk and print whatever it produced.
# Arguments: $1 - disk name without /dev/
#            $2 - optional smartctl device type (-d value); empty = auto
# Globals:   TIMEOUT_SECS (read)
# Returns:   always 0; smartctl errors and timeouts yield partial/empty output
smart_run() {
  local name="$1"
  local dtype="${2:-}"
  local -a cmd=(smartctl -a -H -i -A -n standby)

  if [[ -n "$dtype" ]]; then
    cmd+=(-d "$dtype")
  fi
  cmd+=("/dev/$name")

  # Wrap in timeout(1) when it is available.
  if need_cmd timeout; then
    cmd=(timeout "${TIMEOUT_SECS}s" "${cmd[@]}")
  fi

  # Best effort by design: stderr is dropped and the status ignored.
  "${cmd[@]}" 2>/dev/null || true
}

# Detect if smartctl output is usable (bridge might block SMART, timeout, etc.)
# Reads smartctl output on stdin; prints OK, UNSUPPORTED or UNAVAILABLE.
smart_access_state() {
  # Heuristics: if we see the START OF INFORMATION SECTION or NVMe SMART
  # section, it's likely usable. The first matching line decides.
  awk '
    BEGIN { found = 0 }
    /=== START OF INFORMATION SECTION ===/ { print "OK"; found = 1; exit }
    /SMART\/Health Information/            { print "OK"; found = 1; exit }
    /NVMe Log 0x02/                        { print "OK"; found = 1; exit }
    /Device does not support SMART/        { print "UNSUPPORTED"; found = 1; exit }
    /Unknown USB bridge/                   { print "UNSUPPORTED"; found = 1; exit }
    END { if (!found) print "UNAVAILABLE" }
  '
}

# Reads smartctl output on stdin; prints PASSED, FAILED or UNKNOWN.
parse_health() {
  awk '
    /overall-health self-assessment test result:/ {
      if ($0 ~ /PASSED|OK/) print "PASSED"; else print "FAILED"
      found = 1
      exit
    }
    /SMART Health Status:/ {
      if ($0 ~ /OK/) print "PASSED"; else print "FAILED"
      found = 1
      exit
    }
    END { if (!found) print "UNKNOWN" }
  '
}

# Reads smartctl output on stdin; prints the first temperature it recognises
# (an integer), or nothing when no temperature line is present.
parse_temp_c() {
  awk '
    /Temperature:/ && $0 ~ /Celsius/ {
      for (i = 1; i <= NF; i++) if ($i ~ /^[0-9]+$/) { print $i; exit }
    }
    /Current Drive Temperature:/ {
      for (i = 1; i <= NF; i++) if ($i ~ /^[0-9]+$/) { print $i; exit }
    }
    /Temperature_Celsius/  { print $10; exit }
    /Temperature_Internal/ { print $10; exit }
  '
}

# Reads smartctl output on stdin; prints field 10 of the first line whose
# second field equals the attribute name in $1.
parse_ata_attr_raw() {
  local attr="$1"
  awk -v a="$attr" '$2 == a { print $10; exit }'
}

# Reads smartctl output on stdin and prints the integer value of the line
# starting with "<label>:".
# Arguments: $1 - label without the trailing colon
# Outputs:   a base-10 integer without leading zeros, or an empty line when
#            the value holds no digits; nothing when the label is absent
# Hex values ("0x08") are decoded. Simply deleting the non-digits gave "008",
# which bash arithmetic rejects as an invalid octal constant.
parse_nvme_int() {
  local label="$1"
  awk -v l="$label" '
    function hex2dec(s,    i, n, v) {
      v = 0
      s = tolower(s)
      for (i = 1; i <= length(s); i++) {
        n = index("0123456789abcdef", substr(s, i, 1))
        if (n == 0) break
        v = v * 16 + (n - 1)
      }
      return v
    }
    index($0, l ":") == 1 {
      val = substr($0, length(l) + 2)
      sub(/^[ \t]+/, "", val)
      if (val ~ /^0[xX][0-9a-fA-F]/) {
        printf "%d\n", hex2dec(substr(val, 3))
      } else {
        # Drop thousands separators, units and any other non-digit.
        gsub(/[^0-9]/, "", val)
        if (val != "") {
          sub(/^0+/, "", val)
          if (val == "") val = "0"
        }
        print val
      }
      exit
    }
  '
}

# Shared helper: for the line starting with "<label>:", print the text inside
# the first "[<number> <unit>]" group, or an empty line when there is none.
nvme_bracketed_value() {
  local label="$1"
  awk -v l="$label" '
    index($0, l ":") == 1 {
      if (match($0, /\[[0-9.,]+[[:space:]]*[A-Za-z]+\]/)) {
        print substr($0, RSTART + 1, RLENGTH - 2)
      } else {
        print ""
      }
      exit
    }
  '
}

# Bracketed size from the "Data Units Written:" line (stdin = smartctl output).
parse_nvme_bytes_units_written() { nvme_bracketed_value "Data Units Written"; }

# Bracketed size from the "Data Units Read:" line (stdin = smartctl output).
parse_nvme_bytes_units_read() { nvme_bracketed_value "Data Units Read"; }

# Print $1 as a double-quoted JSON string literal followed by a newline.
# Pure bash, so JSON mode does not depend on python3 (which the dependency
# checks never verify). Bytes >= 0x80 are passed through unchanged, i.e.
# UTF-8 input stays UTF-8 instead of being rewritten as \uXXXX.
json_escape() {
  local s="${1-}" out="" ch esc i
  local LC_ALL=C   # byte-wise indexing, independent of the caller's locale

  for (( i = 0; i < ${#s}; i++ )); do
    ch="${s:i:1}"
    case "$ch" in
      '"') out+='\"' ;;
      '\') out+='\\' ;;
      $'\b') out+='\b' ;;
      $'\f') out+='\f' ;;
      $'\n') out+='\n' ;;
      $'\r') out+='\r' ;;
      $'\t') out+='\t' ;;
      [$'\001'-$'\037'])
        # Remaining control characters have no short escape.
        printf -v esc '\\u%04x' "'$ch"
        out+="$esc"
        ;;
      *) out+="$ch" ;;
    esac
  done

  printf '"%s"\n' "$out"
}

# Determine wear percentage if available.
# Returns: "used_pct|remaining_pct|source|confidence" # - NVMe: used_pct is Percentage Used (0..100) -> REAL # - ATA: tries known life attributes; if found and looks like 0..100, treated as "remaining" (BEST EFFORT) wear_probe() { local smart_out="$1" # NVMe (reliable) local nvme_used nvme_used="$(printf "%s\n" "$smart_out" | parse_nvme_int "Percentage Used")" if [[ "$nvme_used" =~ ^[0-9]+$ ]]; then local rem=$(( 100 - nvme_used )) if (( rem < 0 )); then rem=0; fi echo "${nvme_used}|${rem}|nvme_percentage_used|real" return 0 fi # ATA (best effort; vendor-specific) # Prefer explicit "remain" semantics first. local v v="$(printf "%s\n" "$smart_out" | awk '$2=="Percent_Lifetime_Remain" {print $10; exit}')" if [[ "$v" =~ ^[0-9]+$ ]] && (( v>=0 && v<=100 )); then echo "$((100 - v))|$v|ata_percent_lifetime_remain|best_effort" return 0 fi v="$(printf "%s\n" "$smart_out" | awk '$2=="SSD_Life_Left" {print $10; exit}')" if [[ "$v" =~ ^[0-9]+$ ]] && (( v>=0 && v<=100 )); then echo "$((100 - v))|$v|ata_ssd_life_left|best_effort" return 0 fi # Media_Wearout_Indicator is often "remaining", but not guaranteed → best_effort. v="$(printf "%s\n" "$smart_out" | awk '$2=="Media_Wearout_Indicator" {print $10; exit}')" if [[ "$v" =~ ^[0-9]+$ ]] && (( v>=0 && v<=100 )); then echo "$((100 - v))|$v|ata_media_wearout_indicator_assumed_remaining|best_effort" return 0 fi echo "|||unknown|none" return 1 } # Enumerate candidate disks: TYPE=disk and ROTA=0 (SSD-ish) list_disks() { if [[ -n "$ONLY_DEV" ]]; then local dn if dn="$(normalize_to_disk_name "$ONLY_DEV")"; then echo "$dn" else echo "ERROR: --dev '$ONLY_DEV' is not a valid disk/partition path on this host." 
>&2 exit 2 fi return fi lsblk -dn -o NAME,TYPE,ROTA 2>/dev/null \ | awk '$2=="disk" {print $1, $3}' \ | awk '$2==0 {print $1}' } # Filesystem usage for mounted partitions belonging to a disk fs_usage_for_disk() { local disk="$1" lsblk -nr "/dev/$disk" -o NAME,MOUNTPOINT,FSTYPE,SIZE 2>/dev/null \ | awk 'NF>=4 && $2!="-" && $2!="" {print $1 "|" $2 "|" $3 "|" $4}' } # Severity logic severity_from_metrics() { local health="$1" local temp="${2:-}" local pct_used="${3:-}" # wear used (higher is worse) if known local realloc="${4:-0}" local pending="${5:-0}" local offline="${6:-0}" local nvme_crit_warn="${7:-0}" local media_err="${8:-0}" local sev=0 if [[ "$health" == "FAILED" ]]; then sev=2; fi if [[ "$nvme_crit_warn" =~ ^[0-9]+$ ]] && (( nvme_crit_warn > 0 )); then sev=2; fi if [[ "$realloc" =~ ^[0-9]+$ ]] && (( realloc > 0 )); then sev=2; fi if [[ "$pending" =~ ^[0-9]+$ ]] && (( pending > 0 )); then sev=2; fi if [[ "$offline" =~ ^[0-9]+$ ]] && (( offline > 0 )); then sev=2; fi if [[ "$media_err" =~ ^[0-9]+$ ]] && (( media_err > 0 )); then sev=2; fi if [[ "$pct_used" =~ ^[0-9]+$ ]]; then if (( pct_used >= 90 )); then sev=2 elif (( pct_used >= 70 )) && (( sev < 1 )); then sev=1 fi fi if [[ "$temp" =~ ^[0-9]+$ ]]; then if (( temp >= 70 )); then sev=2 elif (( temp >= 60 )) && (( sev < 1 )); then sev=1 fi fi echo "$sev" } main_text() { local overall_sev=0 echo "=== SSD/NVMe Health Report ($(date -Is)) ===" echo while read -r d; do [[ -z "$d" ]] && continue local dtype model serial size tran rota smart_out dtype="$(smart_scan_type_for "$d")" model="$(lsblk -dn -o MODEL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" serial="$(lsblk -dn -o SERIAL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" size="$(lsblk -dn -o SIZE "/dev/$d" 2>/dev/null | head -n1)" tran="$(lsblk -dn -o TRAN "/dev/$d" 2>/dev/null | head -n1)" rota="$(lsblk -dn -o ROTA "/dev/$d" 2>/dev/null | head -n1)" smart_out="$(smart_run "$d" "$dtype")" local 
smart_access smart_access="$(printf "%s\n" "$smart_out" | smart_access_state)" local health temp health="$(printf "%s\n" "$smart_out" | parse_health)" temp="$(printf "%s\n" "$smart_out" | parse_temp_c)" # NVMe fields (if present) local nvme_crit_warn media_err unsafe_shutdowns poh duw dur nvme_crit_warn="$(printf "%s\n" "$smart_out" | parse_nvme_int "Critical Warning")" media_err="$(printf "%s\n" "$smart_out" | parse_nvme_int "Media and Data Integrity Errors")" unsafe_shutdowns="$(printf "%s\n" "$smart_out" | parse_nvme_int "Unsafe Shutdowns")" poh="$(printf "%s\n" "$smart_out" | parse_nvme_int "Power On Hours")" duw="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_written)" dur="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_read)" # ATA attributes (if present) local realloc pending offline crc poh_ata realloc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Reallocated_Sector_Ct")" pending="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Current_Pending_Sector")" offline="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Offline_Uncorrectable")" crc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "UDMA_CRC_Error_Count")" poh_ata="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Power_On_Hours")" # Wear probe (preferred for "disk agotarse") local wear_used wear_rem wear_source wear_conf IFS="|" read -r wear_used wear_rem wear_source wear_conf < <(wear_probe "$smart_out" || true) # Can we diagnose wear? local wear_known=0 [[ "$wear_used" =~ ^[0-9]+$ ]] && wear_known=1 # If SMART is not accessible, we cannot diagnose properly. local diagnosable=1 if [[ "$smart_access" != "OK" ]]; then diagnosable=0 elif [[ "$wear_known" != "1" && "$health" == "UNKNOWN" ]]; then # We got some SMART text but not enough to assess wear nor health. 
diagnosable=0 fi local sev sev="$(severity_from_metrics "$health" "$temp" "${wear_used:-}" "${realloc:-0}" "${pending:-0}" "${offline:-0}" "${nvme_crit_warn:-0}" "${media_err:-0}")" # If not diagnosable, force WARNING (so it gets noticed). if [[ "$diagnosable" == "0" && "$sev" -lt 1 ]]; then sev=1 fi local status if [[ "$sev" == "2" ]]; then status="CRITICAL" elif [[ "$diagnosable" == "0" ]]; then status="INCONCLUSIVE" elif [[ "$sev" == "1" ]]; then status="WARNING" else status="OK" fi (( sev > overall_sev )) && overall_sev="$sev" echo "Device: /dev/$d ($status)" echo " Model: ${model:-?} Serial: ${serial:-?} Size: ${size:-?} TRAN: ${tran:-?} ROTA: ${rota:-?} smartctl -d: ${dtype:-auto}" echo " SMART access: $smart_access SMART health: $health Temp(C): ${temp:-?}" if [[ "$wear_known" == "1" ]]; then echo " Wear: used=${wear_used}% remaining=${wear_rem}% source=${wear_source} confidence=${wear_conf}" else echo " Wear: (cannot determine) — no usable wear metric exposed (common on some USB bridges / drives)" fi if [[ -n "${nvme_crit_warn:-}" || -n "${media_err:-}" || -n "${unsafe_shutdowns:-}" || -n "${poh:-}" ]]; then echo " NVMe: crit_warn=${nvme_crit_warn:-?} media_err=${media_err:-?} unsafe_shutdowns=${unsafe_shutdowns:-?} power_on_hours=${poh:-?}" [[ -n "${duw:-}" ]] && echo " NVMe: data_units_written~${duw}" [[ -n "${dur:-}" ]] && echo " NVMe: data_units_read~${dur}" fi if [[ -n "${realloc:-}" || -n "${pending:-}" || -n "${offline:-}" || -n "${crc:-}" ]]; then echo " ATA: realloc=${realloc:-?} pending=${pending:-?} offline_uncorrectable=${offline:-?} crc_errors=${crc:-?} power_on_hours=${poh_ata:-?}" fi if [[ "$diagnosable" == "0" ]]; then echo " DIAGNOSTIC NOTE: Insufficient SMART data to give a reliable wear/health diagnosis for this device." 
fi echo " Filesystems:" local fsline foundfs=0 while IFS= read -r fsline; do [[ -z "$fsline" ]] && continue foundfs=1 local pname mnt fstype psize pname="${fsline%%|*}"; fsline="${fsline#*|}" mnt="${fsline%%|*}"; fsline="${fsline#*|}" fstype="${fsline%%|*}"; psize="${fsline#*|}" local dfline dfline="$(df -P "$mnt" 2>/dev/null | awk 'NR==2{print $2,$3,$4,$5}')" echo " /dev/$pname mnt=$mnt fstype=$fstype part_size=$psize df(total used avail use%)=($dfline)" done < <(fs_usage_for_disk "$d" || true) [[ "$foundfs" == "0" ]] && echo " (no mounted partitions found)" if [[ "$DETAIL" == "1" ]]; then echo " --- smartctl raw (trimmed) ---" printf "%s\n" "$smart_out" | sed -n '1,140p' echo " --- end ---" fi echo done < <(list_disks) if [[ "$overall_sev" == "2" ]]; then echo "Overall: CRITICAL" exit 2 elif [[ "$overall_sev" == "1" ]]; then echo "Overall: WARNING/INCONCLUSIVE" exit 1 else echo "Overall: OK" exit 0 fi } main_json() { local overall_sev=0 local first=1 echo "{" echo " \"timestamp\": \"$(date -Is)\"," echo " \"devices\": [" while read -r d; do [[ -z "$d" ]] && continue local dtype model serial size tran rota smart_out dtype="$(smart_scan_type_for "$d")" model="$(lsblk -dn -o MODEL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" serial="$(lsblk -dn -o SERIAL "/dev/$d" 2>/dev/null | sed 's/[[:space:]]\+/ /g' | sed 's/^ //;s/ $//')" size="$(lsblk -dn -o SIZE "/dev/$d" 2>/dev/null | head -n1)" tran="$(lsblk -dn -o TRAN "/dev/$d" 2>/dev/null | head -n1)" rota="$(lsblk -dn -o ROTA "/dev/$d" 2>/dev/null | head -n1)" smart_out="$(smart_run "$d" "$dtype")" local smart_access smart_access="$(printf "%s\n" "$smart_out" | smart_access_state)" local health temp health="$(printf "%s\n" "$smart_out" | parse_health)" temp="$(printf "%s\n" "$smart_out" | parse_temp_c)" local nvme_crit_warn media_err unsafe_shutdowns poh duw dur nvme_crit_warn="$(printf "%s\n" "$smart_out" | parse_nvme_int "Critical Warning")" media_err="$(printf "%s\n" "$smart_out" | 
parse_nvme_int "Media and Data Integrity Errors")" unsafe_shutdowns="$(printf "%s\n" "$smart_out" | parse_nvme_int "Unsafe Shutdowns")" poh="$(printf "%s\n" "$smart_out" | parse_nvme_int "Power On Hours")" duw="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_written)" dur="$(printf "%s\n" "$smart_out" | parse_nvme_bytes_units_read)" local realloc pending offline crc poh_ata realloc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Reallocated_Sector_Ct")" pending="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Current_Pending_Sector")" offline="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Offline_Uncorrectable")" crc="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "UDMA_CRC_Error_Count")" poh_ata="$(printf "%s\n" "$smart_out" | parse_ata_attr_raw "Power_On_Hours")" local wear_used wear_rem wear_source wear_conf IFS="|" read -r wear_used wear_rem wear_source wear_conf < <(wear_probe "$smart_out" || true) local wear_known=0 [[ "$wear_used" =~ ^[0-9]+$ ]] && wear_known=1 local diagnosable=1 if [[ "$smart_access" != "OK" ]]; then diagnosable=0 elif [[ "$wear_known" != "1" && "$health" == "UNKNOWN" ]]; then diagnosable=0 fi local sev sev="$(severity_from_metrics "$health" "$temp" "${wear_used:-}" "${realloc:-0}" "${pending:-0}" "${offline:-0}" "${nvme_crit_warn:-0}" "${media_err:-0}")" if [[ "$diagnosable" == "0" && "$sev" -lt 1 ]]; then sev=1 fi (( sev > overall_sev )) && overall_sev="$sev" [[ "$first" == "1" ]] || echo " ," first=0 # Filesystems local fs_items="" local fsline while IFS= read -r fsline; do [[ -z "$fsline" ]] && continue local pname mnt fstype psize pname="${fsline%%|*}"; fsline="${fsline#*|}" mnt="${fsline%%|*}"; fsline="${fsline#*|}" fstype="${fsline%%|*}"; psize="${fsline#*|}" local df_total df_used df_avail df_usep read -r df_total df_used df_avail df_usep < <(df -P "$mnt" 2>/dev/null | awk 'NR==2{print $2,$3,$4,$5}') fs_items+="{\"partition\":\"/dev/$pname\",\"mount\":$(json_escape "$mnt"),\"fstype\":$(json_escape 
"$fstype"),\"part_size\":$(json_escape "$psize"),\"df\":{\"total\":$(json_escape "${df_total:-}"),\"used\":$(json_escape "${df_used:-}"),\"avail\":$(json_escape "${df_avail:-}"),\"use_pct\":$(json_escape "${df_usep:-}")}}," done < <(fs_usage_for_disk "$d" || true) local fs_json="[]" [[ -n "$fs_items" ]] && fs_json="[${fs_items%,}]" cat <