# Accelerator detection for init/eessi_archdetect.sh: accelpath() and its
# vendor-specific helpers nvidia_accelpath() and amd_accelpath().
#
# All three functions report through log(), which is defined elsewhere in
# the script, and print the detected target on stdout.

#######################################
# Detect an NVIDIA GPU via the nvidia-smi command.
# Outputs:
#   "accel/nvidia/cc<NN>" on stdout, derived from the compute capability of
#   the first GPU that nvidia-smi lists (e.g. 8.0 -> cc80).
# Returns:
#   0 on success
#   2 if nvidia-smi is not found in PATH
#   3 if nvidia-smi failed or its output could not be parsed; in that case
#     the command output is kept in a file under /tmp for inspection
#######################################
nvidia_accelpath() {
    local nvidia_smi
    if ! nvidia_smi=$(command -v nvidia-smi); then
        log "DEBUG" "nvidia_accelpath: nvidia-smi command not found"
        return 2
    fi
    log "DEBUG" "nvidia_accelpath: nvidia-smi command found @ ${nvidia_smi}"

    local nvidia_smi_out
    if ! nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX); then
        log "DEBUG" "nvidia_accelpath: could not create temporary file for nvidia-smi output"
        return 3
    fi

    # stdout is captured for parsing; stderr goes to the temp file so that
    # warnings can neither end up in our own stdout (which the caller
    # captures) nor be mistaken for the first data line.
    local nvidia_smi_stdout
    if ! nvidia_smi_stdout=$(nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2> "$nvidia_smi_out"); then
        printf '%s\n' "$nvidia_smi_stdout" >> "$nvidia_smi_out"
        log "DEBUG" "nvidia_accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
        return 3
    fi

    # Only the first GPU is considered. compute_cap is the last field that
    # was requested, so take everything after the last comma and drop
    # whitespace and the dot.
    local nvidia_smi_info=${nvidia_smi_stdout%%$'\n'*}
    local cuda_cc=${nvidia_smi_info##*,}
    cuda_cc=${cuda_cc//[[:space:].]/}

    if [[ ! "$cuda_cc" =~ ^[0-9]+$ ]]; then
        printf '%s\n' "$nvidia_smi_stdout" >> "$nvidia_smi_out"
        log "DEBUG" "nvidia_accelpath: could not derive CUDA compute capability from nvidia-smi output, see output in $nvidia_smi_out"
        return 3
    fi

    log "DEBUG" "nvidia_accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"
    echo "accel/nvidia/cc${cuda_cc}"
    rm -f -- "$nvidia_smi_out"
    return 0
}

#######################################
# Detect an AMD GPU, first via the KFD sysfs topology, then via amd-smi.
# The KFD logic is ported from
# https://github.com/llvm/llvm-project/blob/6e738e187055bbd33b6c3d203b6b55904dfcb624/clang/tools/offload-arch/AMDGPUArchByKFD.cpp
# Arguments:
#   $1 - (optional) directory holding the KFD topology nodes; defaults to
#        /sys/devices/virtual/kfd/kfd/topology/nodes
# Outputs:
#   "accel/amd/gfx<...>" on stdout for the GPU node with the lowest node ID
#   (KFD), or for the first TARGET_GRAPHICS_VERSION line (amd-smi).
# Returns:
#   0 on success
#   2 if no GPU was found via KFD and amd-smi is not found in PATH
#   3 if amd-smi failed or its output could not be parsed; in that case
#     the command output is kept in a file under /tmp for inspection
#######################################
amd_accelpath() {
    # Method 1: KFD sysfs interface (no amd-smi or Python required)
    local kfd_nodes="${1:-/sys/devices/virtual/kfd/kfd/topology/nodes}"
    local amdgcn_cc=""

    if [[ -d "$kfd_nodes" ]]; then
        log "DEBUG" "amd_accelpath: KFD sysfs path found @ ${kfd_nodes}"

        local prop_file node key value gfx_ver
        local best_node="" best_ver=""
        for prop_file in "$kfd_nodes"/*/properties; do
            [[ -f "$prop_file" ]] || continue
            node=${prop_file%/properties}
            node=${node##*/}
            # Nodes are compared numerically below, so skip anything that
            # is not a plain integer ID.
            [[ "$node" =~ ^[0-9]+$ ]] || continue

            # Each properties line is "<key> <value>"; read errors are
            # suppressed on purpose and simply leave gfx_ver empty.
            gfx_ver=""
            {
                while read -r key value _ || [[ -n "$key" ]]; do
                    if [[ "$key" == "gfx_target_version" ]]; then
                        gfx_ver=$value
                        break
                    fi
                done < "$prop_file"
            } 2>/dev/null

            [[ "$gfx_ver" =~ ^[0-9]+$ ]] || continue
            # Force base 10 so a leading zero is not taken as octal.
            gfx_ver=$(( 10#$gfx_ver ))
            # A value of 0 means the node is a CPU, not a GPU.
            (( gfx_ver > 0 )) || continue

            # Glob order is lexical (0, 1, 10, 2, ...); keep the GPU node
            # with the numerically smallest ID instead.
            if [[ -z "$best_node" ]] || (( 10#$node < 10#$best_node )); then
                best_node=$node
                best_ver=$gfx_ver
            fi
        done

        if [[ -n "$best_node" ]]; then
            # Same decomposition as AMDGPUArchByKFD.cpp; the stepping is
            # printed in hex (e.g. 90010 -> 9, 0, 10 -> gfx90a).
            local major=$(( (best_ver / 10000) % 100 ))
            local minor=$(( (best_ver / 100) % 100 ))
            local step=$(( best_ver % 100 ))
            printf -v amdgcn_cc 'gfx%d%d%x' "$major" "$minor" "$step"
            log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from KFD node ${best_node}"
            echo "accel/amd/${amdgcn_cc}"
            return 0
        fi
        log "DEBUG" "amd_accelpath: KFD topology found, but no AMD GPUs detected. Falling back to amd-smi."
    else
        log "DEBUG" "amd_accelpath: KFD sysfs path not found. Falling back to amd-smi."
    fi

    # Method 2: fall back to the amd-smi command
    local amd_smi
    if ! amd_smi=$(command -v amd-smi); then
        log "DEBUG" "amd_accelpath: amd-smi command not found"
        return 2
    fi
    log "DEBUG" "amd_accelpath: amd-smi command found @ ${amd_smi}"

    local amd_smi_out
    if ! amd_smi_out=$(mktemp -p /tmp amd_smi_out.XXXXX); then
        log "DEBUG" "amd_accelpath: could not create temporary file for amd-smi output"
        return 3
    fi

    # Check the status of amd-smi itself (not of a downstream filter) and
    # keep its stderr away from our stdout.
    local amd_smi_stdout
    if ! amd_smi_stdout=$(amd-smi static --asic 2> "$amd_smi_out"); then
        printf '%s\n' "$amd_smi_stdout" >> "$amd_smi_out"
        log "DEBUG" "amd_accelpath: amd-smi command failed, see output in $amd_smi_out"
        return 3
    fi

    # Use the first line that mentions TARGET_GRAPHICS_VERSION.
    local line amd_smi_info=""
    while IFS= read -r line; do
        if [[ "$line" == *TARGET_GRAPHICS_VERSION* ]]; then
            amd_smi_info=$line
            break
        fi
    done <<< "$amd_smi_stdout"

    # The value is whatever follows the last ": " on that line.
    amdgcn_cc=${amd_smi_info##*: }
    amdgcn_cc=${amdgcn_cc//[[:space:]]/}

    if [[ ! "$amdgcn_cc" =~ ^gfx[0-9a-f]+$ ]]; then
        printf '%s\n' "$amd_smi_stdout" >> "$amd_smi_out"
        log "DEBUG" "amd_accelpath: could not derive AMDGCN compute capability from amd-smi output, see output in $amd_smi_out"
        return 3
    fi

    log "DEBUG" "amd_accelpath: AMDGCN compute capability '${amdgcn_cc}' derived from amd-smi output '${amd_smi_info}'"
    echo "accel/amd/${amdgcn_cc}"
    rm -f -- "$amd_smi_out"
    return 0
}

#######################################
# Print the accelerator target for this system.
# Globals:
#   EESSI_ACCELERATOR_TARGET_OVERRIDE (read) - if non-empty it is used
#     instead of auto-detection and must match accel/nvidia/cc[0-9]+ or
#     accel/amd/gfx[0-9a-f]+
# Outputs:
#   The accelerator target (e.g. "accel/nvidia/cc80") on stdout.
# Returns:
#   0 on success
#   1 if the override value is malformed
#   Exits (not returns) with status 2 if no supported accelerator is found.
#######################################
accelpath() {
    # If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it
    log "DEBUG" "accelpath: Override variable set as '$EESSI_ACCELERATOR_TARGET_OVERRIDE' "
    # Quoted on purpose: an unquoted test breaks on values containing
    # whitespace, which would make a malformed override go unnoticed.
    if [[ -n "$EESSI_ACCELERATOR_TARGET_OVERRIDE" ]]; then
        if [[ "$EESSI_ACCELERATOR_TARGET_OVERRIDE" =~ ^accel/(nvidia/cc[0-9]+|amd/gfx[0-9a-f]+)$ ]]; then
            echo "$EESSI_ACCELERATOR_TARGET_OVERRIDE"
            return 0
        else
            log "ERROR" "Value of \$EESSI_ACCELERATOR_TARGET_OVERRIDE should match 'accel/nvidia/cc[0-9]+' or 'accel/amd/gfx[0-9a-f]+', but it does not: '$EESSI_ACCELERATOR_TARGET_OVERRIDE'"
            return 1
        fi
    fi

    # 1. Check for NVIDIA GPUs
    local nv_res
    if nv_res=$(nvidia_accelpath); then
        log "DEBUG" "accelpath: result: ${nv_res}"
        echo "$nv_res"
        return 0
    fi

    # 2. Check for AMD GPUs
    local amd_res
    if amd_res=$(amd_accelpath); then
        log "DEBUG" "accelpath: result: ${amd_res}"
        echo "$amd_res"
        return 0
    fi

    # 3. Neither vendor was detected. This deliberately exits rather than
    #    returns, as the code did before the helpers were split out.
    log "DEBUG" "accelpath: No supported accelerators found on this system."
    exit 2
}