{"id":2102,"date":"2025-10-09T06:21:44","date_gmt":"2025-10-09T06:21:44","guid":{"rendered":"https:\/\/www.nicktailor.com\/?p=2102"},"modified":"2025-10-09T06:23:42","modified_gmt":"2025-10-09T06:23:42","slug":"slurm-job-cluster-sampler-diagnostics-one-click","status":"publish","type":"post","link":"https:\/\/nicktailor.com\/tech-blog\/slurm-job-cluster-sampler-diagnostics-one-click\/","title":{"rendered":"Slurm Job: Cluster Sampler &#038; Diagnostics (One-Click)"},"content":{"rendered":"\n<div style=\"font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; line-height:1.6; color:#0f172a;\">\n  <p style=\"margin:0 0 1.2em;\">This job collects GPU\/CPU, memory, NUMA, PCIe\/NVLink, NIC\/IB, and optional Nsight\/NCCL\/iperf3 telemetry across all allocated nodes while your workload runs, then bundles everything into a single <code>.tgz<\/code>.<\/p>\n\n  <style>\n    .pp-note{background:#f8fafc;border:1px solid #e2e8f0;padding:0.8em 1em;border-radius:8px;margin:1em 0}\n    .pp-kbd{background:#e2e8f0;border-radius:4px;padding:0.05em 0.35em;font-family:ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;font-size:0.9em}\n    .pp-codewrap{position:relative;margin:1em 0}\n    .pp-copy{position:absolute;top:10px;right:10px;border:1px solid #64748b;background:#0f172a;color:#fff;border-radius:8px;padding:8px 10px;cursor:pointer;font-size:0.85em}\n    .pp-textarea{width:100%;min-height:520px;background:#0b1220;color:#e5e7eb;border-radius:10px;border:1px solid #0f172a33;padding:12px;font-family:ui-monospace, SFMono-Regular, Menlo, Consolas, \"Liberation Mono\", monospace;font-size:0.9em;white-space:pre}\n    .pp-small{color:#475569;font-size:0.95em}\n  <\/style>\n\n  <div class=\"pp-note\">\n    <strong>Usage<\/strong>: Save as <span class=\"pp-kbd\">profile_env.slurm<\/span> and submit:<br \/>\n    <code>sbatch --export=ALL,WORKLOAD=\"torchrun --nproc_per_node=8 train.py --cfg config.yaml\",ENABLE_NSYS=1,RUN_NCCL_TESTS=1,DURATION=1800 profile_env.slurm<\/code>\n  <\/div>\n
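\n  <div class=\"pp-note\">\n    <strong>More overrides<\/strong>: any tunable defined at the top of the script can be set the same way via <code>--export<\/code>. A sketch of a fuller submission, assuming a reachable iperf3 server and a locally built nccl-tests binary (the address and paths below are placeholders, not defaults):<br \/>\n    <code>sbatch --export=ALL,WORKLOAD=\"torchrun --nproc_per_node=8 train.py --cfg config.yaml\",ENABLE_NSYS=1,NSYS_SECONDS=60,RUN_NCCL_TESTS=1,NCCL_TEST_BIN=\/opt\/nccl-tests\/build\/all_reduce_perf,IPERF_SERVER=10.0.0.10,OUTROOT=\/scratch\/$USER profile_env.slurm<\/code>\n  <\/div>\n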
\n  <div class=\"pp-codewrap\">\n    <button class=\"pp-copy\" onclick=\"(function(btn){try{var ta=btn.parentNode.querySelector('textarea');ta.select();document.execCommand('copy');btn.textContent='Copied!';setTimeout(function(){btn.textContent='Copy';},1500);}catch(e){btn.textContent='Select &#038; copy \u2193';}})(this)\">Copy<\/button>\n    <textarea class=\"pp-textarea\" readonly>\n#!\/usr\/bin\/env bash\n#\n# profile_env.slurm \u2014 cluster-wide performance sampler & diagnostics\n#\n#SBATCH -J prof-playbook\n#SBATCH -o prof-%x-%j.out\n#SBATCH -e prof-%x-%j.err\n#SBATCH --time=01:00:00\n#SBATCH --nodes=1\n#SBATCH --ntasks-per-node=1\n#SBATCH --gres=gpu:1\n#SBATCH --cpus-per-task=8\n## Uncomment\/adjust for your site:\n## #SBATCH --partition=gpu\n## #SBATCH --qos=normal\n\nset -euo pipefail\n\n#############################\n# Tunables (override via: sbatch --export=ALL,WORKLOAD=\"python train.py\",DURATION=900 ...)\n#############################\nWORKLOAD=\"${WORKLOAD:-}\"                 # e.g., \"python train.py\"; if empty, uses a tiny CUDA sample\nDURATION=\"${DURATION:-600}\"              # seconds to sample (upper bound)\nSAMPLE_INT=\"${SAMPLE_INT:-1}\"            # sampler interval (seconds)\nENABLE_NSYS=\"${ENABLE_NSYS:-0}\"          # 1 to record short Nsight Systems traces per node\nNSYS_SECONDS=\"${NSYS_SECONDS:-45}\"       # Nsight trace duration\nRUN_NCCL_TESTS=\"${RUN_NCCL_TESTS:-0}\"    # 1 to run nccl-tests all_reduce_perf\nNCCL_TEST_BIN=\"${NCCL_TEST_BIN:-all_reduce_perf}\"  # path or in $PATH\nIPERF_SERVER=\"${IPERF_SERVER:-}\"         # host\/IP to test NIC TCP throughput (iperf3 server required)\nOUTROOT=\"${OUTROOT:-$PWD}\"\nTAG=\"${TAG:-$(date +%Y%m%d-%H%M%S)}\"\nOUTDIR=\"$OUTROOT\/prof-${SLURM_JOB_ID:-nojob}-$TAG\"\n\nmkdir -p \"$OUTDIR\"\n\necho \"==[JOB]==========================================================\"\necho \" JOBID        : ${SLURM_JOB_ID:-local}\"\necho \" NNODES\/NTASKS: ${SLURM_NNODES:-1} \/ ${SLURM_NTASKS:-1}\"\necho \" GPUS         : ${SLURM_GPUS:-unknown} (CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset})\"\necho \" WORKLOAD     : ${WORKLOAD:-<built-in demo>}\"\necho \" DURATION     : $DURATION  s\"\necho \" SAMPLE_INT   : $SAMPLE_INT s\"\necho \" ENABLE_NSYS  : $ENABLE_NSYS  (for $NSYS_SECONDS s)\"\necho \" RUN_NCCL     : $RUN_NCCL_TESTS\"\necho \" IPERF_SERVER : ${IPERF_SERVER:-<none>}\"\necho \" OUTDIR       : $OUTDIR\"\necho \"==================================================================\"\n\n# Helper to run a command on every allocated node\nnode_run() { local cmd=\"$1\"; srun --ntasks-per-node=1 --nodes=\"${SLURM_NNODES:-1}\" --label bash -lc \"$cmd\"; }\n\n# -------- Per-node init: inventory & topology --------\nnode_run 'bash -lc \"\nset -euo pipefail\nNDIR=\\\"'$OUTDIR'\/\\${HOSTNAME}\\\"\nmkdir -p \\\"\\$NDIR\\\"\n{\n  echo \\\"# Slurm\/Env\\\"\n  env | egrep \\\"SLURM|CUDA|NCCL\\\" || true\n  echo\n  echo \\\"# System\\\"\n  uname -a\n  lsb_release -a 2>\/dev\/null || cat \/etc\/os-release || true\n  date -Iseconds\n  echo\n  echo \\\"# CPU\/NUMA\\\"\n  lscpu || true\n  numactl --hardware || true\n} > \\\"\\$NDIR\/env.txt\\\"\n\nif command -v nvidia-smi >\/dev\/null 2>&1; then\n  nvidia-smi -L > \\\"\\$NDIR\/gpu_list.txt\\\" || true\n  nvidia-smi topo -m > \\\"\\$NDIR\/gpu_topo.txt\\\" || true\n  nvidia-smi -q -x > \\\"\\$NDIR\/nvidia_smi.xml\\\" || nvidia-smi -q > \\\"\\$NDIR\/nvidia_smi.txt\\\" || true\n  nvidia-smi pmon -c 1 -s um > \\\"\\$NDIR\/pmon_header.txt\\\" || true\nfi\n\n(lspci -nn | egrep -i \\\"nvidia|mellanox|ethernet|infiniband|network\\\" || true) > \\\"\\$NDIR\/lspci.txt\\\"\nif command -v ibstat >\/dev\/null 2>&1; then ibstat > \\\"\\$NDIR\/ibstat.txt\\\" || true; fi\nif command -v ibv_devinfo >\/dev\/null 2>&1; then ibv_devinfo > \\\"\\$NDIR\/ibv_devinfo.txt\\\" || true; fi\n\nip -br link show up | awk '\\''\\$1 != \\\"lo\\\" {print \\$1}'\\'' > \\\"\\$NDIR\/ifaces.txt\\\" || true\nwhile read -r IFACE; do\n  (ethtool \\\"\\$IFACE\\\" && ethtool -k \\\"\\$IFACE\\\" && ethtool -S \\\"\\$IFACE\\\" | egrep -i \\\"err|drop|disc|pause|fcs|crc\\\" || true) > \\\"\\$NDIR\/ethtool_\\${IFACE}.txt\\\" 2>&1 || true\ndone < \\\"\\$NDIR\/ifaces.txt\\\"\n\"'\n
\n# -------- Start background samplers on each node --------\nnode_run 'bash -lc \"\nset -euo pipefail\nNDIR=\\\"'$OUTDIR'\/\\${HOSTNAME}\\\"\nmkdir -p \\\"\\$NDIR\\\"\necho $$ > \\\"\\$NDIR\/sampler_parent.pid\\\"\n\nif command -v nvidia-smi >\/dev\/null 2>&1; then\n  (nvidia-smi dmon -s pucvmet -d '$SAMPLE_INT' > \\\"\\$NDIR\/gpu_dmon.log\\\") &\n  echo $! > \\\"\\$NDIR\/gpu_dmon.pid\\\"\n  ( (nvidia-smi nvlink -s; while true; do nvidia-smi nvlink -s; sleep 10; done) > \\\"\\$NDIR\/nvlink_watch.log\\\" 2>&1 ) &\n  echo $! > \\\"\\$NDIR\/nvlink_watch.pid\\\"\nfi\n\n(command -v mpstat >\/dev\/null 2>&1 && mpstat -P ALL '$SAMPLE_INT' > \\\"\\$NDIR\/mpstat.log\\\") & echo $! > \\\"\\$NDIR\/mpstat.pid\\\" || true\n(command -v pidstat >\/dev\/null 2>&1 && pidstat -u -r -d '$SAMPLE_INT' > \\\"\\$NDIR\/pidstat.log\\\") & echo $! > \\\"\\$NDIR\/pidstat.pid\\\" || true\n(command -v vmstat  >\/dev\/null 2>&1 && vmstat '$SAMPLE_INT' > \\\"\\$NDIR\/vmstat.log\\\")  & echo $! > \\\"\\$NDIR\/vmstat.pid\\\"  || true\n(command -v iostat  >\/dev\/null 2>&1 && iostat -xy '$SAMPLE_INT' > \\\"\\$NDIR\/iostat.log\\\")  & echo $! > \\\"\\$NDIR\/iostat.pid\\\"  || true\n\nif command -v dcgmi >\/dev\/null 2>&1; then\n  (dcgmi dmon -e 100 -d '$SAMPLE_INT' > \\\"\\$NDIR\/dcgm_dmon.log\\\") &\n  echo $! > \\\"\\$NDIR\/dcgm_dmon.pid\\\"\nfi\n\nif command -v numastat >\/dev\/null 2>&1; then\n  (while true; do\n     for P in $(pgrep -f -n python || true); do numastat -p \\$P; done\n     sleep 10\n   done > \\\"\\$NDIR\/numastat_watch.log\\\" 2>&1) &\n  echo $! > \\\"\\$NDIR\/numastat_watch.pid\\\"\nfi\n\"'\n\ncleanup() {\n  node_run 'bash -lc \"\n    set -euo pipefail\n    NDIR=\\\"'$OUTDIR'\/\\${HOSTNAME}\\\"\n    for f in gpu_dmon pidstat mpstat vmstat iostat dcgm_dmon nvlink_watch numastat_watch; do\n      if [[ -f \\\"\\$NDIR\/\\${f}.pid\\\" ]]; then kill \\$(cat \\\"\\$NDIR\/\\${f}.pid\\\") 2>\/dev\/null || true; fi\n    done\n  \"' || true\n}\ntrap cleanup EXIT\n\n# -------- Optional Nsight Systems short trace --------\nif [[ \"$ENABLE_NSYS\" -eq 1 ]] && command -v nsys >\/dev\/null 2>&1; then\n  node_run 'bash -lc \"\n    set -euo pipefail\n    NDIR=\\\"'$OUTDIR'\/\\${HOSTNAME}\\\"\n    mkdir -p \\\"\\$NDIR\\\"\n    nsys profile -t cuda,osrt,nvtx -o \\\"\\$NDIR\/nsys_\\${HOSTNAME}\\\" --duration '$NSYS_SECONDS' --stop-on-exit true --capture-range=none sleep '$NSYS_SECONDS'\n  \"'\nfi\n\n# -------- Run workload (or a tiny CUDA demo) --------\necho \"== Running workload ==\"\nif [[ -z \"$WORKLOAD\" ]]; then\n  node_run 'bash -lc \"\n    set -euo pipefail\n    NDIR=\\\"'$OUTDIR'\/\\${HOSTNAME}\\\"\n    echo \\\"No WORKLOAD provided; running a small CUDA loop...\\\" | tee -a \\\"\\$NDIR\/workload.log\\\"\n    python - <<'PY' || sleep 30\nimport torch, time\nif torch.cuda.is_available():\n    a=torch.randn((8192,8192),device='cuda'); b=torch.randn((8192,8192),device='cuda')\n    for i in range(50):\n        c=a@b; torch.cuda.synchronize()\n        time.sleep(0.1)\nelse:\n    time.sleep(30)\nPY\n  \"'\nelse\n  RUNWRK=$(printf '%q ' $WORKLOAD)\n  node_run \"bash -lc 'set -euo pipefail; NDIR=$OUTDIR\/\\${HOSTNAME}; echo Running: $RUNWRK | tee -a \\\"\\$NDIR\/workload.log\\\"; $RUNWRK |& tee -a \\\"\\$NDIR\/workload.log\\\"'\"\nfi\necho \"== Workload section complete ==\"\n
\n# -------- Optional network & NCCL checks --------\nif [[ -n \"$IPERF_SERVER\" ]] && command -v iperf3 >\/dev\/null 2>&1; then\n  node_run \"iperf3 -c $IPERF_SERVER -P 8 -t 30 | tee \\\"$OUTDIR\/\\${HOSTNAME}\/iperf3_\\${HOSTNAME}.log\\\"\"\nfi\n\nif [[ \"$RUN_NCCL_TESTS\" -eq 1 ]]; then\n  node_run \"bash -lc 'set -euo pipefail; NDIR=$OUTDIR\/\\${HOSTNAME}; if command -v $NCCL_TEST_BIN >\/dev\/null 2>&1; then $NCCL_TEST_BIN -b 8M -e 512M -f 2 -g \\${SLURM_GPUS_PER_NODE:-1} | tee \\\"\\$NDIR\/nccl_all_reduce.log\\\"; else echo \\\"$NCCL_TEST_BIN not found\\\" | tee \\\"\\$NDIR\/nccl_all_reduce.log\\\"; fi'\"\nfi\n\n# -------- Final snapshots & packaging --------\nnode_run 'bash -lc \"\nset -euo pipefail\nNDIR=\\\"'$OUTDIR'\/\\${HOSTNAME}\\\"\nif command -v nvidia-smi >\/dev\/null 2>&1; then\n  nvidia-smi --query-gpu=index,utilization.gpu,utilization.memory,clocks.sm,clocks.mem,power.draw,temperature.gpu --format=csv -l 1 -f \\\"\\$NDIR\/nvidia_smi_final.csv\\\" -c 3 || true\nfi\nfree -h > \\\"\\$NDIR\/free.txt\\\" || true\ndf -h   > \\\"\\$NDIR\/df.txt\\\" || true\n{\n  echo \\\"=== tail gpu_dmon ===\\\"; tail -n 30 \\\"\\$NDIR\/gpu_dmon.log\\\" 2>\/dev\/null || true\n  echo\n  echo \\\"=== tail pidstat ===\\\"; tail -n 30 \\\"\\$NDIR\/pidstat.log\\\" 2>\/dev\/null || true\n} > \\\"\\$NDIR\/quick_summary.txt\\\" || true\n\"'\n\ncleanup\n\ntar -C \"$OUTROOT\" -czf \"$OUTDIR.tgz\" \"$(basename \"$OUTDIR\")\"\necho \"Artifacts packaged at: $OUTDIR.tgz\"\necho \"Per-node logs under   : $OUTDIR\/<hostname>\/\"\necho \"Done.\"\n    <\/textarea>\n  <\/div>\n
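\n  <div class=\"pp-note\">\n    <strong>Reading the results<\/strong>: the bundle is written under <code>OUTROOT<\/code> (default: the submission directory) as <code>prof-&lt;jobid&gt;-&lt;tag&gt;.tgz<\/code>, with one sub-directory per node. A minimal way to unpack and skim it, assuming the naming produced by the script above (substitute your own job ID, tag, and hostname):<br \/>\n    <code>tar -xzf prof-&lt;jobid&gt;-&lt;tag&gt;.tgz<\/code><br \/>\n    <code>less prof-&lt;jobid&gt;-&lt;tag&gt;\/&lt;hostname&gt;\/quick_summary.txt<\/code><br \/>\n    <code>column -t prof-&lt;jobid&gt;-&lt;tag&gt;\/&lt;hostname&gt;\/gpu_dmon.log | less -S<\/code><br \/>\n    Any <code>nsys_&lt;hostname&gt;.nsys-rep<\/code> files open in the Nsight Systems GUI, or can be summarised on the command line with <code>nsys stats<\/code>.\n  <\/div>\n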
\n<\/div>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>This job collects GPU\/CPU, memory, NUMA, PCIe\/NVLink, NIC\/IB, and optional Nsight\/NCCL\/iperf3 telemetry across all allocated nodes while your workload runs, then bundles everything into a single .tgz. Usage: Save as profile_env.slurm and submit: sbatch --export=ALL,WORKLOAD=\"torchrun --nproc_per_node=8 train.py --cfg config.yaml\",ENABLE_NSYS=1,RUN_NCCL_TESTS=1,DURATION=1800 profile_env.slurm Copy #!\/usr\/bin\/env bash # # profile_env.slurm \u2014 cluster-wide performance sampler &#038; diagnostics # #SBATCH -J prof-playbook #SBATCH -o prof-%x-%j.out #SBATCH<a href=\"https:\/\/nicktailor.com\/tech-blog\/slurm-job-cluster-sampler-diagnostics-one-click\/\" class=\"read-more\">Read More &#8230;<\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[143],"tags":[],"class_list":["post-2102","post","type-post","status-publish","format-standard","hentry","category-hpc"],"_links":{"self":[{"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/posts\/2102","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/comments?post=2102"}],"version-history":[{"count":2,"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/posts\/2102\/revisions"}],"predecessor-version":[{"id":2104,"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/posts\/2102\/revisions\/2104"}],"wp:attachment":[{"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/media?parent=2102"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/categories?post=2102"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/nicktailor.com\/tech-blog\/wp-json\/wp\/v2\/tags?post=2102"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}