#!/bin/bash
#
# Summarize the current user's Slurm jobs across several clusters.
#
# For every host in SERVERS the script runs `squeue` remotely over ssh and
# prints a table with the total, pending and running job counts per cluster,
# followed by a grand total. "Running" is derived as total minus pending, so
# jobs in any other non-pending state are counted as running.
#
# Usage: ./cluster_status.sh

# List of clusters (Host names defined in ~/.ssh/config)
SERVERS=("Narval" "Beluga" "Nibi" "Mila")

# Command executed on each cluster. `bash --login` loads the PATH/module init;
# the \$ escapes keep $USER and the command substitutions from expanding
# locally, so they are evaluated on the remote host.
REMOTE_CMD="bash --login -c 'tot=\$(squeue -h -u \$USER | wc -l); pend=\$(squeue -h -u \$USER -t PD | wc -l); echo __COUNTS__:\$tot:\$pend'"

TABLE_RULE="---------------------------------------------------------------"

#######################################
# Print one table row with the fixed column widths.
# Arguments: $1 - server, $2 - total, $3 - pending, $4 - running
# Outputs:   the formatted row on stdout
#######################################
print_row() {
  printf '%-10s | %-12s | %-13s | %-13s\n' "$1" "$2" "$3" "$4"
}

#######################################
# Extract the job counts from the text captured from the remote command.
# Arguments: $1 - raw ssh output (may contain banners and carriage returns)
# Outputs:   "<total> <pending>" on stdout, taken from the last line that
#            carries a well-formed __COUNTS__:<total>:<pending> marker
# Returns:   0 if a marker was found, 1 otherwise
#######################################
parse_counts() {
  local -r marker_re='__COUNTS__:([0-9]+):([0-9]+)'
  local line total="" pending=""

  while IFS= read -r line; do
    # Output captured through a tty may carry carriage returns; drop them.
    line=${line//$'\r'/}
    if [[ "$line" =~ $marker_re ]]; then
      # 10# forces base 10 so a value with leading zeros is not read as octal.
      total=$((10#${BASH_REMATCH[1]}))
      pending=$((10#${BASH_REMATCH[2]}))
    fi
  done <<< "$1"

  [[ -n "$total" ]] || return 1
  printf '%s %s\n' "$total" "$pending"
}

#######################################
# Query every cluster and print the summary table.
# Globals:   SERVERS, REMOTE_CMD, TABLE_RULE (read)
# Outputs:   progress lines and the table on stdout; a warning on stderr for
#            every cluster that returned no usable counts
#######################################
main() {
  local server output counts
  local job_count pending_count running_count
  local total_jobs=0 total_pending=0 total_running
  # Rows are kept in memory, so no temporary file is left behind in /tmp.
  local -a rows=()

  for server in "${SERVERS[@]}"; do
    echo "Connecting to $server"
    # -t requests a remote tty; ssh's own stderr is discarded so connection
    # chatter does not clutter the report.
    output=$(ssh -t "$server" "$REMOTE_CMD" 2>/dev/null)

    if counts=$(parse_counts "$output"); then
      read -r job_count pending_count <<< "$counts"
    else
      # Keep going (best effort) but make the failure visible instead of
      # silently reporting an idle cluster.
      printf 'Warning: no job counts received from %s; reporting 0\n' "$server" >&2
      job_count=0
      pending_count=0
    fi

    running_count=$((job_count - pending_count))
    rows+=("$(print_row "$server" "$job_count" "$pending_count" "$running_count")")
    total_jobs=$((total_jobs + job_count))
    total_pending=$((total_pending + pending_count))
  done

  total_running=$((total_jobs - total_pending))

  printf '\n'
  print_row "Server" "Total Jobs" "Pending Jobs" "Running Jobs"
  printf '%s\n' "$TABLE_RULE"
  if ((${#rows[@]} > 0)); then
    printf '%s\n' "${rows[@]}"
  fi
  printf '%s\n' "$TABLE_RULE"
  print_row "TOTAL" "$total_jobs" "$total_pending" "$total_running"
}

main "$@"
How to use:
./cluster_status.sh
......認証.......
Server     | Total Jobs   | Pending Jobs  | Running Jobs
---------------------------------------------------------------
Narval     | 120          | 2             | 118
Beluga     | 299          | 110           | 189
Nibi       | 194          | 193           | 1
Mila       | 507          | 309           | 198
---------------------------------------------------------------
TOTAL      | 1120         | 614           | 506
# Print, on one space-separated line, the first squeue column (presumably the
# job ID — confirm against the site's squeue output format) of every line in
# naganuma.hiroki's queue listing that contains "PD".
# NOTE(review): grep matches "PD" anywhere in the line, not only in the state
# column. The trailing awk '1' re-adds the final newline that tr removed.
squeue -u naganuma.hiroki | grep PD | awk '{ print $1 }' | tr '\n' ' ' | head -n 1 | awk '1'
# Count the squeue lines that contain the pattern XXXXX (a placeholder to be
# replaced with the text to search for).
squeue | grep XXXXX | awk '{ print $1 }' | wc -l
Error message:
error: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:xxxx)>
Solution:
# NOTE(review): PYTHONHTTPSVERIFY=0 switches HTTPS certificate verification
# off (on Python builds that honor this variable) instead of repairing the
# trust store — confirm that this is acceptable before relying on it.
export PYTHONHTTPSVERIFY=0
# Point OpenSSL (SSL_CERT_DIR) and the Python requests library
# (REQUESTS_CA_BUNDLE) at the system CA certificates. The paths assume a
# Debian/Ubuntu-style layout — confirm they exist on the target host.
export SSL_CERT_DIR=/etc/ssl/certs
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
以下のコマンドで、メモリ解放できる
# WARNING: this signals (SIGTERM by default) every process whose effective
# user is the current user, not only the memory-heavy ones — expect the
# current login session and any other work running on this machine under the
# same account to be terminated as well.
pkill -u $(whoami)
# Print, on one space-separated line, the first qstat column (presumably the
# job ID — confirm against the site's qstat output format) of every qstat line
# that contains "qw". The trailing awk '1' re-adds the final newline that tr
# removed.
qstat | grep qw | awk '{ print $1 }' | tr '\n' ' ' | head -n 1 | awk '1'
# Count the `qstat -f` lines that contain " 0/0/80" and "gpu" but not " d".
# NOTE(review): presumably these are fully idle 80-slot GPU queue instances
# that are not in a disabled state — confirm against the qstat -f column layout.
qstat -f | grep " 0/0/80" | grep "gpu" | grep -v " d" | wc -l