Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions scripts/lumi/demo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env bash
#
# Demo script for running multinode jobs on LUMI. You can run this as a batch job using
# sbatch or as part of an interactive session by running this script as an executable.
#
# Required environment (must be set by the caller before this script runs):
#   PROJECT_DIR  - bind-mounted into the container; also holds containers/*.sif
#   SCRATCH_DIR  - bind-mounted into the container; checkpoints go under it
#   FLASH_DIR    - bind-mounted into the container
#
# Any arguments passed to this script are appended to the training command.
#
#SBATCH --job-name=demo
#SBATCH --account=project_462000229
#SBATCH --output=/scratch/project_462000229/logs/%j.log
#SBATCH --nodes=128 # Total number of nodes
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
#SBATCH --cpus-per-task=6
#SBATCH --time=48:00:00
#SBATCH --time-min=12:00:00
#SBATCH --mem=0 # All memory on the node
#SBATCH --partition=standard-g

module load LUMI/24.03 partition/G

# Fail early, with a clear message, if the directories used for the container
# path and the bind mounts below are not configured. Without this check an
# unset variable silently produces paths like "/containers/..." and -B":".
: "${PROJECT_DIR:?PROJECT_DIR must be set}"
: "${SCRATCH_DIR:?SCRATCH_DIR must be set}"
: "${FLASH_DIR:?FLASH_DIR must be set}"

## Container-dependent settings
export OLMO_CONTAINER="$PROJECT_DIR/containers/lumi-torch25rc-rocm62-py312.sif"
export ROCM_PATH=/opt/rocm
# Read by scripts/lumi/run-in-container.sh to pick the conda env to activate.
export CONDA_ENV=pytorch
export PYTHONPATH=".:${PYTHONPATH}"

## General LUMI settings (these rarely change)
export OMP_NUM_THREADS="$SLURM_CPUS_PER_TASK"
export MPICH_GPU_SUPPORT_ENABLED=1
export NCCL_SOCKET_IFNAME=hsn
export NCCL_NET_GDR_LEVEL=3
export MIOPEN_USER_DB_PATH="/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}"
export MIOPEN_CUSTOM_CACHE_DIR="${MIOPEN_USER_DB_PATH}"
export CXI_FORK_SAFE=1
export CXI_FORK_SAFE_HP=1
export FI_CXI_DISABLE_CQ_HUGETLB=1
export GPU_MAX_HW_QUEUES=8
# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
export FI_CXI_DEFAULT_CQ_SIZE=131072

## Job settings
export CHECKPOINTS_PATH="$SCRATCH_DIR/checkpoints"
export HF_DATASETS_OFFLINE=1
export SINGULARITYENV_TORCH_DIST_INIT_BARRIER=1
# Try playing with max_split_size_mb if you run into OOM errors.
#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128

## Debug settings
#export NCCL_DEBUG=INFO
#export FI_LOG_LEVEL=INFO

# Launch the tasks. Each task runs run_with_environment.sh, which wraps
# `singularity exec`; inside the container run-in-container.sh starts the
# training command. Extra script arguments are forwarded verbatim via "$@".
srun \
  --cpus-per-task="$SLURM_CPUS_PER_TASK" \
  --distribution=block:block \
  --kill-on-bad-exit \
  scripts/run_with_environment.sh \
    singularity exec \
    -B"$PROJECT_DIR:$PROJECT_DIR" \
    -B"$FLASH_DIR:$FLASH_DIR" \
    -B"$SCRATCH_DIR:$SCRATCH_DIR" \
    -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
    "$OLMO_CONTAINER" \
    scripts/lumi/run-in-container.sh \
      python scripts/train.py configs/mitchish1-s3.yaml \
      "$@"
6 changes: 2 additions & 4 deletions scripts/lumi/log_into_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ srun --interactive --pty --jobid=$1 \
-B"$PROJECT_DIR:$PROJECT_DIR" \
-B"$SCRATCH_DIR:$SCRATCH_DIR" \
-B"$FLASH_DIR:$FLASH_DIR" \
-B /opt/cray:/opt/cray \
-B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
$PROJECT_DIR/containers/llm-lumi-torch23_latest.sif \
-B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
$OLMO_CONTAINER \
fish
6 changes: 2 additions & 4 deletions scripts/lumi/lumi-interactive.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,5 @@ singularity shell \
-B"$PROJECT_DIR:$PROJECT_DIR" \
-B"$SCRATCH_DIR:$SCRATCH_DIR" \
-B"$FLASH_DIR:$FLASH_DIR" \
-B /opt/cray:/opt/cray \
-B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
$PROJECT_DIR/containers/llm-lumi-torch23_latest.sif
-B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
$OLMO_CONTAINER
7 changes: 7 additions & 0 deletions scripts/lumi/run-in-container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
#
# Runs the command given as arguments, optionally inside a conda environment.
#
# Environment:
#   CONDA_ENV - if set and non-empty, this conda environment is activated
#               (via /opt/miniconda3/bin/activate) before the command runs.
# Arguments:
#   "$@" - the command to execute and its arguments, passed through verbatim.

# Put setup of conda in an env variable if conda is needed
if [[ -n "${CONDA_ENV:-}" ]]; then
  # shellcheck disable=SC1091 -- the activate script only exists inside the container
  source /opt/miniconda3/bin/activate "${CONDA_ENV}"
fi

# Quoted so that arguments containing whitespace, glob characters, or empty
# strings reach the command exactly as the caller supplied them.
"$@"
6 changes: 2 additions & 4 deletions scripts/pyspy_all_nodes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ srun --overlap --jobid $1 \
-B"$PROJECT_DIR:$PROJECT_DIR" \
-B"$SCRATCH_DIR:$SCRATCH_DIR" \
-B"$FLASH_DIR:$FLASH_DIR" \
-B /opt/cray:/opt/cray \
-B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
$PROJECT_DIR/containers/llm-lumi_latest.sif \
-B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
$OLMO_CONTAINER \
bash scripts/pyspy_all_processes.sh | sort -s -t: -k1,1