
Commit 512d4b0

Merge branch 'main' of github.com:mlfoundations/evalchemy
2 parents 3c24578 + d7b0018 commit 512d4b0

2 files changed: +43 -0 lines changed


eval/distributed/launch_simple.py

Lines changed: 6 additions & 0 deletions
@@ -39,6 +39,12 @@
         "gpus_per_node": 4,
         "internet": True,
     },
+    {
+        "name": "leonardo",
+        "hostname_pattern": r".*leonardo.*",
+        "eval_sbatch_filename": "simple_leonardo.sbatch",
+        "gpus_per_node": 4,
+    },
 ]

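For context, a rough, hypothetical sketch (not the actual launch_simple.py logic) of how a launcher could use the new entry: match hostname_pattern against the machine's hostname to detect Leonardo, then pick up the named sbatch template. The detect_cluster helper below is illustrative only.

# Hypothetical sketch, not the code in launch_simple.py: select the cluster
# config whose hostname_pattern matches the machine we are running on.
import re
import socket

CLUSTERS = [
    {
        "name": "leonardo",
        "hostname_pattern": r".*leonardo.*",
        "eval_sbatch_filename": "simple_leonardo.sbatch",
        "gpus_per_node": 4,
    },
]


def detect_cluster(hostname=None):
    """Return the first cluster entry whose regex matches the hostname."""
    hostname = hostname or socket.gethostname()
    for cluster in CLUSTERS:
        if re.match(cluster["hostname_pattern"], hostname):
            return cluster
    raise RuntimeError(f"no cluster config matches hostname {hostname!r}")


# e.g. a node named "login07.leonardo.local" matches r".*leonardo.*", so jobs
# there would be submitted with simple_leonardo.sbatch on 4-GPU nodes.
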
simple_leonardo.sbatch

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+#!/bin/bash -x
+#SBATCH --nodes={num_nodes}
+#SBATCH --ntasks-per-node=1
+#SBATCH --gres=gpu:4
+#SBATCH --time={time_limit}
+#SBATCH --job-name={job_name}
+#SBATCH --exclude=lrdn[1606,2776,2425,2808,3064,3064,1953,2414,1506,1718,1779,2828,2354,3279,1370,2595,2751,2921,2368,2976,2733,2277,3136,2013,2952,1427,2682,2349,1655,1390,3151,3130,2002,2654,2101,2358,1597,2585,2900,2687,3165,3031,2798,2530,2344,1384,1420,1474,1509,1520,1556,1607,1647,1810,1927,2000,2028,2056,2120,2136,2371,2384,2444,2465,2479,2563,2598,2652,2716,2731,2746,2755,2772,2775,2792,2794,2917,2926,2927,3110,3221,3395,0666]
+#SBATCH --mail-type=END,TIME_LIMIT,FAIL
+#SBATCH --mail-user=dcft-slurm-notifs-aaaap7wt363mcsgryaejj2o6dm@dogs-and-ml.slack.com
+
+# EXIT ON FAILURE
+set -e
+
+# MODULES
+module load cuda/12.4 nccl/12.4
+
+# ENVIRONMENT VARIABLES - EVALCHEMY, HF_HUB_CACHE, and EVALCHEMY_ACTIVATE_ENV
+# source /work/10159/rmarten/vista/dcft/dcft_private/hpc/dotenv/tacc.env
+source /leonardo_work/EUHPC_E03_068/DCFT_shared/dcft_private/hpc/dotenv/leonardo.env
+source /leonardo_work/EUHPC_E03_068/DCFT_shared/mamba/bin/activate /leonardo_work/EUHPC_E03_068/DCFT_shared/evalchemy/env/cpu-evalchemy
+
+# CONDA
+$EVALCHEMY_ACTIVATE_ENV
+
+# DOWNLOAD MODEL AND DATASET
+MODEL_NAME={model_name}
+INPUT_DATASET={input_dataset}
+OUTPUT_DATASET={output_dataset}
+srun --nodes=1 huggingface-cli download $MODEL_NAME --cache-dir $HF_HUB_CACHE
+srun --nodes=1 huggingface-cli download $INPUT_DATASET --cache-dir $HF_HUB_CACHE --repo-type dataset
+
+# RUN SHARDED INFERENCE
+srun --output={logs_dir}/%x_%j_%n.out bash -c 'echo -e "GLOBAL_SIZE: ${SLURM_JOB_NUM_NODES}\nRANK: ${SLURM_NODEID}\nMODEL: '$MODEL_NAME'\nINPUT_DATASET: '$INPUT_DATASET'\nOUTPUT_DATASET: '$OUTPUT_DATASET'"'
+srun --output={logs_dir}/%x_%j_%n.out bash -c 'python $EVALCHEMY/eval/distributed/process_shard.py --global_size ${SLURM_JOB_NUM_NODES} --rank ${SLURM_NODEID} --input_dataset '${INPUT_DATASET}' --model_name '${MODEL_NAME}' --output_dataset '${OUTPUT_DATASET}' --upload'
+
+# COMPUTE SCORES
+srun --nodes=1 python -m eval.eval --model precomputed_hf --model_args "repo_id={output_dataset}",model="{model_name}" --tasks {tasks_str} --output_path logs --use_database
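In the sharded-inference step, each node receives --global_size (SLURM_JOB_NUM_NODES) and --rank (SLURM_NODEID). As an illustration only, assuming an HF-datasets-style split rather than the actual contents of process_shard.py, a node's shard could be selected like this:

# Illustrative only; this is not necessarily what process_shard.py does.
import argparse
from datasets import load_dataset

parser = argparse.ArgumentParser()
parser.add_argument("--global_size", type=int, required=True)  # SLURM_JOB_NUM_NODES
parser.add_argument("--rank", type=int, required=True)         # SLURM_NODEID
parser.add_argument("--input_dataset", required=True)
args = parser.parse_args()

# Each node takes a disjoint, contiguous 1/global_size slice of the dataset,
# so the per-node jobs never overlap and together cover every example.
dataset = load_dataset(args.input_dataset, split="train")
shard = dataset.shard(num_shards=args.global_size, index=args.rank, contiguous=True)
print(f"rank {args.rank}/{args.global_size}: {len(shard)} examples")

After all shards are uploaded to the output dataset, the final srun above scores the combined results with the precomputed_hf model type.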
