diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml
index bf16b67656..775b88cfd3 100644
--- a/.github/workflows/build_cc.yml
+++ b/.github/workflows/build_cc.yml
@@ -32,7 +32,7 @@ jobs:
         python-version: '3.11'
     - uses: lukka/get-cmake@latest
     - run: python -m pip install uv
-    - run: python -m uv pip install --system tensorflow
+    - run: source/install/uv_with_retry.sh pip install --system tensorflow
     - name: Download libtorch
       run: |
         wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip
diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml
index 799a55e9ff..ebbfc4d960 100644
--- a/.github/workflows/test_cc.yml
+++ b/.github/workflows/test_cc.yml
@@ -27,7 +27,7 @@ jobs:
         mpi: mpich
     - uses: lukka/get-cmake@latest
     - run: python -m pip install uv
-    - run: python -m uv pip install --system tensorflow
+    - run: source/install/uv_with_retry.sh pip install --system tensorflow
     - name: Download libtorch
       run: |
         wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip
@@ -49,7 +49,7 @@ jobs:
     # test lammps
     - run: |
         export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
-        python -m uv pip install --system -e .[cpu,test,lmp] mpi4py
+        source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp] mpi4py
       env:
         DP_BUILD_TESTING: 1
       if: ${{ !matrix.check_memleak }}
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
index d97b1f9431..703d0ea2fe 100644
--- a/.github/workflows/test_cuda.yml
+++ b/.github/workflows/test_cuda.yml
@@ -47,10 +47,10 @@ jobs:
         && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3
       if: false # skip as we use nvidia image
     - run: python -m pip install -U uv
-    - run: python -m uv pip install --system "tensorflow>=2.15.0rc0" "torch>=2.2.0"
+    - run: source/install/uv_with_retry.sh pip install --system "tensorflow>=2.15.0rc0" "torch>=2.2.0"
     - run: |
         export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
-        python -m uv pip install --system -v -e .[gpu,test,lmp,cu12,torch] mpi4py
+        source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch] mpi4py
       env:
         DP_VARIANT: cuda
         DP_ENABLE_NATIVE_OPTIMIZATION: 1
diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
index 0f9fc61acd..3cf56ecbd3 100644
--- a/.github/workflows/test_python.yml
+++ b/.github/workflows/test_python.yml
@@ -25,10 +25,10 @@ jobs:
         python-version: ${{ matrix.python }}
     - run: python -m pip install -U uv
     - run: |
-        uv pip install --system mpich
-        uv pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/
+        source/install/uv_with_retry.sh pip install --system mpich
+        source/install/uv_with_retry.sh pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/
         export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
-        uv pip install --system --only-binary=horovod -e .[cpu,test] horovod[tensorflow-cpu] mpi4py
+        source/install/uv_with_retry.sh pip install --system --only-binary=horovod -e .[cpu,test] horovod[tensorflow-cpu] mpi4py
       env:
         # Please note that uv has some issues with finding
         # existing TensorFlow package. Currently, it uses
diff --git a/source/install/uv_with_retry.sh b/source/install/uv_with_retry.sh
new file mode 100755
index 0000000000..2d9a524f6b
--- /dev/null
+++ b/source/install/uv_with_retry.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Run uv with the given arguments and retry when it fails with the
+# intermittent "error decoding response body" error.
+# See also:
+# https://github.com/astral-sh/uv/issues/2586
+# https://github.com/astral-sh/uv/issues/3456
+# https://github.com/astral-sh/uv/issues/3514
+# https://github.com/astral-sh/uv/issues/4402
+#
+# Usage: uv_with_retry.sh <uv arguments...>
+# Exits with 0 on success, otherwise with the exit code of the last uv run.
+
+# Maximum number of uv invocations, including the first one.
+max_attempts=3
+tmpstderr=$(mktemp) || exit 1
+# Remove the capture file on every exit path.
+trap 'rm -f "${tmpstderr}"' EXIT
+
+attempt=1
+while true; do
+    # Send only stderr through tee so it is shown live and saved for this
+    # attempt alone (no -a: output of an earlier attempt must not trigger a
+    # retry). A real pipeline is used because bash waits for tee to finish
+    # before continuing; fd 3 carries uv's stdout around the pipe.
+    {
+        uv "$@" 2>&1 >&3 3>&- | tee "${tmpstderr}" >&2 3>&-
+        exit_code=${PIPESTATUS[0]}
+    } 3>&1
+    if [ "${exit_code}" -eq 0 ]; then
+        exit 0
+    fi
+    # Any other failure is not retried; pass uv's exit code through.
+    if ! grep -q "error decoding response body" "${tmpstderr}"; then
+        exit "${exit_code}"
+    fi
+    if [ "${attempt}" -ge "${max_attempts}" ]; then
+        echo "Max retry reached, exiting..." >&2
+        exit "${exit_code}"
+    fi
+    attempt=$((attempt + 1))
+    echo "Retrying uv in 1 s..." >&2
+    sleep 1
+done