Merged
189 commits
26759a8
Use Adapt.jl to change storage and element type
vchuravy Dec 17, 2024
fc610f9
add docs and CUDAExt
vchuravy Apr 21, 2025
7b5d81b
Aqua set unbound_args
vchuravy Apr 21, 2025
f730ef4
lower bound CUDA to 5.2
vchuravy Apr 22, 2025
13b7f59
add initial CUDA pipeline
vchuravy Apr 21, 2025
02de7d2
add storage_type, real_type to semidiscretize
vchuravy Apr 22, 2025
671f5b1
add GPU construction test
vchuravy Apr 22, 2025
ecd09a5
don't adapt Array{MArray}
vchuravy Apr 22, 2025
312009a
add some more cuda adapt tests
vchuravy Apr 22, 2025
690efd1
use sources for dev branch
vchuravy Apr 28, 2025
15a898b
fixup! use sources for dev branch
vchuravy May 8, 2025
45d344b
use released version of CUDA
vchuravy May 14, 2025
7e72eff
Update .buildkite/pipeline.yml
vchuravy May 14, 2025
3450ddd
Use Adapt.jl to change storage and element type
vchuravy Dec 17, 2024
cf2f590
add docs and CUDAExt
vchuravy Apr 21, 2025
de96f85
Aqua set unbound_args
vchuravy Apr 21, 2025
1a7cff2
lower bound CUDA to 5.2
vchuravy Apr 22, 2025
68edf29
add initial CUDA pipeline
vchuravy Apr 21, 2025
11ff63a
add storage_type, real_type to semidiscretize
vchuravy Apr 22, 2025
4d8a31f
add GPU construction test
vchuravy Apr 22, 2025
6ca8c3d
don't adapt Array{MArray}
vchuravy Apr 22, 2025
4ef2d98
add some more cuda adapt tests
vchuravy Apr 22, 2025
77395f5
use sources for dev branch
vchuravy Apr 28, 2025
1d78f07
fixup! use sources for dev branch
vchuravy May 8, 2025
39535ee
use released version of CUDA
vchuravy May 14, 2025
b973758
Update .buildkite/pipeline.yml
vchuravy May 14, 2025
7105da7
fix test_p4est_2d
vchuravy Jun 30, 2025
1fd6fe6
fix first GPU test
vchuravy Jun 30, 2025
d8a4bc8
Merge branch 'vc/adapt' into feature-gpu-offloading
benegee Jul 1, 2025
6ceef3a
address review comments
vchuravy Jul 1, 2025
7a53362
offload compute_coefficients
benegee Jul 1, 2025
68eb905
fmt
benegee Jul 1, 2025
3d00bdf
fixup! address review comments
vchuravy Jul 1, 2025
4b32fa0
add review comments
vchuravy Jul 1, 2025
10f7593
convert fstar_* cache entries to VecOfArrays
benegee Jul 1, 2025
c83bdbd
restore elixir
benegee Jul 1, 2025
8c6c57d
Merge branch 'vc/adapt' into feature-gpu-offloading
benegee Jul 2, 2025
d3b94fc
test native version as well
benegee Jul 2, 2025
97e13ec
adapt 1D and 3D version
benegee Jul 2, 2025
44f7134
Downgrade compat with Adapt
benegee Jul 2, 2025
abbcc56
Use Adapt.jl to change storage and element type
vchuravy Dec 17, 2024
a18e5d2
restore elixir
benegee Jul 1, 2025
5c942fe
offload compute_coefficients
benegee Jul 1, 2025
47a55f2
fmt
benegee Jul 1, 2025
36b0e4a
test native version as well
benegee Jul 2, 2025
153d828
adapt 1D and 3D version
benegee Jul 2, 2025
819ba75
Downgrade compat with Adapt
benegee Jul 2, 2025
e75cac7
update requires to 1.3
vchuravy Jul 2, 2025
4b6f63e
Merge branch 'vc/adapt' into feature-gpu-offloading
benegee Jul 2, 2025
61b4da1
Merge branch 'main' into feature-gpu-offloading
benegee Sep 16, 2025
e7cde27
missed during merge
benegee Sep 16, 2025
b174d6d
mistakes during merge
benegee Sep 16, 2025
489bb24
cleanup
benegee Sep 18, 2025
b4d1535
Basis kernels for 3D P4est
benegee Sep 18, 2025
2443cf8
port stepsize computation
benegee Sep 18, 2025
fc13ea5
CPU workaround for analysis callback
benegee Sep 18, 2025
2ff2f52
tests
benegee Sep 18, 2025
bc4ad17
add benchmark
benegee Sep 19, 2025
de06c61
fix max_dt
benegee Sep 19, 2025
29298a5
profiler output
benegee Sep 25, 2025
281a540
Merge branch 'main' into feature-gpu-offloading
benegee Sep 29, 2025
962a383
fmt
benegee Sep 29, 2025
a60e27d
missed max_dt calls
benegee Sep 29, 2025
ce742a3
Merge branch 'main' into feature-gpu-offloading
benegee Sep 30, 2025
2073d7c
some fixes
benegee Sep 30, 2025
9a2f130
after merge fixes
benegee Sep 30, 2025
9a47f29
some more fixes
benegee Sep 30, 2025
94f5d90
Merge branch 'main' into feature-gpu-offloading
benegee Oct 1, 2025
6ffb69f
post merge fixes
benegee Oct 1, 2025
fb25fa2
Merge branch 'main' into feature-gpu-offloading
benegee Oct 1, 2025
307c3eb
more
benegee Oct 1, 2025
c39b4de
more
benegee Oct 1, 2025
a38cc03
Squashed commit of the following:
benegee Oct 7, 2025
013244d
Apply suggestions from code review
benegee Oct 8, 2025
5b2c0bf
Merge branch 'feature-gpu-offloading' of github.com:trixi-framework/T…
benegee Oct 8, 2025
8d5a55b
Merge branch 'main' into feature-gpu-offloading
benegee Oct 8, 2025
8a98d27
!fixup
benegee Oct 8, 2025
7de1e57
fmt
benegee Oct 8, 2025
31a65cb
pass backend through
benegee Oct 8, 2025
4064e79
fixes
benegee Oct 8, 2025
af50cda
backends here and there
benegee Oct 8, 2025
5893d4d
almost everywhere
benegee Oct 8, 2025
a1caa12
some more
benegee Oct 8, 2025
a5cded3
next round
benegee Oct 8, 2025
7c6ab4a
could this be...
benegee Oct 9, 2025
719c2d1
adapts until 2d prolong2interfaces!
vivimie Nov 6, 2025
6bbc069
adds explicit mesh type in signature
vivimie Nov 6, 2025
e58c298
adapts the rest for the 2d basic advection gpu elixir
vivimie Nov 7, 2025
a570beb
Merge branch 'main' into feature-gpu-offloading
benegee Nov 27, 2025
b59239b
enable 2D CUDA tests
benegee Nov 27, 2025
c0dd4b5
fmt
benegee Nov 27, 2025
f90f5a8
fixes bugs in the CPU implementation
vivimie Dec 3, 2025
0291d14
Merge branch 'main' into feature-gpu-offloading
benegee Jan 19, 2026
68ad089
Merge branch 'main' into feature-gpu-offloading
benegee Jan 19, 2026
4ce90ab
fix
benegee Jan 19, 2026
ae9719d
fixes
benegee Jan 19, 2026
a13dd61
fix
benegee Jan 20, 2026
8ecb6c4
fix
benegee Jan 20, 2026
3d69311
no nextfloat per element
benegee Jan 20, 2026
a2f0488
fmt
benegee Jan 20, 2026
ff6dfd5
Merge branch 'main' into feature-gpu-offloading
benegee Jan 20, 2026
31490d3
another RealT_for_test_tolerances
benegee Jan 20, 2026
8e802ee
readd Project.toml
benegee Feb 10, 2026
0824647
Merge branch 'main' into feature-gpu-offloading
benegee Feb 23, 2026
77c1569
Merge branch 'main' into feature-gpu-offloading
benegee Feb 23, 2026
71d837b
fmt
benegee Feb 23, 2026
ae3e415
fixes
benegee Feb 23, 2026
a801ebe
more
benegee Feb 23, 2026
8829787
Merge branch 'main' into feature-gpu-offloading
benegee Feb 24, 2026
2831c9c
add @inline for inner functions
benegee Feb 24, 2026
34c4684
more fixes
benegee Feb 24, 2026
da4652d
Merge branch 'main' into feature-gpu-offloading
benegee Feb 24, 2026
e320bc5
define unsafe_wrap_or_alloc for CUDA.KernelAdaptor
vchuravy Feb 24, 2026
f72fcc1
fixup! define unsafe_wrap_or_alloc for CUDA.KernelAdaptor
vchuravy Feb 24, 2026
4805a70
fixup! define unsafe_wrap_or_alloc for CUDA.KernelAdaptor
vchuravy Feb 24, 2026
0fa07c4
apply bandaid
vchuravy Feb 24, 2026
287a113
final fix?
benegee Feb 24, 2026
f8a1696
Merge branch 'feature-gpu-offloading' of github.com:trixi-framework/T…
benegee Feb 24, 2026
1c2ea9b
Merge branch 'main' into feature-gpu-offloading
benegee Feb 24, 2026
dc6455d
add method to filter the cache
benegee Feb 24, 2026
6af1201
final^2
benegee Feb 24, 2026
2ecdf14
setup kernelabstraction harness
vchuravy Feb 25, 2026
c83404a
add advection_basic to KA tests
benegee Feb 25, 2026
c470dc9
no allocation tests
benegee Feb 26, 2026
518e348
Merge branch 'main' into feature-gpu-offloading
benegee Mar 16, 2026
6a3567a
missed
benegee Mar 16, 2026
96cdec4
Update Project.toml
benegee Mar 16, 2026
5f123ee
add sources section to benchmark Project.toml
benegee Mar 16, 2026
cec6865
Merge branch 'feature-gpu-offloading' of github.com:trixi-framework/T…
benegee Mar 16, 2026
5974c2a
fix meshT
benegee Mar 18, 2026
0a3448f
add backend argument for coupled semis
benegee Mar 18, 2026
70ea410
fmt
benegee Mar 18, 2026
95f0f03
fix
benegee Mar 18, 2026
0727ec4
fix mesh type
benegee Mar 18, 2026
39d4957
fix
benegee Mar 18, 2026
dc7dbb6
move get_backend to within rhs!
benegee Mar 18, 2026
476b54f
remove backend from max_dt
benegee Mar 18, 2026
fbe2171
here as well
benegee Mar 18, 2026
f344f65
fix
benegee Mar 18, 2026
a3eb8c8
add old method signatures to stay compatible with TrixiAtmo.jl
benegee Mar 18, 2026
04e0e2b
fix
benegee Mar 19, 2026
a1cdae1
Merge branch 'main' into feature-gpu-offloading
benegee Mar 19, 2026
331d704
Merge branch 'main' into feature-gpu-offloading
benegee Mar 24, 2026
6a95f55
meshT -> MeshT
benegee Mar 24, 2026
d7910c7
Apply suggestions from code review
benegee Mar 24, 2026
32d41ef
module TestCUDA2D
benegee Mar 24, 2026
cc3c78b
use log_base and enable flux differencing
benegee Mar 24, 2026
418c944
add a short note to the benchmark problem
benegee Mar 24, 2026
2962ed7
add device_override for Trixi.log
vchuravy Mar 24, 2026
e116f7b
fixup! add device_override for Trixi.log
vchuravy Mar 24, 2026
a1d4481
fixup! add device_override for Trixi.log
vchuravy Mar 24, 2026
5daf2c6
typo?
benegee Mar 24, 2026
b93bba9
fixup! add device_override for Trixi.log
vchuravy Mar 24, 2026
fc1cdf5
unify naming of inner methods
benegee Mar 24, 2026
2f52234
fmt
benegee Mar 24, 2026
8b8aa01
Merge branch 'feature-gpu-offloading' of github.com:trixi-framework/T…
benegee Mar 24, 2026
917b3a6
add ndims(MeshT)
benegee Mar 24, 2026
8c88cfe
Merge branch 'main' into feature-gpu-offloading
ranocha Mar 24, 2026
0b751ac
add NEWS
benegee Mar 25, 2026
97fdccb
add comment on how to use GPU
benegee Mar 25, 2026
44cb1ba
comment
benegee Mar 25, 2026
7bc64e1
Merge branch 'feature-gpu-offloading' of github.com:trixi-framework/T…
benegee Mar 25, 2026
484f587
activate test_allocations for GPU tests
vchuravy Mar 25, 2026
e9e318f
fixup! activate test_allocations for GPU tests
vchuravy Mar 25, 2026
e63e887
fixup! activate test_allocations for GPU tests
vchuravy Mar 25, 2026
7fc5dfe
fixup! fixup! activate test_allocations for GPU tests
vchuravy Mar 25, 2026
984c402
fixup! fixup! fixup! activate test_allocations for GPU tests
vchuravy Mar 25, 2026
e279fc1
fixup!
vchuravy Mar 25, 2026
d0301fb
fix spell check
vchuravy Mar 25, 2026
63565dc
add @invokelatest
benegee Mar 25, 2026
90d19e6
add compat bounds
benegee Mar 25, 2026
f30dd8d
remove finalize
benegee Mar 25, 2026
b000ffb
remove kernels for backward compatibility
benegee Mar 25, 2026
ae43324
no source terms
benegee Mar 25, 2026
575e0a2
Merge branch 'feature-gpu-offloading' of github.com:trixi-framework/T…
benegee Mar 25, 2026
34773f2
Merge branch 'main' into feature-gpu-offloading
benegee Mar 25, 2026
56d9392
comment on MeshT
benegee Mar 25, 2026
eee2168
fixes
benegee Mar 25, 2026
70b62d5
fmt
benegee Mar 25, 2026
7a27196
try different formatting
benegee Mar 25, 2026
fb7d10b
fix
benegee Mar 25, 2026
ef19445
missed ndims
benegee Mar 26, 2026
c8431d2
Add GPU parallel set_zero!
vchuravy Mar 26, 2026
4dc17be
set version to v0.16.0-DEV
ranocha Mar 26, 2026
cb3f7f9
Apply suggestions from code review
vchuravy Mar 26, 2026
e98fcf7
use dispatch for indices2direction
vchuravy Mar 26, 2026
958cd57
Merge branch 'main' into feature-gpu-offloading
ranocha Mar 26, 2026
52a05ad
Merge branch 'main' into feature-gpu-offloading
ranocha Mar 26, 2026
cb56c8a
Apply suggestion from @ranocha
ranocha Mar 26, 2026
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -85,6 +85,7 @@ jobs:
- performance_specializations
- mpi
- threaded
- kernelabstractions
include:
- version: '1.11'
os: ubuntu-latest
17 changes: 17 additions & 0 deletions NEWS.md
@@ -7,6 +7,23 @@ for human readability.

## Changes when updating to v0.16 from v0.15.x

#### Added

- Introducing GPU support: Based on work by Jan Kraus and Lars Christmann, Trixi.jl can
now be partly executed on GPUs. This includes simulations with flux differencing on
`P4estMesh` in 2D and 3D. Adaptive mesh refinement, multi-GPU setups, source terms, and
callbacks are not yet available. Offloading is achieved via KernelAbstractions.jl kernels,
which, at the moment, execute the same code as is usually run on CPUs. A backend is selected
by passing an appropriate data type as the keyword argument `storage_type` to
`semidiscretize`. See the
[heterogeneous](https://trixi-framework.github.io/TrixiDocumentation/dev/heterogeneous/)
section for instructions on how to port kernels. This is, however, still preliminary
and subject to change.
GPU kernels are currently CI-tested on NVIDIA GPUs in a Buildkite workflow using
`TRIXI_TEST=CUDA` ([#2590]).
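
  The backend selection described in the entry above can be sketched as follows (an
  illustrative fragment, not a complete elixir; it assumes CUDA.jl is installed and that
  `semi` is an already constructed semidiscretization):

  ```julia
  using Trixi
  using CUDA  # provides `CuArray` and loads the CUDA package extension

  # Passing a device array type as `storage_type` moves the solution data to the GPU;
  # `real_type` optionally changes the floating-point element type at the same time.
  tspan = (0.0, 1.0)
  ode = semidiscretize(semi, tspan;
                       storage_type = CuArray,
                       real_type = Float32)
  ```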

#### Changed

- The implementation of the local DG (`ViscousFormulationLocalDG`) `solver_parabolic` has been changed for the `P4estMesh`.
In particular, instead of computing the `ldg_switch` as the dot product of the normal direction with ones,
i.e., summing up the normal components, the `ldg_switch` is now selected as
2 changes: 1 addition & 1 deletion Project.toml
@@ -86,7 +86,7 @@ EllipsisNotation = "1.0"
FillArrays = "1.13"
ForwardDiff = "0.10.38, 1"
HDF5 = "0.17"
KernelAbstractions = "0.9.36"
KernelAbstractions = "0.9.38"
LinearAlgebra = "1"
LinearMaps = "2.7, 3.0"
LoopVectorization = "0.12.171"
16 changes: 16 additions & 0 deletions benchmark/CUDA/Project.toml
@@ -0,0 +1,16 @@
[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"

[sources]
Trixi = {path = "../.."}

[compat]
CUDA = "5.8.2"
JSON = "1.4.0"
OrdinaryDiffEqLowStorageRK = "1.12.0"
TimerOutputs = "0.5.25"
Trixi = "0.16"
76 changes: 76 additions & 0 deletions benchmark/CUDA/elixir_euler_taylor_green_vortex.jl
@@ -0,0 +1,76 @@
using OrdinaryDiffEqLowStorageRK
using Trixi

###############################################################################
# semidiscretization of the compressible Euler equations

equations = CompressibleEulerEquations3D(1.4)

function initial_condition_taylor_green_vortex(x, t,
equations::CompressibleEulerEquations3D)
A = 1.0 # magnitude of speed
Ms = 0.1 # maximum Mach number

rho = 1.0
v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3])
v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3])
v3 = 0.0
p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms
p = p +
1.0 / 16.0 * A^2 * rho *
(cos(2 * x[1]) * cos(2 * x[3]) +
2 * cos(2 * x[2]) + 2 * cos(2 * x[1]) + cos(2 * x[2]) * cos(2 * x[3]))

return prim2cons(SVector(rho, v1, v2, v3, p), equations)
end

initial_condition = initial_condition_taylor_green_vortex

volume_flux = flux_ranocha
surface_flux = flux_lax_friedrichs
volume_integral = VolumeIntegralFluxDifferencing(volume_flux)
solver = DGSEM(polydeg = 5, surface_flux = surface_flux, volume_integral = volume_integral)

coordinates_min = (-1.0, -1.0, -1.0) .* pi
coordinates_max = (1.0, 1.0, 1.0) .* pi

initial_refinement_level = 1
trees_per_dimension = (4, 4, 4)

mesh = P4estMesh(trees_per_dimension, polydeg = 1,
coordinates_min = coordinates_min, coordinates_max = coordinates_max,
periodicity = true, initial_refinement_level = initial_refinement_level)

semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver;
boundary_conditions = boundary_condition_periodic)

###############################################################################
# ODE solvers, callbacks etc.

tspan = (0.0, 100.0)
ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing)

summary_callback = SummaryCallback()

stepsize_callback = StepsizeCallback(cfl = 0.1)

callbacks = CallbackSet(summary_callback,
stepsize_callback)

###############################################################################
# run the simulation

maxiters = 200
run_profiler = false

# disable warnings when maxiters is reached
integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false),
dt = 1.0,
save_everystep = false, callback = callbacks,
maxiters = maxiters, verbose = false)
if run_profiler
prof_result = CUDA.@profile solve!(integrator)
else
solve!(integrator)
prof_result = nothing
end
91 changes: 91 additions & 0 deletions benchmark/CUDA/run.jl
@@ -0,0 +1,91 @@
using Trixi
using CUDA
using TimerOutputs
using JSON

function main(elixir_path)

# setup
maxiters = 50
initial_refinement_level = 3
storage_type = CuArray
real_type = Float64

println("Warming up...")

# start simulation with tiny final time to trigger compilation
duration_compile = @elapsed begin
trixi_include(elixir_path,
tspan = (0.0, 1e-14),
storage_type = storage_type,
real_type = real_type)
trixi_include(elixir_path,
tspan = (0.0, 1e-14),
storage_type = storage_type,
real_type = Float32)
end

println("Finished warm-up in $duration_compile seconds\n")
println("Starting simulation...")

# start the real simulation
duration_elixir = @elapsed trixi_include(elixir_path,
maxiters = maxiters,
initial_refinement_level = initial_refinement_level,
storage_type = storage_type,
real_type = real_type)

# store metrics (on every rank!)
metrics = Dict{String, Float64}("elapsed time" => duration_elixir)

# read TimerOutputs timings
timer = Trixi.timer()
metrics["total time"] = 1.0e-9 * TimerOutputs.tottime(timer)
metrics["rhs! time"] = 1.0e-9 * TimerOutputs.time(timer["rhs!"])

# compute performance index
latest_semi = @invokelatest (@__MODULE__).semi
nrhscalls = Trixi.ncalls(latest_semi.performance_counter)
walltime = 1.0e-9 * take!(latest_semi.performance_counter)
metrics["PID"] = walltime * Trixi.mpi_nranks() /
(Trixi.ndofsglobal(latest_semi) * nrhscalls)

# write json file
open("metrics.out", "w") do f
indent = 2
JSON.print(f, metrics, indent)
end

# run profiler
maxiters = 5
initial_refinement_level = 1

println("Running profiler (Float64)...")
trixi_include(elixir_path,
maxiters = maxiters,
initial_refinement_level = initial_refinement_level,
storage_type = storage_type,
real_type = Float64,
run_profiler = true)

open("profile_float64.txt", "w") do io
show(io, @invokelatest (@__MODULE__).prof_result)
end

println("Running profiler (Float32)...")
trixi_include(elixir_path,
maxiters = maxiters,
initial_refinement_level = initial_refinement_level,
storage_type = storage_type,
real_type = Float32,
run_profiler = true)

open("profile_float32.txt", "w") do io
show(io, @invokelatest (@__MODULE__).prof_result)
end
end

# hardcoded elixir
elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl")

main(elixir_path)
11 changes: 8 additions & 3 deletions docs/src/heterogeneous.md
@@ -120,9 +120,14 @@ function trixi_rhs_fct(mesh, equations, solver, cache, args)
end
```

1. Put the inner code in a new function `rhs_fct_per_element`. Besides the index
`element`, pass all required fields as arguments, but make sure to `@unpack` them from
their structs in advance.
1. Move the inner code into a new inlined function `rhs_fct_per_element`.
```julia
@inline function rhs_fct_per_element(..., element, ...)
...
end
```
Besides the index `element`, pass all required fields as arguments, but make sure to
`@unpack` them from their structs in advance.
2. Where `trixi_rhs_fct` is called, get the backend, i.e., the hardware we are currently
running on via `trixi_backend(x)`.
This will, e.g., work with `u_ode`. Internally, KernelAbstractions.jl's `get_backend`
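The two steps above follow the generic KernelAbstractions.jl pattern, which can be
sketched standalone as follows (all function names here are placeholders, not actual
Trixi.jl API; Trixi.jl's `trixi_backend` wraps `get_backend` as described):

```julia
using KernelAbstractions

# Step 1: per-element function, marked @inline, receiving all fields explicitly.
@inline function rhs_fct_per_element!(du, u, element)
    du[element] = 2 * u[element]
end

# Kernel wrapping the per-element function; one work item per element.
@kernel function rhs_fct_kernel!(du, u)
    element = @index(Global)
    rhs_fct_per_element!(du, u, element)
end

# Step 2: obtain the backend from the data, then launch the kernel on it.
function trixi_rhs_fct!(du, u)
    backend = get_backend(u)
    kernel! = rhs_fct_kernel!(backend)
    kernel!(du, u; ndrange = length(u))
    KernelAbstractions.synchronize(backend)
    return nothing
end

# On plain `Array`s this runs on the CPU backend; with `CuArray`s the very same
# kernel code would be compiled for and launched on the GPU.
du = zeros(4)
trixi_rhs_fct!(du, [1.0, 2.0, 3.0, 4.0])
```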
4 changes: 4 additions & 0 deletions docs/src/styleguide.md
@@ -22,6 +22,10 @@ conventions, we apply and enforce automated source code formatting
and its siblings, put the `cache` first.
* Some internal functions take a "computational backend" argument, this should always be passed as the first argument.
* Otherwise, use the order `mesh, equations, solver, cache`.
* In the course of GPU offloading, we sometimes pass `MeshT = typeof(mesh)` instead of
`mesh` when the called method needs the mesh type for dispatch only. This part
of the code is under active development and is not considered stable API at the
moment.
* If something needs to be specified in more detail for dispatch, put the additional argument before the general one
that is specified in more detail. For example, we use `have_nonconservative_terms(equations), equations`
and `dg.mortar, dg`.
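
The `MeshT` convention mentioned above can be illustrated with mock types (a
hypothetical sketch; the type and function names below are stand-ins, not actual
Trixi.jl code):

```julia
# Mock types illustrating the MeshT-for-dispatch pattern described above.
abstract type AbstractMesh{NDIMS} end
struct P4estMesh{NDIMS} <: AbstractMesh{NDIMS} end

# The callee needs only the mesh *type*, so the type itself is passed and
# queried, e.g. for the spatial dimension:
Base.ndims(::Type{<:AbstractMesh{NDIMS}}) where {NDIMS} = NDIMS

uses_quadrilaterals(::Type{MeshT}) where {MeshT <: P4estMesh} = ndims(MeshT) == 2

mesh = P4estMesh{2}()
MeshT = typeof(mesh)
uses_quadrilaterals(MeshT)  # dispatch needs no mesh instance
```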
6 changes: 3 additions & 3 deletions examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl
@@ -33,6 +33,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen
# ODE solvers, callbacks etc.

# Create ODE problem with time span from 0.0 to 1.0
# Change `storage_type` to, e.g., `CuArray` to actually run on GPU
ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing)

# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup
@@ -50,9 +50,8 @@ save_solution = SaveSolutionCallback(interval = 100,
stepsize_callback = StepsizeCallback(cfl = 1.6)

# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver
callbacks = CallbackSet(summary_callback, stepsize_callback)
# TODO: GPU. The `analysis_callback` needs to be updated for GPU support
# analysis_callback, save_solution, stepsize_callback)
callbacks = CallbackSet(summary_callback, analysis_callback,
save_solution, stepsize_callback)

###############################################################################
# run the simulation
63 changes: 63 additions & 0 deletions examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl
@@ -0,0 +1,63 @@
# The same setup as tree_3d_dgsem/elixir_advection_basic.jl
# to verify GPU support and Adapt.jl support.

using OrdinaryDiffEqLowStorageRK
using Trixi

###############################################################################
# semidiscretization of the linear advection equation

advection_velocity = (0.2, -0.7, 0.5)
equations = LinearScalarAdvectionEquation3D(advection_velocity)

# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux
solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)

coordinates_min = (-1.0, -1.0, -1.0) # minimum coordinates (min(x), min(y), min(z))
coordinates_max = (1.0, 1.0, 1.0) # maximum coordinates (max(x), max(y), max(z))

# Create P4estMesh with 8 x 8 x 8 elements (note `refinement_level=1`)
trees_per_dimension = (4, 4, 4)
mesh = P4estMesh(trees_per_dimension, polydeg = 3,
coordinates_min = coordinates_min, coordinates_max = coordinates_max,
initial_refinement_level = 1,
periodicity = true)

# A semidiscretization collects data structures and functions for the spatial discretization
semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test,
solver;
boundary_conditions = boundary_condition_periodic)

###############################################################################
# ODE solvers, callbacks etc.

# Create ODE problem with time span from 0.0 to 1.0
# Change `storage_type` to, e.g., `CuArray` to actually run on GPU
tspan = (0.0, 1.0)
ode = semidiscretize(semi, tspan; real_type = nothing, storage_type = nothing)

# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup
# and resets the timers
summary_callback = SummaryCallback()

# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results
analysis_callback = AnalysisCallback(semi, interval = 100)

# The SaveSolutionCallback allows to save the solution to a file in regular intervals
save_solution = SaveSolutionCallback(interval = 100,
solution_variables = cons2prim)

# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step
stepsize_callback = StepsizeCallback(cfl = 1.2)

# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver
callbacks = CallbackSet(summary_callback, analysis_callback,
save_solution, stepsize_callback)

###############################################################################
# run the simulation

# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks
sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false);
dt = 0.05, # solve needs some value here but it will be overwritten by the stepsize_callback
ode_default_options()..., callback = callbacks);
18 changes: 17 additions & 1 deletion ext/TrixiCUDAExt.jl
@@ -1,11 +1,27 @@
# Package extension for adding CUDA-based features to Trixi.jl
module TrixiCUDAExt

import CUDA: CuArray
using CUDA: CUDA, CuArray, CuDeviceArray, KernelAdaptor, @device_override
import Trixi

function Trixi.storage_type(::Type{<:CuArray})
return CuArray
end

function Trixi.unsafe_wrap_or_alloc(::KernelAdaptor, vec, size)
return Trixi.unsafe_wrap_or_alloc(CuDeviceArray, vec, size)
end

function Trixi.unsafe_wrap_or_alloc(::Type{<:CuDeviceArray}, vec::CuDeviceArray, size)
return reshape(vec, size)
end

@static if Trixi._PREFERENCE_LOG == "log_Trixi_NaN"
@device_override Trixi.log(x::Float64) = ccall("extern __nv_log", llvmcall, Cdouble,
(Cdouble,), x)
@device_override Trixi.log(x::Float32) = ccall("extern __nv_logf", llvmcall, Cfloat,
(Cfloat,), x)
# TODO: Trixi.log(x::Float16)
end

end
3 changes: 2 additions & 1 deletion src/Trixi.jl
@@ -60,7 +60,8 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect
using FillArrays: Ones, Zeros
using ForwardDiff: ForwardDiff
using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace
using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend
using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend,
allocate
using LinearMaps: LinearMap
if _PREFERENCE_LOOPVECTORIZATION
using LoopVectorization: LoopVectorization, @turbo, indices