use CAS for threadgroup memory space#60
Conversation
@testset "AtomixMetalExt:test_inc_threadgroup" begin
A = Metal.MtlVector(Float32(1):Float32(3))
metal() do
GC.@preserve A begin
B = MtlThreadGroupArray(Float32, 3)
refB = Atomix.IndexableRef(B, (1,))
pre, post = Atomix.modify!(refB, +, Float32(1))
A[1] = B[1]
A[2] = pre
A[3] = post
end
end
@test collect(A) == Float32[2, 1, 2]
end
@maleadt I'm trying to add this as a test, but it doesn't work and I'm not entirely sure why: caused by: NSError: Compiler encountered an internal error (AGXMetalG16G_B0, code 3)
Stacktrace:
[1] Metal.MTL.MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
@ Metal.MTL ~/.julia/packages/Metal/Pwc41/lib/mtl/compute_pipeline.jl:28
[2] macro expansion
@ ~/.julia/packages/Metal/Pwc41/src/compiler/compilation.jl:187 [inlined]
[3] macro expansion
@ ~/.julia/packages/ObjectiveC/UNTzb/src/os.jl:264 [inlined]
[4] macro expansion
@ ~/.julia/packages/Metal/Pwc41/src/compiler/compilation.jl:182 [inlined]
[5] (::Metal.var"#175#176"{GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, @NamedTuple{ir::String, air::Vector{UInt8}, metallib::Vector{UInt8}, entry::String}})()
@ Metal ~/.julia/packages/ObjectiveC/UNTzb/src/foundation.jl:644
[6] macro expansion
@ ~/.julia/packages/ObjectiveC/UNTzb/src/foundation.jl:572 [inlined]
[7] macro expansion
@ ./lock.jl:273 [inlined]
[8] ObjectiveC.Foundation.NSAutoreleasePool(f::Metal.var"#175#176"{GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, @NamedTuple{ir::String, air::Vector{UInt8}, metallib::Vector{UInt8}, entry::String}})
@ ObjectiveC.Foundation ~/.julia/packages/ObjectiveC/UNTzb/src/foundation.jl:564
[9] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{ir::String, air::Vector{UInt8}, metallib::Vector{UInt8}, entry::String})
@ Metal ~/.julia/packages/ObjectiveC/UNTzb/src/foundation.jl:643
[10] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/execution.jl:270
[11] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/Ecaql/src/execution.jl:159
[12] macro expansion
@ ~/.julia/packages/Metal/Pwc41/src/compiler/execution.jl:189 [inlined]
[13] macro expansion
@ ./lock.jl:273 [inlined]
[14] mtlfunction(f::var"#g#1"{var"#6#7"{MtlDeviceVector{Float32, 1}}}, tt::Type{Tuple{}}; name::Nothing, kwargs::@Kwargs{})
@ Metal ~/.julia/packages/Metal/Pwc41/src/compiler/execution.jl:184
[15] mtlfunction
@ ~/.julia/packages/Metal/Pwc41/src/compiler/execution.jl:182 [inlined]
[16] macro expansion
@ ~/.julia/packages/Metal/Pwc41/src/compiler/execution.jl:85 [inlined] |
|
this is weird, on CI:
|
maleadt
left a comment
There was a problem hiding this comment.
Works fine with JuliaGPU/Metal.jl#640
Co-authored-by: Tim Besard <tim.besard@gmail.com>
Co-authored-by: Tim Besard <tim.besard@gmail.com>
|
this should be ready to go now |
Co-authored-by: Tim Besard <tim.besard@gmail.com>
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## main #60 +/- ##
===========================================
- Coverage 92.95% 52.20% -40.76%
===========================================
Files 6 10 +4
Lines 142 272 +130
===========================================
+ Hits 132 142 +10
- Misses 10 130 +120
Flags with carried forward coverage won't be shown. Click here to find out more. ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
idk why it still fails for my M4, but at least CI is happy? |
|
Are you using the latest Metal.jl? |
seems to pull the latest, because I see: |
|
It crashes on: call float @air.atomic.local.add.f32(float addrspace(3)* bitcast ([12 x i8] addrspace(3)* @threadgroup_memory to float addrspace(3)*), float 1.000000e+00, i32 0, i32 1, i1 true)Can you try removing the Float32 from: https://github.com/JuliaGPU/Metal.jl/blob/9b8c6043c223433a9b601ffc3bf2dcc060017cd9/src/device/intrinsics/atomics.jl#L79-L80 If that works, we'll have to make sure Metal.jl only dispatches to |
|
unfortunately, removing that results in AtomixMetalExt:test_inc | 1 1 0.2s
AtomixMetalExt:test_inc_threadgroup: Error During Test at /Users/akako/Documents/github/Atomix.jl/test/test_atomix_metal.jl:72
Got exception outside of a @test
InvalidIRError: compiling MethodInstance for (::var"#g#1"{var"#6#7"{MtlDeviceVector{Float32, 1}}})() resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to gpu_malloc)
Stacktrace:
[1] malloc
@ ~/.julia/packages/GPUCompiler/Ecaql/src/runtime.jl:85
[2] gc_pool_alloc
@ ~/.julia/packages/GPUCompiler/Ecaql/src/runtime.jl:116
[3] modify!
@ ~/.julia/packages/Atomix/UFuJD/ext/AtomixMetalExt.jl:38
[4] modify!
@ ~/.julia/packages/Atomix/UFuJD/src/generic.jl:120
[5] #6
@ ~/Documents/github/Atomix.jl/test/test_atomix_metal.jl:79
[6] g
@ ~/Documents/github/Atomix.jl/test/test_atomix_metal.jl:12
Reason: unsupported dynamic function invocation (call to atomic_fetch_add_explicit)
Stacktrace:
[1] modify!
@ ~/.julia/packages/Atomix/UFuJD/ext/AtomixMetalExt.jl:38
[2] modify!
@ ~/.julia/packages/Atomix/UFuJD/src/generic.jl:120
[3] #6
@ ~/Documents/github/Atomix.jl/test/test_atomix_metal.jl:79
[4] g
@ ~/Documents/github/Atomix.jl/test/test_atomix_metal.jl:12
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erroneous code with Cthulhu.jl |
|
I also don't understand why it seemingly works on older Apple M chips (CI?), not M4 |
|
Ah right, that's because Metal.jl doesn't have
I don't think this is related to M4; your code is somehow still dispatching to an |
|
I'm getting the error by just running tests of this repo locally |
|
That's... interesting. Still, can you look at the debug info where the intrinsic comes from? |
|
idk why, but it works now... (@gpu) pkg> st
Status `~/.julia/environments/gpu/Project.toml`
[a9b6321e] Atomix v1.1.1 `../../../Documents/github/Atomix.jl`
[dde4c033] Metal v1.7.0
using Test
using Metal, Atomix
using Atomix
using Atomix.Internal: referenceable
using Atomix: @atomic, @atomicreplace, @atomicswap
function metal(f)
function g()
f()
nothing
end
Metal.@metal g()
end
@testset "" begin
A = Metal.MtlVector(Float32(1):Float32(3))
metal() do
GC.@preserve A begin
B = Metal.MtlThreadGroupArray(Float32, 3)
B[1] = A[1]
refB = Atomix.IndexableRef(B, (1,))
pre, post = Atomix.modify!(refB, +, Float32(1))
A[1] = B[1]
A[2] = pre
A[3] = post
end
end
@test collect(A) == Float32[2, 1, 2]
end
> julia --startup-file=no --project=@gpu test.jl
Test Summary: | Pass Total Time
| 1 1 7.0s |
|
So running tests works fine? Maybe you had a |
yes, the Manifest.toml was the problem. But now I'm confused why was this not enough: diff --git a/test/runtests.jl b/test/runtests.jl
index 80f8d1f..e3ad3fd 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -139,7 +139,8 @@ end
# julia> Pkg.test("Atomix", test_args=["--Metal"])
if "--Metal" in ARGS
import Pkg
- Pkg.add("Metal")
+ Pkg.develop(path="/Users/akako/Documents/github/Metal.jl")
+ Pkg.update()
include("test_atomix_metal.jl")
elseif "--CUDA" in ARGS
import PkgI had to delete the |
|
I'm not sure. In any case, the compat of Metal.jl should probably be bumped to v1.7.0, which incidentally should have caught the above issue. |
cb803c5 to
7462933
Compare
|
@maleadt true, ok now ready to merge I think |
fix #59
depends on JuliaGPU/Metal.jl#636