[libc][math] Refactor fma implementation to header-only in src/__support/math folder.#163968
Merged
bassiounix merged 3 commits intomainfrom Mar 17, 2026
Conversation
This was referenced Oct 17, 2025
Merged
Merged
Merged
This was referenced Oct 17, 2025
Merged
Merged
Merged
Merged
This was referenced Oct 17, 2025
Merged
Merged
Merged
This was referenced Oct 17, 2025
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
[libc][math] Refactor exp10m1f16 implementation to header-only in src/__support/math folder.
#161119
Merged
This was referenced Oct 17, 2025
Merged
Member
|
@llvm/pr-subscribers-libc Author: Muhammad Bassiouni (bassiounix) Changes: Part of #147386, in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450 Full diff: https://github.com/llvm/llvm-project/pull/163968.diff 9 Files Affected:
diff --git a/libc/shared/math.h b/libc/shared/math.h
index 874c2c0779adb..79ba2ea5aa6ff 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -57,6 +57,7 @@
#include "math/expm1.h"
#include "math/expm1f.h"
#include "math/expm1f16.h"
+#include "math/fma.h"
#include "math/frexpf.h"
#include "math/frexpf128.h"
#include "math/frexpf16.h"
diff --git a/libc/shared/math/fma.h b/libc/shared/math/fma.h
new file mode 100644
index 0000000000000..82f1dac61dda2
--- /dev/null
+++ b/libc/shared/math/fma.h
@@ -0,0 +1,23 @@
+//===-- Shared fma function -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_FMA_H
+#define LLVM_LIBC_SHARED_MATH_FMA_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/fma.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::fma;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_FMA_H
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 0cae228d7f10f..1911481d0649e 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -593,6 +593,14 @@ add_header_library(
libc.src.__support.math.exp10_float16_constants
)
+add_header_library(
+ fma
+ HDRS
+ fma.h
+ DEPENDS
+ libc.src.__support.FPUtil.fma
+)
+
add_header_library(
frexpf128
HDRS
diff --git a/libc/src/__support/math/fma.h b/libc/src/__support/math/fma.h
new file mode 100644
index 0000000000000..d996610167a19
--- /dev/null
+++ b/libc/src/__support/math/fma.h
@@ -0,0 +1,27 @@
+//===-- Implementation header for fma ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_FMA_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_FMA_H
+
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static double fma(double x, double y, double z) {
+ return fputil::fma<double>(x, y, z);
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_FMA_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index b13da9770dc69..7103c6947eba0 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -4722,7 +4722,7 @@ add_entrypoint_object(
HDRS
../fma.h
DEPENDS
- libc.src.__support.FPUtil.fma
+ libc.src.__support.math.fma
)
add_entrypoint_object(
diff --git a/libc/src/math/generic/fma.cpp b/libc/src/math/generic/fma.cpp
index 2ea4ae9961150..3ccdb78846e34 100644
--- a/libc/src/math/generic/fma.cpp
+++ b/libc/src/math/generic/fma.cpp
@@ -7,15 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/fma.h"
-#include "src/__support/common.h"
-
-#include "src/__support/FPUtil/FMA.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/fma.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(double, fma, (double x, double y, double z)) {
- return fputil::fma<double>(x, y, z);
+ return math::fma(x, y, z);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index bfcac7884e646..cd4b5ec75f876 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -53,6 +53,7 @@ add_fp_unittest(
libc.src.__support.math.exp10f16
libc.src.__support.math.expf
libc.src.__support.math.expf16
+ libc.src.__support.math.fma
libc.src.__support.math.frexpf
libc.src.__support.math.frexpf128
libc.src.__support.math.frexpf16
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 3369cb5e2cf03..7357e24603004 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -90,6 +90,7 @@ TEST(LlvmLibcSharedMathTest, AllDouble) {
EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp2(0.0));
EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp10(0.0));
EXPECT_FP_EQ(0x0p+0, LIBC_NAMESPACE::shared::expm1(0.0));
+ EXPECT_FP_EQ(0x0p+0, LIBC_NAMESPACE::shared::fma(0.0, 0.0, 0.0));
}
#ifdef LIBC_TYPES_HAS_FLOAT128
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 7c98fc7d53796..1902b43216a7c 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2791,6 +2791,14 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_fma",
+ hdrs = ["src/__support/math/fma.h"],
+ deps = [
+ ":__support_fputil_fma",
+ ],
+)
+
libc_support_library(
name = "__support_math_frexpf128",
hdrs = ["src/__support/math/frexpf128.h"],
@@ -3093,15 +3101,15 @@ libc_support_library(
name = "__support_math_expm1f16",
hdrs = ["src/__support/math/expm1f16.h"],
deps = [
+ ":__support_fputil_except_value_utils",
":__support_fputil_fma",
":__support_fputil_multiply_add",
":__support_fputil_nearest_integer",
":__support_fputil_polyeval",
":__support_fputil_rounding_mode",
- ":__support_fputil_except_value_utils",
":__support_macros_optimization",
":__support_macros_properties_cpu_features",
- ":__support_math_expxf16_utils"
+ ":__support_math_expxf16_utils",
],
)
@@ -4001,7 +4009,7 @@ libc_math_function(name = "floorf16")
libc_math_function(
name = "fma",
additional_deps = [
- ":__support_fputil_fma",
+ ":__support_math_fma",
],
)
|
2c713ce to
e13d6a0
Compare
e13d6a0 to
0f507ad
Compare
55adc0c to
8ec6dc1
Compare
0f507ad to
b9a0a45
Compare
8ec6dc1 to
cc33f16
Compare
b9a0a45 to
b50aa26
Compare
cc33f16 to
7313817
Compare
b50aa26 to
e2705c2
Compare
Base automatically changed from
users/bassiounix/spr/10-06-_libc_math_refactor_expm1f16_implementation_to_header-only_in_src___support_math_folder
to
main
January 10, 2026 02:15
7313817 to
3c6121a
Compare
lntue
approved these changes
Jan 12, 2026
3c6121a to
bfcc555
Compare
27172d7 to
2c0657c
Compare
Member
Author
Merge activity
|
neonetizen
pushed a commit
to neonetizen/llvm-project
that referenced
this pull request
Mar 17, 2026
alowqie
pushed a commit
to alowqie/llvm-project
that referenced
this pull request
Mar 18, 2026
boomanaiden154
added a commit
to boomanaiden154/llvm-project
that referenced
this pull request
Mar 18, 2026
* [libc++] Add scripts defining two LNT runners for libc++ (#187050)
* [clang] DeducedTypes deduction kind fix and improvement (#186727)
This is a small refactor of how DeducedType and its derived types are
represented.
The different deduction kinds are spelled out in an enum, and how this
is tracked is simplified, to allow easier profiling.
How these types are constructed and canonicalized is also brought more
in line with how it works for the other types.
This fixes a crash reported here:
https://github.com/llvm/llvm-project/issues/167513#issuecomment-3692962115
* [X86] Fix fcmp+select to min/max lowering (#185594)
This does a few changes that are hard to separate from each other:
* Consider forming minnum/maxnum from setcc+select non-profitable. X86
has instructions specifically for the setcc+select pattern. (Without
this it's hard to get good coverage on this code path.)
* Reduce duplication in the code for forming FMIN/FMAX, by using
predicate inversion (to make setcc and select operand order match) and
predicate invswap (to canonicalize to ordered predicates). This leaves
us with just ordered and NaN-less predicates.
* For non-strict non-less predicates, convert them to strict ones via
invswap (i.e. swapping the operands of both the setcc and select).
Previously this just treated them the same as strict predicates, but I
believe that's incorrect in terms of signed zero handling.
* [InstCombine] Recognize non-negative subtraction patterns (#182597)
Alive2 proofs:
smin pattern: https://alive2.llvm.org/ce/z/-E2Tpc
* [gn] port 55b271ddc1fd968
* [NewPM] Port for AArch64ConditionOptimizer (#186941)
Adds a newPM pass for AArch64ConditionOptimizer.
- Refactors base logic into an Impl class
- Renames old pass with the "Legacy" suffix
- Adds the new pass manager pass using refactored logic
- Updates tests
Context and motivation in
https://llvm.org/docs/NewPassManager.html#status-of-the-new-and-legacy-pass-managers
* [CycleInfo] Support forward declarations (#187029)
Use a class instead of an alias, so that CycleInfo can be
forward-declared.
We can't do the same for Cycle without further changes (a LoopInfo like
CRTP scheme).
* [lldb] Add additional logging to wait_for_file_on_target (#186915)
Occasionally wait_for_file_on_target will time out on the Green Dragon
bots and we're not sure why. I'm adding this logging in an attempt to
get more clues as to what's happening when it fails.
* [AMDGPU] Standardize on using AMDGPU::getNullPointerValue. NFC. (#187037)
AMDGPUTargetMachine also had a static method which did the same thing.
Remove it so that we have a single source of truth.
* [flang] Fix ignore_tkr(c) passing descriptor instead of base address for non-descriptor dummies (#186894)
When ignore_tkr(c) is set and the actual argument is an allocatable or
pointer (stored as a descriptor), the lowering code was unconditionally
returning the descriptor pointer as-is, regardless of whether the dummy
argument expects a descriptor. For bind(c) interfaces with assumed-size
dummies (e.g., cuFFT), the dummy expects a raw pointer, not a
descriptor. Passing the descriptor caused the C function to receive the
wrong address, leading to silent data corruption and invalid descriptor
crashes at deallocation.
The fix adds a check that the early return for ignore_tkr(c) only
applies when the dummy type is itself a descriptor type. When the dummy
expects a base address, the normal path is taken, which correctly
extracts the base address from the descriptor via fir.box_addr.
* [lldb] Fix use-after-free in CommandInterpreter (#187032)
The variable `matches` may be assigned the address of block-scope
`local_matches`, which is defined in a scope strictly smaller than the
scope of `matches`. Towards the end of the function, after
`local_matches` has been destroyed, `matches` is accessed, possibly
triggering a use-after-free.
* [flang] Lower anint with math.round (#186039)
Use `math.round` in lowering of `anint` so we can use passes like
`MathToNVVM` to target device code differently.
* Reland Support float8_e3m4 and float8_e4m3 in np_to_memref (#186453) (#186833)
This patch adds support for `float8_e3m4` and `float8_e4m3` in
`np_to_memref.py` by adding the appropriate ctypes structures.
Additionally changes minimum numpy version to 2.1.0 and uses a single
ml_dtypes version of 0.5.0.
* [mlir][Interfaces][NFC] Add early exit to MakeRegionBranchOpSuccessorInputsDead (#186325)
Optimize MakeRegionBranchOpSuccessorInputsDead patterns in
`ControlFlowInterfaces.cpp`:
- Add early exit to `computeReachableValuesFromSuccessorInput` when the
caller only needs to know if there is exactly one reachable value,
avoiding unnecessary traversal.
Assisted-by: Claude Code
Co-authored-by: Yang Bai <yangb@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* [SandboxVec][BottomUpVec] Fix crash caused by Cmps with different operand types (#186550)
This patch disables vectorizing Cmps with different operand types
because we can't form a legal vector.
This used to cause an assertion check crash once we attempted to pack
the bundle formed by Cmp's operands.
* [flang][OpenMP][CUDA] Set allocator_idx on privatized allocatable device array descriptors (#186945)
When an allocatable CUDA Fortran device array is privatized in an OpenMP
region, the null descriptor created in the omp.private init region was
missing the allocator_idx attribute. This caused a subsequent allocate()
inside the parallel region to call malloc instead of cudaMalloc, because
the runtime's Descriptor::Allocate() reads allocator_idx from the
descriptor to select the allocator. On some systems it caused
cudaErrorInvalidValue crashes.
This patch sets allocator_idx = 2 (device allocator) on the null
fir.embox in handleNullAllocatable() when the symbol has a CUDA device
attribute, so that the Fortran runtime correctly calls cudaMalloc for
the privatized array.
* [mlir][bytecode] Use getChecked<T>() in bytecode reading to avoid crashes (#186145)
When the bytecode type callback (test-kind=2) calls iface->readType()
for every builtin type, complex types like MemRefType could crash
because the generated reading code used get<T>() which asserts on
invalid parameters, rather than getChecked<T>() which returns null
gracefully.
This change:
- Adds a getChecked<T>() free function helper in
BytecodeImplementation.h that calls T::getChecked(emitError, params)
(no-context form) when a specific override exists, otherwise falls back
to get<T>(). The with-context second branch is intentionally omitted to
avoid instantiating StorageUserBase::getChecked<Args> for types that
only inherit the base template (e.g. ArrayAttr), which would require
complete storage types unavailable in the bytecode reading TU.
- Updates BytecodeBase.td default cBuilder for
DialectAttribute/DialectType to use getChecked<> instead of get<>.
- Updates all custom cBuilder strings in BuiltinDialectBytecode.td.
- Updates the no-args codegen case in BytecodeDialectGen.cpp.
- Adds a regression test in bytecode_callback_with_custom_type.mlir.
Fixes #128308
Assisted-by: Claude Code
* [X86] getMaskNode - perform pre-truncation of oversized scalar mask sources (#187063)
Allows us to use getMaskNode to canonicalize predicate masks in big shift lowering
* [CIR] Split BinOpOverflowOp into separate overflow-checked ops (#186653)
Replace the monolithic cir.binop.overflow operation and its
BinOpOverflowKind enum with three individual operations:
cir.add.overflow, cir.sub.overflow, and cir.mul.overflow.
This follows the same pattern used when BinOp and UnaryOp were
previously split into per-operation ops (cir.add, cir.sub, etc.),
eliminating enum dispatch and enabling per-op traits like Commutative.
* [green-dragon] fix Python and Swig flags (#187052)
* Removed Hardcoded SM Number from Mlir Test (#186917)
This MR removes a hard-coded compute number in an MLIR test. This will
allow the test to not need to be updated in the future. The default
value will come from `NVVMOps.td`.
* Add hybrid function ordering support (#186003)
Allow `--function-order` to be combined with `--reorder-functions`
algorithms. Functions listed in the order file are pinned first
(indices 0..N-1), then the selected algorithm orders remaining
functions starting at index N.
* [mlir][spirv] Fix struct.mlir for stricter spirv-val variable-pointer rules (#186974)
Update `mlir/test/Target/SPIRV/struct.mlir` so it remains valid under
current SPIR-V validator checks in Logical addressing mode.
The recursive struct cases were emitting pointer-allocating globals in
storage classes rejected by `spirv-val`. Adjust those globals to
`Private` while keeping recursive member pointers in `StorageBuffer`,
and update the expected roundtrip types accordingly.
Also add the missing variable-pointer requirements to the module VCE:
- capability: `VariablePointers`
- extension: `SPV_KHR_variable_pointers`
Signed-off-by: Davide Grohmann <davide.grohmann@arm.com>
* Revert "[libc] Avoid host header collisions in full builds (-nostdinc)" (#187079)
Reverts llvm/llvm-project#187025
Fails on openmp bot:
https://lab.llvm.org/buildbot/#/builders/10/builds/24743
('INT64_MIN' macro redefined when the Clang-provided <stdint.h> is
used)
fails on RISC-V-32 bot:
https://lab.llvm.org/buildbot/#/builders/196/builds/17067
due to MPFRNumber constructor not picking the right overload for
uint32_t argument.
* [libc] Refactor core Linux syscalls to use syscall_wrappers (#185983)
This patch initiates the refactoring of Linux syscalls as described in
the RFC (https://discourse.llvm.org/t/rfc-linux-syscall-cleanup/87248/).
It introduces a new infrastructure in
`src/__support/OSUtil/linux/syscall_wrappers/` to house header-only
syscall wrappers. These wrappers utilize `ErrorOr` to provide a
consistent, type-safe interface for error handling across the library,
standardizing how syscall return values are converted into
errno-compatible Error objects.
Summary of changes:
- Created the `syscall_wrappers` directory and added `close.h`,
`read.h`, `write.h`, and `open.h`.
- Moved the existing `getrandom.h` into the new `syscall_wrappers`
directory and updated its callers (including HashTable/randomness.h).
- Refactored core entrypoints (`close`, `read`, `write`, `open`) to use
the new wrappers, removing direct `syscall_impl` logic and manual errno
setting.
- Updated `shm_open.cpp` to use the new `open` wrapper.
- Cleaned up `OSUtil/linux/fcntl.cpp` by removing redundant internal
implementations of `open` and `close`.
- Added a developer guide in `docs/dev/syscall_wrapper_refactor.rst`
outlining the established pattern for future migrations.
---------
Co-authored-by: Michael Jones <michaelrj@google.com>
* [Clang] Add __ob_trap support for implicit integer sign change (#185772)
The `__ob_trap` type specifier can be used to trap (or warn with sanitizers) when overflow or truncation occurs on the specified type.
There was a gap in coverage for this with the `-fsanitize=implicit-integer-sign-change` sanitizer. Fix this by carrying around `__ob_trap` information through `EmitIntegerSignChange()` which allows us to properly trap or warn.
* [mlir][spirv] Add spirv.GroupNonUniformBroadcastFirst Op (#185818)
Add `spirv.GroupNonUniformBroadcastFirst` op and tests.
* [lldb] Fix build on Linux when SEGV_PKUERR is undefined (#186963)
build logs refs:
https://github.com/valord577/nativepkgs/actions/runs/22346192467/job/64661318198
* [Bazel] Fixes ebb3309 (#187090)
This fixes ebb3309975c8e49096d8295a368c93c684bf10f1.
* [mlir][GPU] Refactor, improve constant size information handling (#186907)
1. There was duplicate code between the integer range analysis's
handling of static dimension size information (ex. gpu.known_block_dim
attributes) and the handling during the lowering of those operations.
The code from integer range analysis was given a dialect-wide entry
point (and had its types fixed to be more accurate), which the lowering
templates now call.
2. The templated lowering for block/grid/cluster_dim now produces
precise ranges (indicating the constant value) where one is known, and
the lowerings in rocdl (including those for subgroup_id) have been fixed
appropriately.
3. While I was here, the gpu.dimension enum has been moved to GPUBase so
it lives next to the other enums.
4. The pattern that expands subgroup_id operations now adds any thread
dimension bounds it finds in context.
(Claude was used for an initial round of review, I did the main coding
myself.)
---------
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
* Revert "[LoopUnroll] Remove `computeUnrollCount()`'s return value " (#187035)
Reverts llvm/llvm-project#184529 due to
https://github.com/llvm/llvm-project/pull/184529#issuecomment-4074393657
* [DirectX] Fix assertion in PointerTypeAnalysis with empty global_ctors (#179034)
When `llvm.global_ctors` has no elements (e.g., when all resources are
unused in a shader library), its initializer is a `zeroinitializer`
(`ConstantAggregateZero`) rather than a `ConstantArray`. The previous
code used `cast<ConstantArray>` which asserts on incompatible types:
> "cast<Ty>() argument of incompatible type!"
This patch uses `dyn_cast` and returns early if the initializer is not a
`ConstantArray`, handling the edge case gracefully.
Fixes #178993.
Co-authored-by: Kaitlin Peng <kaitlinpeng@microsoft.com>
* [bazel] NFC: reformat mlir & libc bazel files (#187094)
* [z/OS] Recognize EBCDIC archive magic (#186854)
`z/OS` archives use the same structural layout as traditional Unix
archives but encode all text fields in EBCDIC. The magic string is the
EBCDIC representation of `"!<arch>\n"` (hex: `5A 4C 81 99 83 88 6E 15`).
This patch adds recognition of the `z/OS` archive magic to
`identify_magic()` and defines the `ZOSArchiveMagic` constant. This is
the first in a series of patches adding `z/OS` archive support to LLVM.
* [libc][math] Refactor fma implementation to header-only in src/__support/math folder. (#163968)
Part of #147386
in preparation for:
https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450
* [mlir][GPU] Set nsw/nuw when expanding out subgroup ID (#187099)
There's no world where the subgroup ID (or the intermediate values
needed to compute it) will be negative or will have signed overflow.
This commit adds flags accordingly, which is helpful as this is a rather
low-level rewrite that might run after the analyses that would
ordinarily add these flags.
* [libc][math] Refactor fmaf implementation to header-only in src/__support/math folder. (#163970)
Part of #147386
in preparation for:
https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450
* [libc][math] Refactor fmaf16 implementation to header-only in src/__support/math folder. (#163977)
* [SLP][NFC] Refactor BinOpSameOpcodeHelper BIT enum (#187067)
More readable syntax and increase type width to avoid silent errors if
we reach 17 members.
* [clang-format] Fix Macros configuration not working with try/catch expansions (#184891)
This is a superseding followup to my previous PR,
https://github.com/llvm/llvm-project/pull/183352.
In my previous PR, I proposed adding TryMacros and CatchMacros
configuration options, similar in spirit to IfMacros and ForEachMacros.
I did so because I noticed that configuration like
`Macros=["TRY_MACRO=try", "CATCH_MACRO(e)=catch(e)"]` did not format
configured macro(s) as try/catch blocks. @owenca confirmed in my
previous PR that this observed behavior is undesired, and we should
prefer to fix it rather than introduce new features.
This PR proposes a fix, described in detail in the commit message below
the break. In general terms, it deletes a heuristic from the lexing
phase, where it interacted poorly with the Macros option, and moves its
functionality to the parsing phase instead.
I describe a possibly cleaner fix in [a comment
here](https://github.com/llvm/llvm-project/pull/183352#issuecomment-3992773126),
but it has the disadvantage of unintended behavior changes for
Objective-C code using `try` as an identifier. The fix in this PR avoids
that unintended behavior change; the only behavior change is the bugfix
itself.
cc @HazardyKnusperkeks as previous reviewer, and @mydeveloperday as the
heuristic's original author.
---
The lexer heuristic `tryTransformTryUsageForC()` was intended to allow C
code to use `try` as an identifier by demoting `tok::kw_try` to
`tok::identifier` when the following token did not look like the start
of a try body. However, when `MacroExpander::parseDefinition()` lexed a
macro like `"TRY_MACRO=try"`, the `try` token in the expansion body was
followed by `eof`, triggering the heuristic. This caused `try` to be
demoted to an identifier in the macro definition, so expanded code was
never parsed as a try/catch statement.
Delete `tryTransformTryUsageForC()` and instead guard the two `case
tok::kw_try:` dispatch sites in
`UnwrappedLineParser::parseStructuralElement()`. The guard is restricted
to `Style.isCpp()` (which covers C, C++, and Objective-C) and checks
whether the next non-comment token is `{`, `:`, or `#` -- the tokens
that can legitimately begin a try body or precede one.
The old heuristic also had to check for a preceding `@` token to avoid
demoting `try` in Objective-C `@try` constructs. The parser-level guard
does not need this check because `@try` is routed through `case tok::at`
and dispatched via `getObjCKeywordID()` to `tok::objc_try`, which calls
`parseTryCatch()` directly. The bare `kw_try` token never reaches `case
tok::kw_try` when parsing `@try`.
Assisted-by: Claude (anthropic.com)
* [libc][math] Fix fma bazel build (#187107)
* [flang][OpenMP] Remove unused function declaration, NFC (#187101)
The function `GetNumGeneratedNestsFrom` has been removed, but repeated
local rebases stubbornly inserted the declaration back in.
* [libc]: implement 'iswpunct' entrypoint (#186968)
Added entrypoints:
- baremetal/arm
- baremetal/aarch64
- baremetal/riscv
- darwin/aarch64
- linux/aarch64
- linux/arm
- linux/riscv
- linux/x86_64
- windows
Also added the unit test for iswpunct.
Part of the issue: #185136
* [libc][math] Fix bazel build for fmaf16 (#187111)
* [mlir][llvmir] Fix crash when a CallSiteLoc has a UnknownLoc callee (#186860)
Avoids reading a null StringAttr when no file name is present by
manufacturing a default instead.
* [clang][Driver][SPIRV] Fix assertion when using -emit-llvm (#186824)
In the failing case we are in the link phase with `-emit-llvm` passed,
which means we are going to call `llvm-link` so all inputs are expected
to be `.bc` files, and linker options aren't supported as we aren't
calling a real linker.
I can't imagine anyone wants to pass arguments to `llvm-link`. Just drop
them and warn instead of asserting.
Closes: https://github.com/llvm/llvm-project/issues/186598
Signed-off-by: Nick Sarnie <nick.sarnie@intel.com>
* AMDGPU/GlobalISel: RegBankLegalize rules for wave_reduce_umax/umin (#186528)
* AMDGPU/GlobalISel: RegBankLegalize rules for bswap, cvt_ubyte, rcp (#187093)
* [AMDGPU] fold a call to implicitarg.ptr to a poison with no-implicitarg-ptr (#186925)
When a caller function with `amdgpu-no-implicitarg-ptr` calls
`llvm.amdgcn.implicitarg.ptr`, a poison value is returned.
* [CIR] Fix missing RegionBranchTerminatorOpInterface declarations (#187112)
After https://github.com/llvm/llvm-project/pull/186832, operations with
RegionBranchTerminatorOpInterface need to declare
`getMutableSuccessorOperands`.
* [spirv][mlir] Add myself to CODEOWNERS (#187115)
* [AMDGPU][GlobalISel] Switch tests to new reg-bank-select and refresh checks (#186506)
Update AMDGPU GlobalISel tests to use -new-reg-bank-select. These tests
can be updated due to the existing implementation of legalization rules
for G_TRUNC.
* [flang][OpenMP][CUDA] Place privatized device allocatable descriptors in managed memory (#187114)
When an OpenMP private clause privatizes a CUDA Fortran allocatable
device array, the Fortran descriptor for the private copy must be
accessible from both the host and the GPU. Without this change, the
descriptor lives on the host stack (via the OpenMP runtime's
CreateAlloca), which a CUF kernel running on the GPU cannot
dereference—resulting in cudaErrorIllegalAddress.
This patch modifies the omp.private init/dealloc region generation in
PrivateReductionUtils.cpp with three changes:
1. Allocate the descriptor in managed memory
2. Set allocator_idx = 2 on the null fir.embox
3. Free the managed descriptor
Source example:
```
real(8), device, allocatable :: adev(:)
!$omp parallel private(adev)
allocate(adev(10))
!$cuf kernel do <<<*,*>>>
do i = 1, 10
adev(i) = 1.0d0
end do
deallocate(adev)
!$omp end parallel
```
IR before this change:
```
omp.private {type = private} @... : !fir.box<!fir.heap<!fir.array<?xf64>>> init {
^bb0(%arg0: !fir.ref<!fir.box<...>>, %arg1: !fir.ref<!fir.box<...>>):
...
fir.if %3 {
%5 = fir.embox %1(%4) : (...) -> !fir.box<...> // no allocator_idx
fir.store %5 to %arg1 // host-stack alloca
}
omp.yield(%arg1 : ...) // yields host alloca
} dealloc {
...
fir.if %3 { fir.freemem %1 }
omp.yield // no cuf.free
}
```
IR after this change:
```
omp.private {type = private} @... : !fir.box<!fir.heap<!fir.array<?xf64>>> init {
^bb0(%arg0: !fir.ref<!fir.box<...>>, %arg1: !fir.ref<!fir.box<...>>):
%0 = cuf.alloc !fir.box<...> {data_attr = #cuf.cuda<device>} // managed memory
...
fir.if %4 {
%6 = fir.embox %2(%5) {allocator_idx = 2 : i32} // cudaMalloc
fir.store %6 to %0 // managed descriptor
}
omp.yield(%0 : ...) // yields managed ptr
} dealloc {
...
fir.if %3 { fir.freemem %1 }
cuf.free %arg0 : ... {data_attr = #cuf.cuda<device>} // free managed desc
omp.yield
}
```
---------
Co-authored-by: Valentin Clement (バレンタイン クレメン) <clementval@gmail.com>
* [lldb] Upstream arm64e support in ValueObject (#186906)
In #186001, I said the last large chunk of downstream PtrAuth code in
LLDB was the expression evaluator support. However, that wasn't
accurate, as we also have changes to thread this through ValueObject.
* [NVPTX] Fix scoped atomic when given runtime values (#185883)
Summary:
The `__scoped_` atomic builtins will expand to each supported scope in
the case of a runtime value. There are two problems:
1. Singlethreaded scope doesn't exist -> treat as no-op
2. Cluster is not supported on all targets -> Widen to device scope
This is exactly how the AMDGPU backend handles it.
---------
Co-authored-by: gonzalobg <65027571+gonzalobg@users.noreply.github.com>
* [dsymutil] Fall back to compatible triple in BinaryHolder (#186893)
When dsymutil can't find an exact match in its BinaryHolder, fall back
to a compatible triple instead of erroring out completely.
rdar://171676213
* AMDGPU/GlobalISel: RegBankLegalize rules for s_setreg (#186872)
* [AMDGPU][NFC] Remove kernarg_segment_ptr regbankselect test (#186029)
The legalizer eliminates amdgcn_kernarg_segment_ptr before it reaches
RegBankSelect and there is already a pre-existing
llvm.amdgcn.kernarg.segment.ptr.ll test to test end to end
functionality.
* [ROCDL] Align mfma op description examples with the actual op (#186949)
Improves the example used in ROCDL mfma op descriptions to be more
relevant to its associated op. It is currently a random list of mfma
instructions which are replaced with a single example of the correct
intrinsic.
The description of `mfma.f32.16x16x16bf16.1k` previously:
```
Matrix fused multiply-add (MFMA) intrinsic. Computes `D = A * B + C`
with matrix operands. The `cbsz`, `abid`, and `blgp` attributes control
broadcast and block layout modes.
Example:
'''mlir
// MFMA with f32 inputs and 32-wide f32 accumulator.
%r0 = rocdl.mfma.f32.32x32x1f32 %a0, %b0, %c0, 0, 0, 0 :
(f32, f32, vector<32xf32>) -> vector<32xf32>
// MFMA with i8 inputs and 32-wide i32 accumulator.
%r1 = rocdl.mfma.i32.32x32x4i8 %a1, %a1, %c1, 0, 0, 0 :
(i32, i32, vector<32xi32>) -> vector<32xi32>
// MFMA with bf16 inputs and 32-wide f32 accumulator.
%r2 = rocdl.mfma.f32.32x32x2bf16 %a2, %a2, %c0, 0, 0, 0 :
(vector<2xi16>, vector<2xi16>, vector<32xf32>) -> vector<32xf32>
'''
```
The description of `mfma.f32.16x16x16bf16.1k` now:
```
Matrix fused multiply-add (MFMA) intrinsic. Computes `D = A * B + C`
with matrix operands. The `cbsz`, `abid`, and `blgp` attributes control
broadcast and block layout modes.
Example:
'''mlir
%r0 = mfma.f32.16x16x16bf16.1k %a0, %b0, %c0, 0, 0, 0 : (vector<4xi16>, vector<4xi16>, vector<4xf32>) -> vector<4xf32>
'''
```
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
* [libc] Add a smaller b36_char_to_int (#180841)
For ASCII systems, b36_char_to_int gets compiled into a jump table. That
jump table ends up being pretty large because it covers the range from
'0' (48) to 'z' (122). On size-constrained systems that can assume
ASCII, this patch provides a new flag: LIBC_CONF_CTYPE_SMALLER_ASCII
that forces a smaller implementation that doesn't compile into a jump
table.
* Reapply "[clang][ssaf] Add UnsafeBufferUsage summary extractor for functions (#182941)" (#186899)
This reverts commit 53739c75a8720aaef8032628267ed4fd050af038.
Reapply after module dependency issues are resolved.
(rdar://169191570)
* [orc-rt] Add ExecutorProcessInfo APIs. (#187147)
ExecutorProcessInfo provides information about the process in which the
ORC runtime is running.
* [mlir][spirv] Add comparison and elementwise ternary ops in TOSA Ext Inst Set (#186356)
This patch introduces the following comparison and elementwise ternary
operators:
spirv.Tosa.Select
spirv.Tosa.Equal
spirv.Tosa.Greater
spirv.Tosa.GreaterEqual
Also dialect and serialization round-trip tests have been added.
Signed-off-by: Davide Grohmann <davide.grohmann@arm.com>
* [IR2Vec] Remove redundant death test for invalid TypeID (#187143)
The `NumericIDMapInvalidInputs` test has two `EXPECT_DEATH` calls for
invalid `Type::TypeID` values. The second value (`MaxTypeIDs + 10`) is
redundant: `getIndex` has a one-sided assert, so any value greater or
equal to `MaxTypeIDs` takes the same code path as the value already
tested by the first call.
The test began failing after
https://github.com/llvm/llvm-project/pull/186888, which introduced
`ByteTyID`. The value of `MaxTypeIDs + 10` became 32, which falls
outside the representable range of the `enum`, making the cast UB. To
avoid accidentally triggering this test when adding new types, it should
probably be removed.
* [orc-rt] Require non-empty triples and power-of-two page sizes. (#187151)
* [orc-rt] Add an ExecutorProcessInfo field to Session. (#187155)
This will provide a central location for ORC runtime code (and ORC
runtime API clients) to find executor process information.
* [lldb][Module] Don't try to locate scripting resources without a ScriptInterpreter (#187091)
I'm in the process of moving `SanitizedScriptingModuleName` into
`ScriptInterpreter` as a `virtual` API. The nullptr check inside the
constructor made that more difficult because it implied we may not have
a `ScriptInterpreter` available to call the sanitization API on. Really
the `nullptr` check is redundant because even if we successfully sanitize
and then locate some scripts, `Module::LoadScriptingResourceInTarget`
bails out if we don't have a `ScriptInterpreter`.
This patch moves the early exit in `LoadScriptingResourceInTarget` to
before we make the call to `LocateExecutableScriptingResources`. That
way we ensure we never get to `SanitizedScriptingModuleName` without a
valid `ScriptInterpreter`.
* [CIR] Add handling for nounwind attributes (#187096)
This adds support for setting the `nounwind` attribute on declarations
and call sites in CIR.
Currently, we have both `nothrow` and `nounwind` in CIR. I've chosen to
use `nothrow` in this PR because it was the most common. I plan to
combine them, using `nounwind` everywhere since that's the LLVM IR
spelling, but that's more invasive, so I'd like to defer it to a future
change.
* [CIR] Implement abstract conditional operator handling for aggregates (#186284)
This implements AggExprEmitter::VisitAbstractConditionalOperator for
CIR.
* [LV] Replace remaining LogicalAnd to vp.merge in EVL optimization. (#184068)
This patch replaces the remaining LogicalAnd with vp.merge in the second
pass to not break the `m_RemoveMask` pattern in the optimizeMaskToEVL.
This can help to remove header mask for FindLast reduction (CSA) loops.
PR: https://github.com/llvm/llvm-project/pull/184068
* [orc-rt] Rename ControllerInterface to SimpleSymbolTable. NFCI. (#187164)
This type will be used in more contexts than just the controller interface.
* Revert "[LV] Replace remaining LogicalAnd to vp.merge in EVL optimization." (#187170)
Reverts llvm/llvm-project#184068
This hit the cost model assertion in rva23 stage2 build bot.
https://lab.llvm.org/buildbot/#/builders/213/builds/2497
* [NVPTX][AutoUpgrade] atom.load intrinsics should be autoupgraded to monotonic atomicrmw for NVPTX (#187140)
Prior to https://github.com/llvm/llvm-project/pull/179553, the seq_cst
qualifier was being ignored. The expected codegen for these intrinsics
is `atom.relaxed`- which corresponds to `Monotonic`. The fix does to
AutoUpgrade what https://github.com/llvm/llvm-project/pull/185822 does
to clang.
* [WebAssembly] Fold sign-extending shifts into signed loads in FastISel (#185906)
WebAssembly FastISel currently fails to fold sign-extension patterns
composed of zero-extending loads followed by shift operations. This
results in redundant shift and constant instructions in the output.
Before:
i32.load8_u $push3=, 0($0)
i32.const $push0=, 24
i32.shl $push1=, $pop3, $pop0
i32.const $push4=, 24
i32.shr_s $push2=, $pop1, $pop4
The matched shift instruction sequence is removed and safely folded into
a single sign-extending load, erasing the dead code via the
MachineBasicBlock iterator.
After:
i32.load8_s $push0=, 0($0)
Fixed: #184302
* [orc-rt] Rename SimpleSymbolTable::addSymbolsUnique, relax error cond… (#187171)
…ition.
Renames the SimpleSymbolTable addSymbolsUnique method to addUnique. The
new class name (from c727cd9a4b2) already implies that we're adding
symbols.
This commit also relaxes the error condition for addUnique: Rather than
rejecting any duplicate symbols, it only rejects symbols that were
previously added with a different address. This makes it safe to add the
same symbol(s) multiple times, as long as all definitions point to the
same address. The intent of this is to allow ORC runtime components to
unconditionally add their interfaces to symbols, even if that interface
might have been added previously.
* [clang] Add `-verify-directives` cc1 flag (#179835)
Matheus once told me that the various rules I enforce in C++ DR tests
should be checked automatically. This is the patch to check some of
them.
`-verify-directives` is a cc1 flag that checks how `expected` directives
themselves are written. It enforces the following rules on top of
`-verify` mode:
1. Directives have to fully match diagnostic text (but regular
expressions are still allowed).
2. Lexical order of directives must match the order in which diagnostics
are emitted.
3. Each directive must match exactly one diagnostic.
4. Directives have to specify the exact source location of the diagnostic,
i.e. wildcards (`*`) are not allowed.
The second rule (about order) is the most significant: it doesn't allow
to e.g. have `expected-note {{declared here}}` somewhere far away from
`expected-error` it attaches to. It also enforces order between notes
themselves.
(This patch comes with rather extensive documentation in the internals
manual. Check it out for more details.)
See #179813 and #179674 for impact of enforcing this mode in C++ DR
tests. Despite my best efforts, this flag uncovered a significant number
of deficiencies that I missed.
I've been already enforcing those rules in C++ DR tests, so I'm going to
roll this out there when #179813 is merged. I did my best to make UX
palatable, so that it can be used outside of C++ DR tests. My hope is
that once this is merged, reviewers can go "make this test work with
`-verify-directives`" instead of painstakingly reconstructing compiler
output from directives scattered all across the test file.
---------
Co-authored-by: Aaron Ballman <aaron@aaronballman.com>
Co-authored-by: Erich Keane <ekeane@nvidia.com>
* [libclc][NFC] Change include style from <...> to "..." (#186537)
project-specific headers should use "". Keep #include <amdhsa_abi.h>
llvm-diff shows no change to libclc.bc for spir--, spir64--, nvptx64--,
nvptx64--nvidiacl, nvptx64-nvidia-cuda and amdgcn-amd-amdhsa-llvm when
LIBCLC_TARGETS_TO_BUILD is "all".
Verified that reversing spir64--/libclc.spv and spir--/libclc.spv to
LLVM bitcode shows no diff.
Also fix `__CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__` guard per
copilot review.
---------
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
* [orc-rt] Relax addUnique assertion to match error condition.
Duplicates are now permitted as long as they're identical (dbdf1accf55).
* [X86][APX] Remove patterns for ArithBinOp (#187018)
We should never select to these _EVEX variants. A follow up of #186049.
* [Clang][Modules] Add regression test for #179178 (#187173)
This patch adds a regression test for a clang header modules failure we
saw after #179178 landed.
I'm not too familiar with the norms for clang modules tests, so feel free to
nit-pick.
I've verified that this test passes at HEAD and fails with #179168
applied (i.e., the revert reverted).
* [FMV][AIX] Implement target_clones (cpu-only) (#177428)
This PR implements Function Multi-versioning on AIX using `__attribute__
((target_clones(<feature-list>)))`.
Initially, we will only support specifying a cpu in the version list.
Feature strings (such as "altivec" or "isel") on target_clones will be
implemented in a future PR.
Accepted syntax:
```
__attribute__((target_clones(<OPTIONS>)))
```
where `<OPTIONS>` is a comma separated list of strings, each string is
either:
1) the default string `"default"`
2) a cpu string `"cpu=<CPU>"`, where `<CPU>`is a value accepted by the
`-mcpu` flag.
For example, specifying the following on a function
```
__attribute__((target_clones("default", "cpu=power8", "cpu=power9")))
int foo(int x) { return x + 1; }
```
Would generate 3 versions of `foo`: (1) `foo.default`, (2)
`foo.cpu_power8`, and (3) `foo.cpu_power9`,
an IFUNC `foo`, and the resolver function `foo.resolver`, for the IFUNC,
that chooses one of the three versions at runtime.
---------
Co-authored-by: Wael Yehia <wyehia@ca.ibm.com>
* [X86] Support reserving EDI on x86-32 (#186123)
Which is under discussion in
https://github.com/llvm/llvm-project/issues/179036.
x86-64 support is added in
https://github.com/llvm/llvm-project/pull/180242.
Now add x86-32 support for reserving EDI via `-ffixed-edi`. Update the
X86 backend to respect those reservations in register handling,
callee-save logic, and memcpy/memset lowering, and add driver/codegen
tests.
Add clang driver support for -ffixed-edi and map it to the reserve-edi
target feature on i386.
Teach the X86 backend to treat EDI as a user-reserved register in
register lookup, reserved-register tracking, and callee-save handling,
and avoid selecting REP MOVS/REP STOS when EDI is reserved.
Add driver, Sema, and codegen tests covering option handling, named
global register variables, and the resulting code generation changes.
Signed-off-by: ZhouGuangyuan <zhouguangyuan.xian@gmail.com>
* [clang][RISCV] Add RequiredFeatures for zvknha and zvknhb (#186993)
zvknhb now implies zvknha so we don't need to check extensions manually
in SemaRISCV, we can just use RequiredFeatures instead.
* [orc-rt] Add BootstrapInfo: info for controller session bootstrap. (#187184)
BootstrapInfo holds information needed to bootstrap the ExecutionSession
in the controller. Future patches will update ControllerAccess to send
the bootstrap information at controller-connection time.
BootstrapInfo includes information about the executor process (via
Session::processInfo), an initial set of symbols (replacing
Session::controllerInterface()), and a simple key/value store.
* [clang][Driver][Darwin] Use `xcselect` for `*-apple-darwin*` targets too (#186683)
This is a follow-up to #119670. There, we introduced a CMake option
`CLANG_USE_XCSELECT`, which, when enabled, uses `libxcselect` to find
the right SDK to inject as an `-isysroot` flag when targeting
`*-apple-macos*`.
We intentionally left out `*-apple-darwin*` targets because it broke
many tests. This is unfortunate because `*-apple-darwin*` is the default
triple when building LLVM on macOS, so one isn't able to take advantage
of `xcselect` without an explicit `-target` flag or a change to the
toolchain's default target.
We fix this in two ways.
First, we move the injection of the `-isysroot` flag using `xcselect`
later, until after we are sure that we are targeting macOS. This avoids
confusing the earlier deployment target detection code when we inject
the macOS SDK but actually intended to target non-macOS.
Second, we introduce a `--no-xcselect` flag. This is useful to avoid
breaking some tests that assume clang is invoked without an `-isysroot`
flag pointing at a macOS SDK.
* [clang] Update C++ DR status page
* [libclang/python] Add type annotations to the TranslationUnit class (#180876)
This adds type annotations to the `TranslationUnit` class, enough to
pass a strict typecheck. This resolves 19 strict typing errors as the
next step towards https://github.com/llvm/llvm-project/issues/76664
* [NFC][PowerPC] Pre-commit to optimize bswap64 builtin for power8 (#181776)
The current codegen (for power 8 targets specifically) does not make use
of the parallelism and does most of the operations sequentially.
This will be optimized in a future patch which will follow this NFC PR.
It will enhance the performance and also save us instructions.
---------
Co-authored-by: himadhith <himadhith.v@ibm.com>
* [DA] Fix overflow in symbolic RDIV test (#185805)
The symbolic RDIV test relies on computing the extremes of affine
expressions (e.g., `A1*N1` and `A2*N2`) to disprove dependencies. These
calculations were previously done using `SE->getMulExpr` and
`SE->getMinusSCEV` without guarding against signed integer overflow. If
large coefficients or loop bounds cause a wrap, `isKnownPredicate`
evaluates the wrapped values, potentially disproving a valid dependence
and leading to miscompilations.
This patch reimplements symbolicRDIVtest using `ConstantRange` to work
around overflows.
---------
Signed-off-by: Ruoyu Qiu <cabbaken@outlook.com>
Co-authored-by: Ryotaro Kasuga <kasuga.ryotaro@fujitsu.com>
* [DA] Rewrite formula in the Weak Zero SIV tests (#183738)
This patch rewrites the formula in the Weak Zero SIV tests to match the
one used in the Strong SIV test that was updated in #179665. In this
form, `ConstantRange` is used so we don't need to pay attention to any
corner cases such as overflow.
Fix some test cases that were added in the past PRs to represent the
edge cases.
* [OFFLOAD] Improve handling of synchronization errors in L0 plugin and reenable tests (#186927)
This change improves handling of errors during synchronization in Level
Zero plugin by ensuring cleanup of queues and events in case of a
synchronization error. As a result multiple tests stopped hanging.
---------
Co-authored-by: Duran, Alex <alejandro.duran@intel.com>
* [mlir][x86] Lower packed type vector.contract to AMX dot-product (#182810)
A transform pass to lower `vector.contract` operation to (a)
`amx.tile_mulf` for BF16, or (b) `amx.tile_muli` for Int8 packed types.
* [clang] Reshuffle compiler options in C++ DR tests
This patch changes the order of compiler options on RUN lines so that options that differ in length (like -verify with its multiple prefixes) are at the end. This way it's much easier to see what is common and what is different between RUN lines
* [orc-rt] De-duplicate some test helper APIs. (#187187)
Moves noErrors, mockExecutorProcessInfo, and NoDispatcher into
CommonTestUtils.h where they can be re-used between tests.
* [BOLT] Enable compatibility of instrumentation-file-append-pid with instrumentation-sleep-time (#183919)
This commit enables compatibility of instrumentation-file-append-pid and
instrumentation-sleep-time options. It also requires keeping the
counters mapping between the watcher process and the instrumented binary
process in shared mode. This is useful when we instrument a shared
library that is used by several tasks running on the target system. In
case when we cannot wait for every task to complete, we must use the
sleep-time option. Without append-pid option, we would overwrite the
profile at the same path but collected from different tasks, leading to
unexpected or suboptimal optimization effects.
Co-authored-by: Vasily Leonenko <vasily.leonenko@huawei.com>
* [orc-rt] Publish controller interface from SimpleNativeMemoryMap ctor. (#187198)
Add named constructors to SimpleNativeMemoryMap to publish
SimpleNativeMemoryMap's controller interface when an instance is
constructed.
This supports correct setup by construction, since API clients can't
forget to publish the interface that the controller will need to
interact with the SimpleNativeMemoryMap object.
* [CodeGen] Use separate MBB number for analyses (#187086)
Block numbers are updated too frequently, which makes it difficult to
keep analyses up to date. Therefore, introduce a second number per basic
block that is used for analyses and is renumbered less often. This frees
analyses from providing somewhat efficient facilities for dealing with
changed block numbers, making it simpler to implement in e.g. LoopInfo
or CycleInfo.
(Currently, "less often" means not at all, but we might want to renumber
after certain passes if the numbering gets too sparse and no analyses
are preserved anyway.)
When we introduced a more general use of block numbers some time ago,
using the existing numbers seemed to be a somewhat obvious choice, but I
now think that this was a bad decision, as it conflates a number that is
used for ordering with a number that should be more stable.
MachineBasicBlock isn't particularly size-optimized and there's a fair
amount of padding where we can add another number.
There should be no performance impact.
* [CodeGen] Improve `getLoadExtAction` and friends (#181104)
Alternative approach to the same goals as #162407
This takes `TargetLoweringBase::getLoadExtAction`, renames it to
`TargetLoweringBase::getLoadAction`, merges `getAtomicLoadExtAction`
into it, and adds more inputs for relevant information (alignment,
address space).
The `isLoadExtLegal[OrCustom]` helpers are also modified in a matching
manner.
This is fully backwards compatible, with the existing `setLoadExtAction`
working as before. But this allows targets to override a new hook to
allow the query to make more use of the information. The hook
`getCustomLoadAction` is called with all the parameters whenever the
table lookup yields `LegalizeAction::Custom`, and can return any other
action it wants.
* [orc-rt] Capture a Session& in SimpleNativeMemoryMap, fix TODOs. (#187200)
SimpleNativeMemoryMap now captures a reference to the Session that it
was constructed for. This is used to fix some outstanding TODOs: using
the real page size for the process, and reporting errors that were
previously discarded.
* [lldb-dap] Allow expressions in setVariable request (#185611)
This patch allows expressions in the `setVariable` request. It is a small
extension of the original semantics from the DAP specification. DAP has a
`setExpression` request for this purpose, but it is too general. So I
prefer to keep this simple solution.
* [AMDGPU] DPP implementations for Wave Reduction (#185814)
Adding DPP reduction support for i32 types.
Supported Ops: `umin`, `min`, `umax`, `max`,
`add`, `sub`, `and`, `or`, `xor`.
* [NFC][PowerPC] Update check lines to include power 9 label (#187193)
The current check lines do not provide a clear distinction between
`power 9` and `power 8` as power 8 label was introduced recently through
#181776. Added `power-9` label to the RUN lines to make it more readable
and understandable.
Co-authored-by: himadhith <himadhith.v@ibm.com>
* [orc-rt] Add TaskGroup for tracking completion of a set of tasks. (#187205)
TaskGroup provides a mechanism for tracking execution of multiple
concurrent tasks and receiving notification when all tasks have
completed. This is useful for coordinating asynchronous operations in
the ORC runtime.
TaskGroup::Token is an RAII handle representing participation in a
group. The group cannot complete while any valid (non-default) Token
exists.
TaskGroup::addOnComplete registers callbacks to run when the group
closes and all tokens are released. (Callbacks registered after
completion run immediately).
TaskGroup::close seals the group: no new tokens can be acquired after
close is called.
All methods may be called concurrently from multiple threads.
* [DA] Add test that represents an edge case for the Exact SIV test (NFC) (#186389)
To prevent a regression that could be caused by #186388.
* [AArch64] Remove vector REV16, use BSWAP instead (#186414)
This removes the generation of vector REV16 nodes, generating a bswap
instead. This allows us to remove most uses of AArch64ISD::REV16 and all
uses of G_REV16.
* [DA] Regenerate assertions for the tests (NFC) (#187207)
Delete the trailing space introduced in #185805 that is noisy when using
UTC.
* [lldb] Avoid permission issue in API test with SHARED_BUILD_TESTCASE (#187072)
Deleting the inferior binary after an API test-case causes issues on one
of the Windows bots. The previous fix attempt in ca15db1cd509c236
didn't succeed. We have to use isolated subfolders for each test-case.
This is achieved easily by disabling SHARED_BUILD_TESTCASE.
* [atomicrmw] fminimumnum/fmaximumnum support (#187030)
Adds support for `atomicrmw` `fminimumnum`/`fmaximumnum` operations.
These were added to C++ in P3008, and are exposed in libc++ in #186716 .
Adding LLVM IR support for these unblocks work in both backends with HW
support, and frontends.
* [X86] Remove extraneous I in comment. NFC (#187209)
Seems to have slipped in in c63d2953a08b9
* [AArch64][GlobalISel] Fix uqadd/sub with scalar operands (#186999)
Previously, neon uqadd/uqsub would not lower when given s32/s64
operands, as GlobalISel would wrongly try to put the operands on
general-purpose register banks. Changing this in RegBankSelection allows
the intrinsics to lower just like their signed versions.
* [VPlan] Use auto return in VPlanPatternMatch (NFC) (#187210)
* [NFCI] [Serialization] Deduplicate DeclID properly (#187212)
In the original code, the operation to iterate Found is meaningless, as
it is guarded by Found.empty(). So this has been a no-op for 10 years.
According to the context, I believe the intention is to avoid duplicated
DeclIDs in Data, so this changes the iteration from Found to Data.
Just found by looking around the code.
This is not strictly NFC but NFC intentionally. I will be surprised if
this breaks anything.
* [llvm][utils] Give git-llvm-push u+x permissions (#187211)
There's a hashbang at the top of the script so I presume the intention
is that it can be executed directly, but it seems to be lacking
executable permissions. This sets the user executable bit so running
./llvm/utils/git-llvm-push works
* [mlir][vector] Extend vector.gather e2e test (#187071)
Extend the vector.gather e2e test to cover both available lowering
paths:
* Direct lowering to LLVM (via -test-lower-to-llvm)
* Lowering via vector.load (via -test-vector-gather-lowering)
This is a follow-up to https://github.com/llvm/llvm-project/pull/184706,
which updated a pattern used by -test-vector-gather-lowering.
The test is extended to operate on 2D memrefs so that the changes
in https://github.com/llvm/llvm-project/pull/184706 are meaningfully
exercised.
* [X86] Improve handling of i512 SRA(MSB,Amt) "highbits" mask creation (#187141)
This can be folded from ((1 << 511) >>s Amt) -> (-1 << (511-Amt)) to make use of the existing optimal codegen
Alive2: https://alive2.llvm.org/ce/z/9UMQkm
Last i512 pattern described in #132601
* [Instrumentation][nsan] Add maximumnum to NSAN (#186345)
Add support for the min/maximumnum intrinsics and the corresponding
libfuncs to the NumericalStabilitySanitizer.
* [VPlan] Improve code in VPlanRecipes using VPlanPatternMatch (NFC) (#187130)
* [lldb] Skip file cleanup to avoid permission issue in API test (#187227)
Deleting anything in the build directory of a test-case is causing an
issue on one of the Windows bots. After the previous attempts in
ca15db1cd509c236cd8138bcd098117d0106db56 and
fdd2437af3cdc6d5fe199fcc9d991ccf503b55bd didn't help, we now skip the
file cleanup altogether.
* [mlir][tosa][tosa-to-linalg] Fix rescale with double rounding failing validation (#184787)
The validation pass added attribute checks on rescale rounding mode, but
the tosa-to-linalg-pipeline did not specify support for the doubleround
extension, causing rescale with doubleround to be rejected by the
validation in the tosa-to-linalg-pipeline.
One method of fixing this would be to only enable the attribute checks
when the "strictOpSpecAlignment" validation option is enabled. However,
I feel this is the wrong direction of travel. Long-term it would be nice
if the tosa-to-linalg-pipeline specified all the extensions it supports,
gracefully rejecting operations that require unsupported extensions.
Therefore, this change declares support for the doubleround extension to
fix the legalization failure with the ambition of adding more extensions
in the future.
* [LifetimeSafety] Track origins through array subscript and array-to-pointer decay (#186902)
Array element accesses and array-to-pointer decay were not tracked
because `CK_ArrayToPointerDecay` dropped origins and
`ArraySubscriptExpr` had no visitor. This patch adds both to propagate
origins through array operations.
Fixes #186075
* [mlir][spirv] add ExecutionModeIdOp (#186241)
Adds OpExecutionModeId from spirv 1.2
---------
Co-authored-by: Jakub Kuderski <kubakuderski@gmail.com>
* [clang] Enable '-verify-directives' mode in C++ DR tests (#187219)
This patch enables recently implemented `-verify-directives` mode
(#179835) in C++ DR tests to automate some of the work I've been doing
manually while reviewing PRs touching those tests. As highlighted in
that PR, all the errors this mode found were addressed in #179813 and
#179674, so this PR just flips the switch.
* [DAG] Add back SelectionDAG::dump() without parameter (#187001)
Usually `dump()`s are without parameter, so the practice is calling
`XXX::dump()` when debugging.
But we will get an error like below after #161097:
```
error: <user expression 128>:1:10: too few arguments to function call,
expected 1, have 0
1 | DAG.dump()
| ~~~~~~~~ ^
```
So to not surprise users, I added back the `SelectionDAG::dump()`
without parameter.
* [PowerPC] Preserve load output chain in vcmpequb combine (#187010)
Replace uses of the old load output chain with the new load output
chain. A plain replacement here is fine because the transform verifies
the load is one-use.
Fixes https://github.com/llvm/llvm-project/issues/186549.
* [orc-rt] Add ShutdownRequested flag to Service::onDetach. (#187230)
The ShutdownRequested flag indicates to Services whether a shutdown
operation is already pending. Services may use this information to
optimize their book-keeping: either preparing for a (potentially
lengthy) detached state, or for an upcoming shutdown.
Session does not call onDetached yet: That (including setting the
ShutdownRequested argument) will happen in a follow-up patch.
* [Frontend/OpenMP][NFC] Drop uses of BranchInst (#186393)
In OpenMPIRBuilder::EmitOMPInlinedRegion there are two checks w.r.t.
SplitPos, which appear to be always true. I conservatively left the code
as-is.
* [mlir][reducer] Add eraseAllOpsInRegion function to reduction-tree pass (#185892)
Added logic to erase all operations within a region. This addresses
scenarios where the test script always returns 1 (interesting), in which
case the simplest output from mlir-reduce should be an empty ModuleOp.
* [orc-rt] Rename Session setController/detachFromController. NFC. (#187235)
These methods are renamed to attach and detach for simplicity.
* [LV] Add test for diff checks with ptrtoint subtract. (NFC)
Adds extra test coverage for
https://github.com/llvm/llvm-project/pull/180244.
* [NFC][AArch64] add tests for `is_fpclass` (#187231)
Preparation for https://github.com/llvm/llvm-project/pull/169402
* [orc-rt] Move CallViaSession into Session, add comments. (#187238)
Makes CallViaSession an inner class on Session, and adds comments and a
convenience method for creating instances.
* [CycleInfo] Use block numbers for dfs numbering (NFC) (#187062)
Store the DFSInfo into a vector indexed by block number instead of a map
using the block pointer.
This is a small compile-time improvement for CycleInfo construction.
* [AArch64] Use SVE/NEON FMLAL top/bottom instructions (#186798)
* [mlir][vector] Use non-native runner in gather.mlir test (#187243)
Fix after https://github.com/llvm/llvm-project/pull/187071
* [Bazel] Fixes c1f6fd2 (#187146)
This fixes c1f6fd24aa637d6aadb72aa08bf3d8a14c961ed2.
* [Clang][NFC] Drop uses of BranchInst (#187242)
* [AArch64][llvm] Make SBZ/SBO insns warn not fail when disassembling (#187068)
Some instructions in the Arm ARM have bits which are marked as "Should
Be One" or "Should Be Zero", and they're marked as "Constrained
Unpredictable" as to what should happen if they're not.
This is to improve hardware decode efficiency. In all the cases where
this occurs, it's an instruction which in all other respects is closely
related to an adjacent instruction in the encoding space (for example a
similar load or store) but doesn't require one of the variable fields,
usually a register field. These fields are then defined as SBZ or SBO.
If one of these instructions didn't have the bits set to SBZ or SBO,
then the instruction would fail to disassemble. We had missed adding
`Unpredictable` to a few of these, and they would fail rather than
warn.
Update these AArch64 instructions to treat `Unpredictable` bitfields as
soft-fails with a warning, and add a comprehensive disassembler
regression
test that checks all instructions which use `Unpredictable`.
The new tests check that `llvm-mc` emits a "potentially undefined
instruction encoding" warning, and verify that the disassembler still
produces the canonical instruction spelling and encoding.
This keeps the behaviour consistent for users across SBO and SBZ
style reserved fields instead of rejecting encodings that should
disassemble with a warning.
* [OpenMP] Map const-qualified target map variables as 'to'. (#185918)
This patch updates the mapping kind for const-qualified variables
from `tofrom` to `to`, ensuring correct and standards-compliant mapping
semantics for const variables.
* [lldb] Do not use mkdir -p in makefile on Windows (#187244)
`Make` uses the system's `mkdir` on Windows even if Git's mkdir.exe is
present in PATH. Windows's mkdir does not support the parameter `-p` and
creates the directory `-p` instead. A few other tests also use `mkdir -p`,
but they are linux, objc or macosx related.
---------
Co-authored-by: Charles Zablit <c_zablit@apple.com>
* [clangd] Support suppressions for driver diagnostics (#182912)
Rebase of https://reviews.llvm.org/D127844
Fixes [#1142](https://github.com/clangd/clangd/issues/1142)
* [flang][OpenMP] Use OmpDirectiveSpecification for range/depth queries, NFC (#187109)
That makes them usable for a potential future implementation of APPLY.
* libclc: Use select function instead of ?: for some fp selects (#187253)
It seems that ?: is not quite equivalent to select for floating-point
vectors. With ?:, the resulting IR involves integer bitcasts and
integer vector typed select. Use select so this is an fp-select. This
enables finite math only contexts to optimize out the select.
This feels like it's a clang bug though.
* libclc: Clean up sincos macro usage (#187260)
Handle this more like fract, and implement other
address spaces on top of the private overload with
a temporary variable.
* [mlir][gpu] Fix typo in documentation (#156619)
* libclc: Improve float trig function handling (#187264)
Most of this was originally ported from rocm device libs in
c0ab2f81e3ab5c7a4c2e0b812a873c3a7f9dca8b, so merge
in more recent changes.
* Fix MSVC "not all control paths return a value" warning. NFC. (#187265)
* [CIR] Fix bug where block after-unreachable wasn't CXXABILowered (#186869)
If a TU has an 'unreachable' block, it wouldn't be CXXABILower'ed, which
would cause a legalization failure. This patch adds the same solution we
do in LowerToLLVM, which is to make sure we transform those sections
separately.
* [CIR][NFC] Split the CXXABI 'TypeConverter' into its own type. (#186874)
This is currently an NFC change, as the CXXABITypeConverter has no
members yet. This patch splits it off into its own type, as it is going
to need to have members when we start transforming record types, but
doesn't implement that part yet (coming in future patches).
* [CIR][NFC] Unify the 'null data member attr' getters (#186876)
In preparation for actually lowering data members as fields to a record
type, this patch does a minor refactor to make their single current use
have a slightly simpler interface. This will prevent us from having to
copy/paste this later.
Also, this patch removes a pair of now-orphaned builders, instead
preferring to use the ones that come from the parent builder type.
* [CIR] Add support for arrays-of-pointer-to-member-data (#186887)
This patch adds support for arrays of pointer-to-member-data, just like
we do for pointer-to-member-function. This patch also does a refactor of
some basic value lowering, which both makes this apply to locals and
constants and unifies them in preparation for future work when it
comes to record types.
Other than the otherwise-not-quite-intentional change (the recursion got
this feature, and I realized it worked while looking at other
things!), this is NFCI.
* [libc][math] Improve hypotf performance. (#186627)
Update the check for when a more careful rounding is needed, and remove
the redundant clear exception step.
* [OpenMP] Use the standard fences now that they are supported (#187138)
Summary:
We can now use these on NVPTX properly.
* [NewPM] Add port for AArch64DeadRegisterDefinitionsPass (#187180)
Adds a newPM pass for AArch64DeadRegisterDefinitions
* Refactors base logic into an Impl class
* Renames old pass with the "Legacy" suffix
* Adds the new pass manager pass using refactored logic
No existing `.mir` tests to update.
Context and motivation in
https://llvm.org/docs/NewPassManager.html#status-of-the-new-and-legacy-pass-managers
* [libc][math] Fix missing underflow exception in DyadicFloat::generic_as (#186734)
The `generic_as` function in `dyadic_float.h` had a missing `underflow =
true` at the exact boundary where `unbiased_exp == -EXP_BIAS -
FRACTION_LEN`.
At this boundary, the mantissa MSB maps exactly to the round bit, so
out_mantissa is 0 and the result can only be 0 or min_subnormal. The
value is at most min_subnormal / 2, so it is always tiny and always
inexact, so `underflow` must be signaled. The < case and the general
subnormal range both already set underflow = true; this boundary case was
the only gap.
This specifically fixes the following error in the erfcf16 function:
```
Extracted erfcf16.cpp.o from archive for linking
Running exhaustive check in --rndn mode...
Missing underflow exception for x=0x1.eacp+1 (y=0x1p-24)
```
This fix may also apply to other missing bfloat16 exceptions
(@Sukumarsawant)
part of: #186483
CC: @lntue
* [flang] Fix the CHECK: directive to ensure flagging RWE (NFC) (#187186)
Update the check to catch "RWE" in the header.
* [gn] port c1f6fd24aa637d6a
* [compiler-rt] Update runtime build script to detect RPC XDR header for AIX (#186977)
`sanitizer_common` and its tests depend on the RPC XDR header for layout
compatibility. When this header is absent from a CI or build
environment, changes that silently break the expected struct layout go
undetected, since there is nothing to fail the build.
The default is opt-in — error on missing header is on by default for AIX
(where the dependency is known and the package is `bos.net.nfs.adt`) and
off by default elsewhere.
Changes:
1. On AIX, checks for `tirpc/rpc/xdr.h`; on all other platforms, checks
for `rpc/xdr.h`
2. Introduces `COMPILER_RT_REQUIRE_RPC_XDR_H` CMake option (default ON
on AIX, OFF elsewhere) that, when set, turns a missing header into a
fatal configuration error with an actionable message
3. Drive-by fix: Normalizes `HAVE_RPC_XDR_H` to 0 when the header is
absent, for consistent downstream `if()/#cmakedefine` behavior
* [gn build] Port 45fe4bbdde13
* [gn build] Port 55db533b74fe
* [gn build] Port 681f1a5ee987
* [gn build] Port 6b3cf50d958c
* [gn build] Port d0d0a665c238
* [gn build] Port e4a2d9cd8a63
* [SLP] Loop aware cost model/tree building
Currently, the SLP vectorizer does not care about loops and their trip count.
This may lead to inefficient vectorization in some cases. This patch adds
loop-nest-aware tree building and cost estimation.
When it comes to tree building, it now checks that the tree does not span
across different loop nests. The nodes from other loop nests are
immediate buildvector nodes.
The cost model adds the knowledge about loop trip count. If it is
unknown, the default value is used, controlled by the
-slp-cost-loop-min-trip-count=<value> option. The cost of the vector
nodes in the loop is multiplied by the number of iterations (trip count),
because each vector node will be executed the trip count number of
times. This allows better cost estimation.
Reviewers: jdenny-ornl, vporpo, hiraditya, RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/150450
Recommit after revert in c7bd3062f1dac975cf9b706f457b3c55b4bf57ff
* [NFC][AArch64] fix triple used in test (#187275)
cc @dyung https://github.com/llvm/llvm-project/pull/187231
* [mlir][acc] Move acc routine functions into GPU module (#187161)
The OpenACC routine directive defines functions that may be called from
device code; those functions (and any device-required callees) must be
present in the device compilation unit. This PR introduces
ACCRoutineToGPUFunc pass which moves materialized acc routines into the
GPU module as gpu.func so they can be compiled for the device.
This adds testing showing the pass on both MLIR and FIR. The FIR tests
required improvements in the OpenACCSupport implementation to ensure that
CUF and Fortran runtime are considered legal for GPU.
* [Offload] Add CMake alias for CI (#186099)
In the pre-merge CI we need a top-level visible target that can be used
to build offload, i.e., libomptarget and LLVMOffload.
The related PR to include offload into pre-merge CI is here:
https://github.com/llvm/llvm-project/pull/174955
* [InstCombine] RAUW for proven zero-indexed GEPs rather than cloning for a specific user (#185053)
When analyzing operands of loads/stores, if we can guarantee that a GEP
is always zero-indexed, it is better to modify the GEP such that other
users can take advantage of the simplification, rather than just cloning
it for one specific load/store user. Edit: implementation changed to
call replaceInstUsesWith instead of modifying in place.
Without this change, replaceGEPIdxWithZero clones the GEP for the
triggering load/store, leaving the original variable-indexed GEP in
place. Other users of that GEP (e.g., a constant-offset GEP feeding a
second load) miss the simplification. Testcase demonstrates this:
without the first load _modifying_ the gep, the _second_ load will still
be dependent on both GEPs, and thus unnecessarily dependent on the %idx.
This lack of simplification can cause issues with later passes such as
LICM.
Alternative approaches could be to add a version of this transform into
visitGEP, but there is precedent to doing so in visitLoad/visitStore,
see simplifyNonNullOperand. And because the optimization is tied to the
dereference that happens in the load/store, I think it reasonably fits
here.
Alive2 proof: https://alive2.llvm.org/ce/z/-HZd9c
Alive2 counterexample showing why we cannot blindly modify the gep in
place without some sort of condition:
https://alive2.llvm.org/ce/z/dzKuc3
* [libc++] Refactor __is_transparent_v to make it clear what it depends on (#186419)
__is_transparent_v used to accept an additional _Key template argument
whose sole purpose …
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Part of #147386
in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450