From 4c48b3cb5c681b34345e0dffe920649a5c26aaf4 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla Date: Thu, 23 May 2024 13:45:52 +0530 Subject: [PATCH 001/433] [GISel][CombinerHelper] Push freeze through non-poison-producing operands (#90618) This combine matches the existing fold in InstCombine, i.e. InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating. It tries to push freeze through an operand if the operand has only one maybe-poison operand and all other operands are guaranteed non-poison, and if the operation itself cannot generate poison (eg. add with nsw can generate poison, even with non-poison operands). This is beneficial because it can potentially enable other optimizations to occur that would otherwise be blocked because of the freeze. --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 + .../CodeGen/GlobalISel/GenericMachineInstrs.h | 11 ++++ llvm/include/llvm/CodeGen/MachineInstr.h | 6 ++ .../include/llvm/Target/GlobalISel/Combine.td | 10 ++- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 64 +++++++++++++++++++ llvm/lib/CodeGen/GlobalISel/Utils.cpp | 24 ++++++- llvm/lib/Target/AArch64/AArch64Combine.td | 3 +- .../AArch64/GlobalISel/combine-select.mir | 42 ++++++------ llvm/test/CodeGen/AArch64/pr58431.ll | 4 +- 9 files changed, 139 insertions(+), 28 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index a9a33c7617d7..2111e82e1a99 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -869,6 +869,9 @@ class CombinerHelper { /// Combine insert vector element OOB. bool matchInsertVectorElementOOB(MachineInstr &MI, BuildFnTy &MatchInfo); + bool matchFreezeOfSingleMaybePoisonOperand(MachineInstr &MI, + BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. 
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 2a3145b635e6..2b3efc3b609f 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -34,6 +34,17 @@ class GenericMachineInstr : public MachineInstr { static bool classof(const MachineInstr *MI) { return isPreISelGenericOpcode(MI->getOpcode()); } + + bool hasPoisonGeneratingFlags() const { + return getFlags() & (NoUWrap | NoSWrap | IsExact | Disjoint | NonNeg | + FmNoNans | FmNoInfs); + } + + void dropPoisonGeneratingFlags() { + clearFlags(NoUWrap | NoSWrap | IsExact | Disjoint | NonNeg | FmNoNans | + FmNoInfs); + assert(!hasPoisonGeneratingFlags()); + } }; /// Provides common memory operand functionality. diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 2b0c5d166d88..db48a0ae5514 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -416,6 +416,12 @@ class MachineInstr Flags &= ~((uint32_t)Flag); } + void clearFlags(unsigned flags) { + assert(isUInt(flags) && + "flags to be cleared are out of range for the Flags field"); + Flags &= ~flags; + } + /// Return true if MI is in a bundle (but not the first MI in a bundle). /// /// A bundle looks like this before it's finalized: diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 5d4b5a2479f6..34698f195615 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -220,6 +220,13 @@ def idempotent_prop : GICombineRule< (match (idempotent_prop_frags $dst, $src)), (apply (GIReplaceReg $dst, $src))>; +// Convert freeze(Op(Op0, NonPoisonOps...)) to Op(freeze(Op0), NonPoisonOps...) 
+// when Op0 is not guaranteed non-poison +def push_freeze_to_prevent_poison_from_propagating : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_FREEZE $dst, $src):$root, + [{ return !isGuaranteedNotToBePoison(${src}.getReg(), MRI) && Helper.matchFreezeOfSingleMaybePoisonOperand(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; def extending_loads : GICombineRule< (defs root:$root, extending_load_matchdata:$matchinfo), @@ -1713,7 +1720,8 @@ def all_combines : GICombineGroup<[trivial_combines, vector_ops_combines, sub_add_reg, select_to_minmax, redundant_binop_in_equality, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, combine_concat_vector, double_icmp_zero_and_or_combine, match_addos, - sext_trunc, zext_trunc, combine_shuffle_concat]>; + sext_trunc, zext_trunc, combine_shuffle_concat, + push_freeze_to_prevent_poison_from_propagating]>; // A combine group used to for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 22eb4a3e0d7c..4cc602b5c870 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -223,6 +223,70 @@ void CombinerHelper::applyCombineCopy(MachineInstr &MI) { replaceRegWith(MRI, DstReg, SrcReg); } +bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand( + MachineInstr &MI, BuildFnTy &MatchInfo) { + // Ported from InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating. 
+ Register DstOp = MI.getOperand(0).getReg(); + Register OrigOp = MI.getOperand(1).getReg(); + + if (!MRI.hasOneNonDBGUse(OrigOp)) + return false; + + MachineInstr *OrigDef = MRI.getUniqueVRegDef(OrigOp); + // Even if only a single operand of the PHI is not guaranteed non-poison, + // moving freeze() backwards across a PHI can cause optimization issues for + // other users of that operand. + // + // Moving freeze() from one of the output registers of a G_UNMERGE_VALUES to + // the source register is unprofitable because it makes the freeze() more + // strict than is necessary (it would affect the whole register instead of + // just the subreg being frozen). + if (OrigDef->isPHI() || isa(OrigDef)) + return false; + + if (canCreateUndefOrPoison(OrigOp, MRI, + /*ConsiderFlagsAndMetadata=*/false)) + return false; + + std::optional MaybePoisonOperand; + for (MachineOperand &Operand : OrigDef->uses()) { + if (!Operand.isReg()) + return false; + + if (isGuaranteedNotToBeUndefOrPoison(Operand.getReg(), MRI)) + continue; + + if (!MaybePoisonOperand) + MaybePoisonOperand = Operand; + else { + // We have more than one maybe-poison operand. Moving the freeze is + // unsafe. + return false; + } + } + + cast(OrigDef)->dropPoisonGeneratingFlags(); + + // Eliminate freeze if all operands are guaranteed non-poison. 
+ if (!MaybePoisonOperand) { + MatchInfo = [=](MachineIRBuilder &B) { MRI.replaceRegWith(DstOp, OrigOp); }; + return true; + } + + Register MaybePoisonOperandReg = MaybePoisonOperand->getReg(); + LLT MaybePoisonOperandRegTy = MRI.getType(MaybePoisonOperandReg); + + MatchInfo = [=](MachineIRBuilder &B) mutable { + B.setInsertPt(*OrigDef->getParent(), OrigDef->getIterator()); + auto Freeze = B.buildFreeze(MaybePoisonOperandRegTy, MaybePoisonOperandReg); + replaceRegOpWith( + MRI, *OrigDef->findRegisterUseOperand(MaybePoisonOperandReg, TRI), + Freeze.getReg(0)); + replaceRegWith(MRI, DstOp, OrigOp); + }; + return true; +} + bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, SmallVector &Ops) { assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS && diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index cd5dc0e01ed0..f455482e0294 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1745,11 +1745,20 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI, UndefPoisonKind Kind) { MachineInstr *RegDef = MRI.getVRegDef(Reg); + if (auto *GMI = dyn_cast(RegDef)) { + if (ConsiderFlagsAndMetadata && includesPoison(Kind) && + GMI->hasPoisonGeneratingFlags()) + return true; + } else { + // Conservatively return true. 
+ return true; + } + switch (RegDef->getOpcode()) { case TargetOpcode::G_FREEZE: return false; default: - return true; + return !isa(RegDef) && !isa(RegDef); } } @@ -1767,8 +1776,17 @@ static bool isGuaranteedNotToBeUndefOrPoison(Register Reg, return true; case TargetOpcode::G_IMPLICIT_DEF: return !includesUndef(Kind); - default: - return false; + default: { + auto MOCheck = [&](const MachineOperand &MO) { + if (!MO.isReg()) + return true; + return ::isGuaranteedNotToBeUndefOrPoison(MO.getReg(), MRI, Depth + 1, + Kind); + }; + return !::canCreateUndefOrPoison(Reg, MRI, + /*ConsiderFlagsAndMetadata=*/true, Kind) && + all_of(RegDef->uses(), MOCheck); + } } } diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 10cad6d19244..1c7f6b870d39 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -295,5 +295,6 @@ def AArch64PostLegalizerCombiner ptr_add_immed_chain, overlapping_and, split_store_zero_128, undef_combines, select_to_minmax, or_to_bsp, combine_concat_vector, - commute_constant_to_rhs]> { + commute_constant_to_rhs, + push_freeze_to_prevent_poison_from_propagating]> { } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir index 353c1550d697..074d4ecbd878 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir @@ -117,9 +117,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64) - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f - ; CHECK-NEXT: %sel:_(s1) = G_OR %c, [[FREEZE]] + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[FREEZE]](s64) + ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f ; CHECK-NEXT: %ext:_(s32) = 
G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -144,9 +144,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64) - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f - ; CHECK-NEXT: %sel:_(s1) = G_OR %c, [[FREEZE]] + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[FREEZE]](s64) + ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -172,9 +172,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d2 ; CHECK-NEXT: %c:_(<2 x s1>) = G_TRUNC [[COPY]](<2 x s32>) - ; CHECK-NEXT: %f:_(<2 x s1>) = G_TRUNC [[COPY1]](<2 x s32>) - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<2 x s1>) = G_FREEZE %f - ; CHECK-NEXT: %sel:_(<2 x s1>) = G_OR %c, [[FREEZE]] + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<2 x s32>) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: %f:_(<2 x s1>) = G_TRUNC [[FREEZE]](<2 x s32>) + ; CHECK-NEXT: %sel:_(<2 x s1>) = G_OR %c, %f ; CHECK-NEXT: %ext:_(<2 x s32>) = G_ANYEXT %sel(<2 x s1>) ; CHECK-NEXT: $d0 = COPY %ext(<2 x s32>) %0:_(<2 x s32>) = COPY $d0 @@ -201,9 +201,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64) - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t - ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]] + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[FREEZE]](s64) + ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -229,9 +229,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY 
$x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64) - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t - ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]] + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[FREEZE]](s64) + ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -257,11 +257,11 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[FREEZE]](s64) ; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t - ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], [[FREEZE]] + ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], %t ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -287,11 +287,11 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]] + ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[FREEZE]](s64) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]] - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f - ; CHECK-NEXT: %sel:_(s1) = G_AND [[XOR]], [[FREEZE]] + ; CHECK-NEXT: %sel:_(s1) = G_AND [[XOR]], %f ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll 
b/llvm/test/CodeGen/AArch64/pr58431.ll index dcd97597ae40..e87d8f7874d6 100644 --- a/llvm/test/CodeGen/AArch64/pr58431.ll +++ b/llvm/test/CodeGen/AArch64/pr58431.ll @@ -4,8 +4,8 @@ define i32 @f(i64 %0) { ; CHECK-LABEL: f: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 -; CHECK-NEXT: mov w9, w0 +; CHECK-NEXT: mov w8, #10 // =0xa +; CHECK-NEXT: and x9, x0, #0xffffffff ; CHECK-NEXT: udiv x10, x9, x8 ; CHECK-NEXT: msub x0, x10, x8, x9 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 From 1a2a0c0dc9aab6f440033f36ff2323685080f46a Mon Sep 17 00:00:00 2001 From: Anchu Rajendran S Date: Thu, 23 May 2024 13:46:35 +0530 Subject: [PATCH 002/433] Fixing the location attribute added to mapInfoOp (#90764) Named location attribute added to `tgt_offload_entry` shall be used by runtime calls like `ompx_dump_mapping_tables` to print the information of variables that are mapped to the device. `ompx_dump_mapping_tables` was printing the wrong location information and this change fixes it. A sample execution of the example before the change: ``` omptarget device 0 info: OpenMP Host-Device pointer mappings after block at libomptarget:0:0: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration omptarget device 0 info: 0x0000000000206df0 0x00007f02cdc00000 20000000 1 0 at unknown:18:35 ``` The change replaces `unknown` with the mapped symbol and the location with the declaration location. 
--- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 5 ++- flang/lib/Lower/OpenMP/OpenMP.cpp | 7 +++- .../offloading/fortran/dump_map_tables.f90 | 38 +++++++++++++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 offload/test/offloading/fortran/dump_map_tables.f90 diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 875599098b3d..68619f699ebb 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -882,8 +882,11 @@ bool ClauseProcessor::processMap( // Explicit map captures are captured ByRef by default, // optimisation passes may alter this to ByCopy or other capture // types to optimise + auto location = mlir::NameLoc::get( + mlir::StringAttr::get(firOpBuilder.getContext(), asFortran.str()), + symAddr.getLoc()); mlir::omp::MapInfoOp mapOp = createMapInfoOp( - firOpBuilder, clauseLocation, symAddr, + firOpBuilder, location, symAddr, /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, static_cast< diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 17b362cc2f32..1569605e785b 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1604,9 +1604,12 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; } - + auto location = + mlir::NameLoc::get(mlir::StringAttr::get(firOpBuilder.getContext(), + sym.name().ToString()), + baseOp.getLoc()); mlir::Value mapOp = createMapInfoOp( - firOpBuilder, baseOp.getLoc(), baseOp, /*varPtrPtr=*/mlir::Value{}, + firOpBuilder, location, baseOp, /*varPtrPtr=*/mlir::Value{}, name.str(), bounds, /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, static_cast< diff --git a/offload/test/offloading/fortran/dump_map_tables.f90 
b/offload/test/offloading/fortran/dump_map_tables.f90 new file mode 100644 index 000000000000..cb66ef348e3c --- /dev/null +++ b/offload/test/offloading/fortran/dump_map_tables.f90 @@ -0,0 +1,38 @@ +! Offloading test with runtine call to ompx_dump_mapping_tables +! Fortran array writing some values and printing the variable mapped to device +! correctly receives the updates made on the device. +! REQUIRES: flang +! UNSUPPORTED: nvptx64-nvidia-cuda-LTO +! UNSUPPORTED: aarch64-unknown-linux-gnu +! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +! UNSUPPORTED: x86_64-pc-linux-gnu +! UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +! RUN: %libomptarget-compile-fortran-run-and-check-generic + +program map_dump_example + INTERFACE + SUBROUTINE ompx_dump_mapping_tables() BIND(C) + END SUBROUTINE ompx_dump_mapping_tables + END INTERFACE + + integer i,j,k,N + integer async_q(4) + real :: A(5000000) + N=5000000 + do i=1, N + A(i)=0 + enddo +! clang-format off +! CHECK: omptarget device 0 info: OpenMP Host-Device pointer mappings after block +! CHECK-NEXT: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration +! CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} 20000000 1 0 {{.*}} at a(:n):21:11 +! clang-format on +!$omp target enter data map(to:A(:N)) + call ompx_dump_mapping_tables() +!$omp target parallel do + do i=1, N + A(i)=A(i)*2 + enddo +!$omp target exit data map(from:A) +end program From 951b13d9a7220d761b1ee0dc09a50b635692ecf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 23 May 2024 07:17:53 +0200 Subject: [PATCH 003/433] [clang][Interp][NFC] Save IsUnion bit for Records Now that we call this more often, try to keep pointer chasing to a minimum. 
--- clang/lib/AST/Interp/Record.cpp | 2 +- clang/lib/AST/Interp/Record.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/Record.cpp b/clang/lib/AST/Interp/Record.cpp index 6a0a28bc9124..8ded765fc1c4 100644 --- a/clang/lib/AST/Interp/Record.cpp +++ b/clang/lib/AST/Interp/Record.cpp @@ -16,7 +16,7 @@ Record::Record(const RecordDecl *Decl, BaseList &&SrcBases, FieldList &&SrcFields, VirtualBaseList &&SrcVirtualBases, unsigned VirtualSize, unsigned BaseSize) : Decl(Decl), Bases(std::move(SrcBases)), Fields(std::move(SrcFields)), - BaseSize(BaseSize), VirtualSize(VirtualSize) { + BaseSize(BaseSize), VirtualSize(VirtualSize), IsUnion(Decl->isUnion()) { for (Base &V : SrcVirtualBases) VirtualBases.push_back({ V.Decl, V.Offset + BaseSize, V.Desc, V.R }); diff --git a/clang/lib/AST/Interp/Record.h b/clang/lib/AST/Interp/Record.h index cf0480b3f62f..83e15b125f77 100644 --- a/clang/lib/AST/Interp/Record.h +++ b/clang/lib/AST/Interp/Record.h @@ -53,7 +53,7 @@ class Record final { /// Returns the name of the underlying declaration. const std::string getName() const; /// Checks if the record is a union. - bool isUnion() const { return getDecl()->isUnion(); } + bool isUnion() const { return IsUnion; } /// Returns the size of the record. unsigned getSize() const { return BaseSize; } /// Returns the full size of the record, including records. @@ -132,6 +132,8 @@ class Record final { unsigned BaseSize; /// Size of all virtual bases. unsigned VirtualSize; + /// If this record is a union. 
+ bool IsUnion; }; } // namespace interp From 335e00faaf74f3f7463b32a415d39af0973f521f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 23 May 2024 10:20:42 +0200 Subject: [PATCH 004/433] [clang][Interp][NFC] Add another union test case --- clang/test/AST/Interp/unions.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/clang/test/AST/Interp/unions.cpp b/clang/test/AST/Interp/unions.cpp index bc5604c2b2d0..73e42d57a7b7 100644 --- a/clang/test/AST/Interp/unions.cpp +++ b/clang/test/AST/Interp/unions.cpp @@ -30,3 +30,16 @@ constexpr A ab = {.d = 1.0}; static_assert(ab.d == 1.0, ""); static_assert(ab.a == 1, ""); // both-error {{not an integral constant expression}} \ // both-note {{read of member 'a' of union with active member 'd'}} + +namespace SimpleStore { + union A { + int a; + int b; + }; + constexpr int foo() { + A a{.b = 4}; + a.b = 10; + return a.b; + } + static_assert(foo() == 10, ""); +} From 4cc6d0f4dfb26deb9863901c70258d6d7f0c8ba4 Mon Sep 17 00:00:00 2001 From: Med Ismail Bennani Date: Thu, 23 May 2024 01:25:48 -0700 Subject: [PATCH 005/433] [lldb] Make use of Scripted{Python,}Interface for ScriptedThreadPlan (Reland #70392) (#93149) This patch makes ScriptedThreadPlan conform to the ScriptedInterface & ScriptedPythonInterface facilities by introducing 2 ScriptedThreadPlanInterface & ScriptedThreadPlanPythonInterface classes. This allows us to get rid of every ScriptedThreadPlan-specific SWIG method and re-use the same affordances as other scripting affordances, like Scripted{Process,Thread,Platform} & OperatingSystem. To do so, this adds new transformer methods for `ThreadPlan`, `Stream` & `Event`, to allow the bijection between C++ objects and their python counterparts. This just re-lands #70392 after fixing test failures. 
Signed-off-by: Med Ismail Bennani --- lldb/bindings/python/python-swigsafecast.swig | 17 +- lldb/bindings/python/python-wrapper.swig | 157 +++--------------- lldb/include/lldb/API/SBEvent.h | 4 +- lldb/include/lldb/API/SBStream.h | 6 + .../Interfaces/ScriptedInterface.h | 4 +- .../Interfaces/ScriptedThreadPlanInterface.h | 38 +++++ .../lldb/Interpreter/ScriptInterpreter.h | 55 ++---- lldb/include/lldb/Target/ThreadPlanPython.h | 2 + lldb/include/lldb/lldb-forward.h | 3 + lldb/source/Interpreter/ScriptInterpreter.cpp | 13 ++ .../Python/Interfaces/CMakeLists.txt | 1 + .../ScriptedPlatformPythonInterface.cpp | 2 + .../ScriptedProcessPythonInterface.cpp | 27 ++- .../Interfaces/ScriptedPythonInterface.cpp | 34 +++- .../Interfaces/ScriptedPythonInterface.h | 29 +++- .../ScriptedThreadPlanPythonInterface.cpp | 105 ++++++++++++ .../ScriptedThreadPlanPythonInterface.h | 48 ++++++ .../ScriptedThreadPythonInterface.cpp | 28 +++- .../Python/SWIGPythonBridge.h | 22 +-- .../Python/ScriptInterpreterPython.cpp | 122 +------------- .../Python/ScriptInterpreterPythonImpl.h | 28 +--- lldb/source/Target/ThreadPlanPython.cpp | 98 ++++++----- .../functionalities/step_scripted/Steps.py | 4 +- .../thread_plan/wrap_step_over.py | 2 +- .../Python/PythonTestSuite.cpp | 50 +++--- 25 files changed, 468 insertions(+), 431 deletions(-) create mode 100644 lldb/include/lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h create mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp create mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h diff --git a/lldb/bindings/python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig index d5ea51487271..34f8c6f0ff8d 100644 --- a/lldb/bindings/python/python-swigsafecast.swig +++ b/lldb/bindings/python/python-swigsafecast.swig @@ -37,10 +37,6 @@ PythonObject SWIGBridge::ToSWIGWrapper(const Status& status) { return ToSWIGHelper(new 
lldb::SBError(status), SWIGTYPE_p_lldb__SBError); } -PythonObject SWIGBridge::ToSWIGWrapper(std::unique_ptr stream_sb) { - return ToSWIGHelper(stream_sb.release(), SWIGTYPE_p_lldb__SBStream); -} - PythonObject SWIGBridge::ToSWIGWrapper(std::unique_ptr data_sb) { return ToSWIGHelper(data_sb.release(), SWIGTYPE_p_lldb__SBStructuredData); } @@ -115,9 +111,16 @@ SWIGBridge::ToSWIGWrapper(CommandReturnObject &cmd_retobj) { SWIGTYPE_p_lldb__SBCommandReturnObject); } -ScopedPythonObject SWIGBridge::ToSWIGWrapper(Event *event) { - return ScopedPythonObject(new lldb::SBEvent(event), - SWIGTYPE_p_lldb__SBEvent); +PythonObject SWIGBridge::ToSWIGWrapper(const Stream *s) { + return ToSWIGHelper(new lldb::SBStream(), SWIGTYPE_p_lldb__SBStream); +} + +PythonObject SWIGBridge::ToSWIGWrapper(std::shared_ptr stream_sb) { + return ToSWIGHelper(stream_sb.get(), SWIGTYPE_p_lldb__SBStream); +} + +PythonObject SWIGBridge::ToSWIGWrapper(Event *event) { + return ToSWIGHelper(new lldb::SBEvent(event), SWIGTYPE_p_lldb__SBEvent); } PythonObject SWIGBridge::ToSWIGWrapper( diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 1370afc885d4..28ab58f8ce49 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -229,133 +229,6 @@ PythonObject lldb_private::python::SWIGBridge::LLDBSwigPythonCreateCommandObject return pfunc(SWIGBridge::ToSWIGWrapper(std::move(debugger_sp)), dict); } -PythonObject lldb_private::python::SWIGBridge::LLDBSwigPythonCreateScriptedThreadPlan( - const char *python_class_name, const char *session_dictionary_name, - const lldb_private::StructuredDataImpl &args_impl, - std::string &error_string, const lldb::ThreadPlanSP &thread_plan_sp) { - if (python_class_name == NULL || python_class_name[0] == '\0' || - !session_dictionary_name) - return PythonObject(); - - PyErr_Cleaner py_err_cleaner(true); - - auto dict = PythonModule::MainModule().ResolveName( - session_dictionary_name); - 
auto pfunc = PythonObject::ResolveNameWithDictionary( - python_class_name, dict); - - if (!pfunc.IsAllocated()) { - error_string.append("could not find script class: "); - error_string.append(python_class_name); - return PythonObject(); - } - - PythonObject tp_arg = SWIGBridge::ToSWIGWrapper(thread_plan_sp); - - llvm::Expected arg_info = pfunc.GetArgInfo(); - if (!arg_info) { - llvm::handleAllErrors( - arg_info.takeError(), - [&](PythonException &E) { error_string.append(E.ReadBacktrace()); }, - [&](const llvm::ErrorInfoBase &E) { - error_string.append(E.message()); - }); - return PythonObject(); - } - - PythonObject result = {}; - auto args_sb = std::unique_ptr(new lldb::SBStructuredData(args_impl)); - if (arg_info.get().max_positional_args == 2) { - if (args_sb->IsValid()) { - error_string.assign( - "args passed, but __init__ does not take an args dictionary"); - return PythonObject(); - } - result = pfunc(tp_arg, dict); - } else if (arg_info.get().max_positional_args >= 3) { - result = pfunc(tp_arg, SWIGBridge::ToSWIGWrapper(std::move(args_sb)), dict); - } else { - error_string.assign("wrong number of arguments in __init__, should be 2 or " - "3 (not including self)"); - return PythonObject(); - } - - // FIXME: At this point we should check that the class we found supports all - // the methods that we need. 
- - return result; -} - -bool lldb_private::python::SWIGBridge::LLDBSWIGPythonCallThreadPlan( - void *implementer, const char *method_name, lldb_private::Event *event, - bool &got_error) { - got_error = false; - - PyErr_Cleaner py_err_cleaner(false); - PythonObject self(PyRefType::Borrowed, static_cast(implementer)); - auto pfunc = self.ResolveName(method_name); - - if (!pfunc.IsAllocated()) - return false; - - PythonObject result; - if (event != nullptr) { - ScopedPythonObject event_arg = SWIGBridge::ToSWIGWrapper(event); - result = pfunc(event_arg.obj()); - } else - result = pfunc(); - - if (PyErr_Occurred()) { - got_error = true; - printf("Return value was neither false nor true for call to %s.\n", - method_name); - PyErr_Print(); - return false; - } - - if (result.get() == Py_True) - return true; - else if (result.get() == Py_False) - return false; - - // Somebody returned the wrong thing... - got_error = true; - printf("Wrong return value type for call to %s.\n", method_name); - return false; -} - -bool lldb_private::python::SWIGBridge::LLDBSWIGPythonCallThreadPlan( - void *implementer, const char *method_name, lldb_private::Stream *stream, - bool &got_error) { - got_error = false; - - PyErr_Cleaner py_err_cleaner(false); - PythonObject self(PyRefType::Borrowed, static_cast(implementer)); - auto pfunc = self.ResolveName(method_name); - - if (!pfunc.IsAllocated()) - return false; - - auto *sb_stream = new lldb::SBStream(); - PythonObject sb_stream_arg = - SWIGBridge::ToSWIGWrapper(std::unique_ptr(sb_stream)); - - PythonObject result; - result = pfunc(sb_stream_arg); - - if (PyErr_Occurred()) { - printf("Error occured for call to %s.\n", - method_name); - PyErr_Print(); - got_error = true; - return false; - } - if (stream) - stream->PutCString(sb_stream->GetData()); - return true; - -} - PythonObject lldb_private::python::SWIGBridge::LLDBSwigPythonCreateScriptedBreakpointResolver( const char *python_class_name, const char *session_dictionary_name, const 
StructuredDataImpl &args_impl, @@ -500,9 +373,8 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPythonStopHookCallHandleStop( if (!pfunc.IsAllocated()) return true; - auto *sb_stream = new lldb::SBStream(); - PythonObject sb_stream_arg = - SWIGBridge::ToSWIGWrapper(std::unique_ptr(sb_stream)); + std::shared_ptr sb_stream = std::make_shared(); + PythonObject sb_stream_arg = SWIGBridge::ToSWIGWrapper(sb_stream); PythonObject result = pfunc(SWIGBridge::ToSWIGWrapper(std::move(exc_ctx_sp)), sb_stream_arg); @@ -517,6 +389,7 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPythonStopHookCallHandleStop( // makes an internally help StreamString which I can't interpose, so I // have to copy it over here. stream->PutCString(sb_stream->GetData()); + sb_stream_arg.release(); if (result.get() == Py_False) return false; @@ -753,6 +626,30 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBError(PyObject * data return sb_ptr; } +void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBEvent(PyObject * data) { + lldb::SBEvent *sb_ptr = nullptr; + + int valid_cast = + SWIG_ConvertPtr(data, (void **)&sb_ptr, SWIGTYPE_p_lldb__SBEvent, 0); + + if (valid_cast == -1) + return NULL; + + return sb_ptr; +} + +void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBStream(PyObject * data) { + lldb::SBStream *sb_ptr = nullptr; + + int valid_cast = + SWIG_ConvertPtr(data, (void **)&sb_ptr, SWIGTYPE_p_lldb__SBStream, 0); + + if (valid_cast == -1) + return NULL; + + return sb_ptr; +} + void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBValue(PyObject * data) { lldb::SBValue *sb_ptr = NULL; diff --git a/lldb/include/lldb/API/SBEvent.h b/lldb/include/lldb/API/SBEvent.h index cc116766e85f..85b401ca8cc1 100644 --- a/lldb/include/lldb/API/SBEvent.h +++ b/lldb/include/lldb/API/SBEvent.h @@ -15,6 +15,7 @@ #include namespace lldb_private { +class ScriptInterpreter; namespace python { class SWIGBridge; } @@ -73,11 +74,12 @@ class LLDB_API SBEvent { friend class SBThread; friend 
class SBWatchpoint; + friend class lldb_private::ScriptInterpreter; friend class lldb_private::python::SWIGBridge; SBEvent(lldb::EventSP &event_sp); - SBEvent(lldb_private::Event *event_sp); + SBEvent(lldb_private::Event *event); lldb::EventSP &GetSP() const; diff --git a/lldb/include/lldb/API/SBStream.h b/lldb/include/lldb/API/SBStream.h index 0e33f05b6991..2db379fe12f0 100644 --- a/lldb/include/lldb/API/SBStream.h +++ b/lldb/include/lldb/API/SBStream.h @@ -13,6 +13,10 @@ #include "lldb/API/SBDefines.h" +namespace lldb_private { +class ScriptInterpreter; +} // namespace lldb_private + namespace lldb { class LLDB_API SBStream { @@ -101,6 +105,8 @@ class LLDB_API SBStream { friend class SBValue; friend class SBWatchpoint; + friend class lldb_private::ScriptInterpreter; + lldb_private::Stream *operator->(); lldb_private::Stream *get(); diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h index 9753a916243b..69504dbcda5d 100644 --- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h +++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h @@ -10,7 +10,6 @@ #define LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H #include "lldb/Core/StructuredDataImpl.h" -#include "lldb/Target/ExecutionContext.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/UnimplementedError.h" @@ -52,7 +51,8 @@ class ScriptedInterface { } template - bool CheckStructuredDataObject(llvm::StringRef caller, T obj, Status &error) { + static bool CheckStructuredDataObject(llvm::StringRef caller, T obj, + Status &error) { if (!obj) return ErrorWithMessage(caller, "Null Structured Data object", error); diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h new file mode 100644 index 000000000000..9130f9412cb0 --- /dev/null +++ 
b/lldb/include/lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h @@ -0,0 +1,38 @@ +//===-- ScriptedThreadPlanInterface.h ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_INTERPRETER_INTERFACES_SCRIPTEDTHREADPLANINTERFACE_H +#define LLDB_INTERPRETER_INTERFACES_SCRIPTEDTHREADPLANINTERFACE_H + +#include "lldb/lldb-private.h" + +#include "ScriptedInterface.h" + +namespace lldb_private { +class ScriptedThreadPlanInterface : public ScriptedInterface { +public: + virtual llvm::Expected + CreatePluginObject(llvm::StringRef class_name, + lldb::ThreadPlanSP thread_plan_sp, + const StructuredDataImpl &args_sp) = 0; + + virtual llvm::Expected ExplainsStop(Event *event) { return true; } + + virtual llvm::Expected ShouldStop(Event *event) { return true; } + + virtual llvm::Expected IsStale() { return true; }; + + virtual lldb::StateType GetRunState() { return lldb::eStateStepping; } + + virtual llvm::Expected GetStopDescription(lldb_private::Stream *s) { + return true; + } +}; +} // namespace lldb_private + +#endif // LLDB_INTERPRETER_INTERFACES_SCRIPTEDTHREADPLANINTERFACE_H diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index 932eaa8b8a4a..e821a7db2c67 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -13,8 +13,10 @@ #include "lldb/API/SBBreakpoint.h" #include "lldb/API/SBData.h" #include "lldb/API/SBError.h" +#include "lldb/API/SBEvent.h" #include "lldb/API/SBLaunchInfo.h" #include "lldb/API/SBMemoryRegionInfo.h" +#include "lldb/API/SBStream.h" #include "lldb/Breakpoint/BreakpointOptions.h" #include "lldb/Core/PluginInterface.h" 
#include "lldb/Core/SearchFilter.h" @@ -250,50 +252,6 @@ class ScriptInterpreter : public PluginInterface { return lldb::ValueObjectListSP(); } - virtual StructuredData::ObjectSP - CreateScriptedThreadPlan(const char *class_name, - const StructuredDataImpl &args_data, - std::string &error_str, - lldb::ThreadPlanSP thread_plan_sp) { - return StructuredData::ObjectSP(); - } - - virtual bool - ScriptedThreadPlanExplainsStop(StructuredData::ObjectSP implementor_sp, - Event *event, bool &script_error) { - script_error = true; - return true; - } - - virtual bool - ScriptedThreadPlanShouldStop(StructuredData::ObjectSP implementor_sp, - Event *event, bool &script_error) { - script_error = true; - return true; - } - - virtual bool - ScriptedThreadPlanIsStale(StructuredData::ObjectSP implementor_sp, - bool &script_error) { - script_error = true; - return true; - } - - virtual lldb::StateType - ScriptedThreadPlanGetRunState(StructuredData::ObjectSP implementor_sp, - bool &script_error) { - script_error = true; - return lldb::eStateStepping; - } - - virtual bool - ScriptedThreadPlanGetStopDescription(StructuredData::ObjectSP implementor_sp, - lldb_private::Stream *stream, - bool &script_error) { - script_error = true; - return false; - } - virtual StructuredData::GenericSP CreateScriptedBreakpointResolver(const char *class_name, const StructuredDataImpl &args_data, @@ -592,6 +550,11 @@ class ScriptInterpreter : public PluginInterface { return {}; } + virtual lldb::ScriptedThreadPlanInterfaceSP + CreateScriptedThreadPlanInterface() { + return {}; + } + virtual lldb::OperatingSystemInterfaceSP CreateOperatingSystemInterface() { return {}; } @@ -610,6 +573,10 @@ class ScriptInterpreter : public PluginInterface { Status GetStatusFromSBError(const lldb::SBError &error) const; + Event *GetOpaqueTypeFromSBEvent(const lldb::SBEvent &event) const; + + Stream *GetOpaqueTypeFromSBStream(const lldb::SBStream &stream) const; + lldb::BreakpointSP GetOpaqueTypeFromSBBreakpoint(const 
lldb::SBBreakpoint &breakpoint) const; diff --git a/lldb/include/lldb/Target/ThreadPlanPython.h b/lldb/include/lldb/Target/ThreadPlanPython.h index 64854d66b8f2..da106faf951d 100644 --- a/lldb/include/lldb/Target/ThreadPlanPython.h +++ b/lldb/include/lldb/Target/ThreadPlanPython.h @@ -13,6 +13,7 @@ #include #include "lldb/Core/StructuredDataImpl.h" +#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h" #include "lldb/Target/Process.h" #include "lldb/Target/StopInfo.h" #include "lldb/Target/Target.h" @@ -70,6 +71,7 @@ class ThreadPlanPython : public ThreadPlan { StreamString m_stop_description; // Cache the stop description here bool m_did_push; bool m_stop_others; + lldb::ScriptedThreadPlanInterfaceSP m_interface; ThreadPlanPython(const ThreadPlanPython &) = delete; const ThreadPlanPython &operator=(const ThreadPlanPython &) = delete; diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index 10ba921b9dac..e2b24819bce9 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -187,6 +187,7 @@ class ScriptedMetadata; class ScriptedPlatformInterface; class ScriptedProcessInterface; class ScriptedThreadInterface; +class ScriptedThreadPlanInterface; class ScriptedSyntheticChildren; class SearchFilter; class Section; @@ -403,6 +404,8 @@ typedef std::unique_ptr ScriptedProcessInterfaceUP; typedef std::shared_ptr ScriptedThreadInterfaceSP; +typedef std::shared_ptr + ScriptedThreadPlanInterfaceSP; typedef std::shared_ptr SectionSP; typedef std::unique_ptr SectionListUP; typedef std::weak_ptr SectionWP; diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 8dd499ce819a..75b2a39a8d11 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -101,6 +101,19 @@ ScriptInterpreter::GetStatusFromSBError(const lldb::SBError &error) const { return Status(); } +Event * 
+ScriptInterpreter::GetOpaqueTypeFromSBEvent(const lldb::SBEvent &event) const { + return event.m_opaque_ptr; +} + +Stream *ScriptInterpreter::GetOpaqueTypeFromSBStream( + const lldb::SBStream &stream) const { + if (stream.m_opaque_up) + return const_cast(stream).m_opaque_up.get(); + + return nullptr; +} + std::optional ScriptInterpreter::GetOpaqueTypeFromSBMemoryRegionInfo( const lldb::SBMemoryRegionInfo &mem_region) const { diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt index b22abc49c92a..c60e4bb503a3 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt @@ -24,6 +24,7 @@ add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces ScriptedPythonInterface.cpp ScriptedProcessPythonInterface.cpp ScriptedThreadPythonInterface.cpp + ScriptedThreadPlanPythonInterface.cpp ScriptedPlatformPythonInterface.cpp LINK_LIBS diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp index 9ba4731032bd..6e93bec80056 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp @@ -20,6 +20,8 @@ #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedPlatformPythonInterface.h" +#include "lldb/Target/ExecutionContext.h" + using namespace lldb; using namespace lldb_private; using namespace lldb_private::python; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp index e86b34d6b930..313c597ce48f 100644 --- 
a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp @@ -49,7 +49,8 @@ StructuredData::DictionarySP ScriptedProcessPythonInterface::GetCapabilities() { StructuredData::DictionarySP dict = Dispatch("get_capabilities", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, + error)) return {}; return dict; @@ -90,7 +91,8 @@ StructuredData::DictionarySP ScriptedProcessPythonInterface::GetThreadsInfo() { StructuredData::DictionarySP dict = Dispatch("get_threads_info", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, + error)) return {}; return dict; @@ -106,7 +108,8 @@ bool ScriptedProcessPythonInterface::CreateBreakpoint(lldb::addr_t addr, if (py_error.Fail()) error = py_error; - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return {}; return obj->GetBooleanValue(); @@ -131,7 +134,8 @@ lldb::offset_t ScriptedProcessPythonInterface::WriteMemoryAtAddress( StructuredData::ObjectSP obj = Dispatch("write_memory_at_address", py_error, addr, data_sp, error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return LLDB_INVALID_OFFSET; // If there was an error on the python call, surface it to the user. 
@@ -146,7 +150,8 @@ StructuredData::ArraySP ScriptedProcessPythonInterface::GetLoadedImages() { StructuredData::ArraySP array = Dispatch("get_loaded_images", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, array, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, array, + error)) return {}; return array; @@ -156,7 +161,8 @@ lldb::pid_t ScriptedProcessPythonInterface::GetProcessID() { Status error; StructuredData::ObjectSP obj = Dispatch("get_process_id", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return LLDB_INVALID_PROCESS_ID; return obj->GetUnsignedIntegerValue(LLDB_INVALID_PROCESS_ID); @@ -166,7 +172,8 @@ bool ScriptedProcessPythonInterface::IsAlive() { Status error; StructuredData::ObjectSP obj = Dispatch("is_alive", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return {}; return obj->GetBooleanValue(); @@ -177,7 +184,8 @@ ScriptedProcessPythonInterface::GetScriptedThreadPluginName() { Status error; StructuredData::ObjectSP obj = Dispatch("get_scripted_thread_plugin", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return {}; return obj->GetStringValue().str(); @@ -193,7 +201,8 @@ StructuredData::DictionarySP ScriptedProcessPythonInterface::GetMetadata() { StructuredData::DictionarySP dict = Dispatch("get_process_metadata", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, + error)) return {}; return dict; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp 
b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp index 6f22503b279c..7d072212676e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp @@ -26,6 +26,15 @@ ScriptedPythonInterface::ScriptedPythonInterface( ScriptInterpreterPythonImpl &interpreter) : ScriptedInterface(), m_interpreter(interpreter) {} +template <> +void ScriptedPythonInterface::ReverseTransform( + lldb_private::Stream *&original_arg, python::PythonObject transformed_arg, + Status &error) { + Stream *s = ExtractValueFromPythonObject(transformed_arg, error); + *original_arg = *s; + original_arg->PutCString(static_cast(s)->GetData()); +} + template <> StructuredData::ArraySP ScriptedPythonInterface::ExtractValueFromPythonObject( @@ -48,12 +57,33 @@ Status ScriptedPythonInterface::ExtractValueFromPythonObject( if (lldb::SBError *sb_error = reinterpret_cast( python::LLDBSWIGPython_CastPyObjectToSBError(p.get()))) return m_interpreter.GetStatusFromSBError(*sb_error); - else - error.SetErrorString("Couldn't cast lldb::SBError to lldb::Status."); + error.SetErrorString("Couldn't cast lldb::SBError to lldb::Status."); return {}; } +template <> +Event *ScriptedPythonInterface::ExtractValueFromPythonObject( + python::PythonObject &p, Status &error) { + if (lldb::SBEvent *sb_event = reinterpret_cast( + python::LLDBSWIGPython_CastPyObjectToSBEvent(p.get()))) + return m_interpreter.GetOpaqueTypeFromSBEvent(*sb_event); + error.SetErrorString("Couldn't cast lldb::SBEvent to lldb_private::Event."); + + return nullptr; +} + +template <> +Stream *ScriptedPythonInterface::ExtractValueFromPythonObject( + python::PythonObject &p, Status &error) { + if (lldb::SBStream *sb_stream = reinterpret_cast( + python::LLDBSWIGPython_CastPyObjectToSBStream(p.get()))) + return m_interpreter.GetOpaqueTypeFromSBStream(*sb_stream); + error.SetErrorString("Couldn't cast 
lldb::SBStream to lldb_private::Stream."); + + return nullptr; +} + template <> lldb::DataExtractorSP ScriptedPythonInterface::ExtractValueFromPythonObject( diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h index 163659234466..062bf1fcff4a 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h @@ -115,7 +115,7 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { PythonObject::ResolveNameWithDictionary( class_name, dict); if (!init.IsAllocated()) - return create_error(llvm::formatv("Could not find script class: %s", + return create_error(llvm::formatv("Could not find script class: {0}", class_name.data())); std::tuple original_args = std::forward_as_tuple(args...); @@ -248,8 +248,11 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { (PyObject *)m_object_instance_sp->GetValue()); if (!implementor.IsAllocated()) - return ErrorWithMessage(caller_signature, - "Python implementor not allocated.", error); + return llvm::is_contained(GetAbstractMethods(), method_name) + ? 
ErrorWithMessage(caller_signature, + "Python implementor not allocated.", + error) + : T{}; std::tuple original_args = std::forward_as_tuple(args...); auto transformed_args = TransformArgs(original_args); @@ -322,6 +325,10 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { return python::SWIGBridge::ToSWIGWrapper(arg); } + python::PythonObject Transform(lldb::ThreadPlanSP arg) { + return python::SWIGBridge::ToSWIGWrapper(arg); + } + python::PythonObject Transform(lldb::ProcessAttachInfoSP arg) { return python::SWIGBridge::ToSWIGWrapper(arg); } @@ -330,6 +337,14 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { return python::SWIGBridge::ToSWIGWrapper(arg); } + python::PythonObject Transform(Event *arg) { + return python::SWIGBridge::ToSWIGWrapper(arg); + } + + python::PythonObject Transform(Stream *arg) { + return python::SWIGBridge::ToSWIGWrapper(arg); + } + python::PythonObject Transform(lldb::DataExtractorSP arg) { return python::SWIGBridge::ToSWIGWrapper(arg); } @@ -427,6 +442,14 @@ template <> Status ScriptedPythonInterface::ExtractValueFromPythonObject( python::PythonObject &p, Status &error); +template <> +Event *ScriptedPythonInterface::ExtractValueFromPythonObject( + python::PythonObject &p, Status &error); + +template <> +Stream *ScriptedPythonInterface::ExtractValueFromPythonObject( + python::PythonObject &p, Status &error); + template <> lldb::BreakpointSP ScriptedPythonInterface::ExtractValueFromPythonObject( diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp new file mode 100644 index 000000000000..b7e475812f22 --- /dev/null +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp @@ -0,0 +1,105 @@ +//===-- ScriptedThreadPlanPythonInterface.cpp -----------------------------===// +// +// Part of the LLVM Project, under the 
Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/Config.h" +#include "lldb/Utility/Log.h" +#include "lldb/lldb-enumerations.h" + +#if LLDB_ENABLE_PYTHON + +// LLDB Python header must be included first +#include "../lldb-python.h" + +#include "../SWIGPythonBridge.h" +#include "../ScriptInterpreterPythonImpl.h" +#include "ScriptedThreadPlanPythonInterface.h" + +using namespace lldb; +using namespace lldb_private; +using namespace lldb_private::python; + +ScriptedThreadPlanPythonInterface::ScriptedThreadPlanPythonInterface( + ScriptInterpreterPythonImpl &interpreter) + : ScriptedThreadPlanInterface(), ScriptedPythonInterface(interpreter) {} + +llvm::Expected +ScriptedThreadPlanPythonInterface::CreatePluginObject( + const llvm::StringRef class_name, lldb::ThreadPlanSP thread_plan_sp, + const StructuredDataImpl &args_sp) { + return ScriptedPythonInterface::CreatePluginObject(class_name, nullptr, + thread_plan_sp, args_sp); +} + +llvm::Expected +ScriptedThreadPlanPythonInterface::ExplainsStop(Event *event) { + Status error; + StructuredData::ObjectSP obj = Dispatch("explains_stop", error, event); + + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) { + if (!obj) + return false; + return error.ToError(); + } + + return obj->GetBooleanValue(); +} + +llvm::Expected +ScriptedThreadPlanPythonInterface::ShouldStop(Event *event) { + Status error; + StructuredData::ObjectSP obj = Dispatch("should_stop", error, event); + + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) { + if (!obj) + return false; + return error.ToError(); + } + + return obj->GetBooleanValue(); +} + +llvm::Expected ScriptedThreadPlanPythonInterface::IsStale() { + Status error; + StructuredData::ObjectSP obj = 
Dispatch("is_stale", error); + + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) { + if (!obj) + return false; + return error.ToError(); + } + + return obj->GetBooleanValue(); +} + +lldb::StateType ScriptedThreadPlanPythonInterface::GetRunState() { + Status error; + StructuredData::ObjectSP obj = Dispatch("should_step", error); + + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) + return lldb::eStateStepping; + + return static_cast(obj->GetUnsignedIntegerValue( + static_cast(lldb::eStateStepping))); +} + +llvm::Expected +ScriptedThreadPlanPythonInterface::GetStopDescription(lldb_private::Stream *s) { + Status error; + Dispatch("stop_description", error, s); + + if (error.Fail()) + return error.ToError(); + + return true; +} + +#endif diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h new file mode 100644 index 000000000000..33f086786c47 --- /dev/null +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h @@ -0,0 +1,48 @@ +//===-- ScriptedThreadPlanPythonInterface.h ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H +#define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H + +#include "lldb/Host/Config.h" + +#if LLDB_ENABLE_PYTHON + +#include "ScriptedPythonInterface.h" +#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h" +#include + +namespace lldb_private { +class ScriptedThreadPlanPythonInterface : public ScriptedThreadPlanInterface, + public ScriptedPythonInterface { +public: + ScriptedThreadPlanPythonInterface(ScriptInterpreterPythonImpl &interpreter); + + llvm::Expected + CreatePluginObject(const llvm::StringRef class_name, + lldb::ThreadPlanSP thread_plan_sp, + const StructuredDataImpl &args_sp) override; + + llvm::SmallVector GetAbstractMethods() const override { + return {}; + } + + llvm::Expected ExplainsStop(Event *event) override; + + llvm::Expected ShouldStop(Event *event) override; + + llvm::Expected IsStale() override; + + lldb::StateType GetRunState() override; + + llvm::Expected GetStopDescription(lldb_private::Stream *s) override; +}; +} // namespace lldb_private + +#endif // LLDB_ENABLE_PYTHON +#endif // LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp index 18e268527eb2..8af89d761764 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/Host/Config.h" +#include 
"lldb/Target/ExecutionContext.h" #include "lldb/Utility/Log.h" #include "lldb/lldb-enumerations.h" @@ -44,7 +45,8 @@ lldb::tid_t ScriptedThreadPythonInterface::GetThreadID() { Status error; StructuredData::ObjectSP obj = Dispatch("get_thread_id", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return LLDB_INVALID_THREAD_ID; return obj->GetUnsignedIntegerValue(LLDB_INVALID_THREAD_ID); @@ -54,7 +56,8 @@ std::optional ScriptedThreadPythonInterface::GetName() { Status error; StructuredData::ObjectSP obj = Dispatch("get_name", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return {}; return obj->GetStringValue().str(); @@ -64,7 +67,8 @@ lldb::StateType ScriptedThreadPythonInterface::GetState() { Status error; StructuredData::ObjectSP obj = Dispatch("get_state", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return eStateInvalid; return static_cast(obj->GetUnsignedIntegerValue(eStateInvalid)); @@ -74,7 +78,8 @@ std::optional ScriptedThreadPythonInterface::GetQueue() { Status error; StructuredData::ObjectSP obj = Dispatch("get_queue", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return {}; return obj->GetStringValue().str(); @@ -85,7 +90,8 @@ StructuredData::DictionarySP ScriptedThreadPythonInterface::GetStopReason() { StructuredData::DictionarySP dict = Dispatch("get_stop_reason", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, + error)) return {}; return dict; @@ -96,7 +102,8 @@ 
StructuredData::ArraySP ScriptedThreadPythonInterface::GetStackFrames() { StructuredData::ArraySP arr = Dispatch("get_stackframes", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, arr, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, arr, + error)) return {}; return arr; @@ -107,7 +114,8 @@ StructuredData::DictionarySP ScriptedThreadPythonInterface::GetRegisterInfo() { StructuredData::DictionarySP dict = Dispatch("get_register_info", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, + error)) return {}; return dict; @@ -117,7 +125,8 @@ std::optional ScriptedThreadPythonInterface::GetRegisterContext() { Status error; StructuredData::ObjectSP obj = Dispatch("get_register_context", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) return {}; return obj->GetAsString()->GetValue().str(); @@ -128,7 +137,8 @@ StructuredData::ArraySP ScriptedThreadPythonInterface::GetExtendedInfo() { StructuredData::ArraySP arr = Dispatch("get_extended_info", error); - if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, arr, error)) + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, arr, + error)) return {}; return arr; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index c1a11b9134d6..95eb5a782097 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -97,12 +97,14 @@ class SWIGBridge { static PythonObject ToSWIGWrapper(lldb::ExecutionContextRefSP ctx_sp); static PythonObject ToSWIGWrapper(const TypeSummaryOptions &summary_options); static PythonObject ToSWIGWrapper(const SymbolContext &sym_ctx); + static 
PythonObject ToSWIGWrapper(const Stream *stream); + static PythonObject ToSWIGWrapper(std::shared_ptr stream_sb); + static PythonObject ToSWIGWrapper(Event *event); static PythonObject ToSWIGWrapper(lldb::ProcessAttachInfoSP attach_info_sp); static PythonObject ToSWIGWrapper(lldb::ProcessLaunchInfoSP launch_info_sp); static PythonObject ToSWIGWrapper(lldb::DataExtractorSP data_extractor_sp); - static PythonObject ToSWIGWrapper(std::unique_ptr stream_sb); static PythonObject ToSWIGWrapper(std::unique_ptr data_sb); static PythonObject @@ -112,7 +114,6 @@ class SWIGBridge { static python::ScopedPythonObject ToSWIGWrapper(CommandReturnObject &cmd_retobj); - static python::ScopedPythonObject ToSWIGWrapper(Event *event); // These prototypes are the Pythonic implementations of the required // callbacks. Although these are scripting-language specific, their definition // depends on the public API. @@ -147,21 +148,6 @@ class SWIGBridge { const char *session_dictionary_name, lldb::DebuggerSP debugger_sp); - static python::PythonObject LLDBSwigPythonCreateScriptedThreadPlan( - const char *python_class_name, const char *session_dictionary_name, - const StructuredDataImpl &args_data, std::string &error_string, - const lldb::ThreadPlanSP &thread_plan_sp); - - static bool LLDBSWIGPythonCallThreadPlan(void *implementor, - const char *method_name, - lldb_private::Event *event_sp, - bool &got_error); - - static bool LLDBSWIGPythonCallThreadPlan(void *implementor, - const char *method_name, - lldb_private::Stream *stream, - bool &got_error); - static python::PythonObject LLDBSwigPythonCreateScriptedBreakpointResolver( const char *python_class_name, const char *session_dictionary_name, const StructuredDataImpl &args, const lldb::BreakpointSP &bkpt_sp); @@ -269,6 +255,8 @@ void *LLDBSWIGPython_CastPyObjectToSBBreakpoint(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBAttachInfo(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBLaunchInfo(PyObject *data); void 
*LLDBSWIGPython_CastPyObjectToSBError(PyObject *data); +void *LLDBSWIGPython_CastPyObjectToSBEvent(PyObject *data); +void *LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBMemoryRegionInfo(PyObject *data); } // namespace python diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index ce52f3595247..58ef8f674f72 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -17,6 +17,7 @@ #include "Interfaces/OperatingSystemPythonInterface.h" #include "Interfaces/ScriptedPlatformPythonInterface.h" #include "Interfaces/ScriptedProcessPythonInterface.h" +#include "Interfaces/ScriptedThreadPlanPythonInterface.h" #include "Interfaces/ScriptedThreadPythonInterface.h" #include "PythonDataObjects.h" #include "PythonReadline.h" @@ -1537,6 +1538,11 @@ ScriptInterpreterPythonImpl::CreateScriptedThreadInterface() { return std::make_shared(*this); } +ScriptedThreadPlanInterfaceSP +ScriptInterpreterPythonImpl::CreateScriptedThreadPlanInterface() { + return std::make_shared(*this); +} + OperatingSystemInterfaceSP ScriptInterpreterPythonImpl::CreateOperatingSystemInterface() { return std::make_shared(*this); @@ -1553,122 +1559,6 @@ ScriptInterpreterPythonImpl::CreateStructuredDataFromScriptObject( return py_obj.CreateStructuredObject(); } -StructuredData::ObjectSP ScriptInterpreterPythonImpl::CreateScriptedThreadPlan( - const char *class_name, const StructuredDataImpl &args_data, - std::string &error_str, lldb::ThreadPlanSP thread_plan_sp) { - if (class_name == nullptr || class_name[0] == '\0') - return StructuredData::ObjectSP(); - - if (!thread_plan_sp.get()) - return {}; - - Debugger &debugger = thread_plan_sp->GetTarget().GetDebugger(); - ScriptInterpreterPythonImpl 
*python_interpreter = - GetPythonInterpreter(debugger); - - if (!python_interpreter) - return {}; - - Locker py_lock(this, - Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); - PythonObject ret_val = SWIGBridge::LLDBSwigPythonCreateScriptedThreadPlan( - class_name, python_interpreter->m_dictionary_name.c_str(), args_data, - error_str, thread_plan_sp); - if (!ret_val) - return {}; - - return StructuredData::ObjectSP( - new StructuredPythonObject(std::move(ret_val))); -} - -bool ScriptInterpreterPythonImpl::ScriptedThreadPlanExplainsStop( - StructuredData::ObjectSP implementor_sp, Event *event, bool &script_error) { - bool explains_stop = true; - StructuredData::Generic *generic = nullptr; - if (implementor_sp) - generic = implementor_sp->GetAsGeneric(); - if (generic) { - Locker py_lock(this, - Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); - explains_stop = SWIGBridge::LLDBSWIGPythonCallThreadPlan( - generic->GetValue(), "explains_stop", event, script_error); - if (script_error) - return true; - } - return explains_stop; -} - -bool ScriptInterpreterPythonImpl::ScriptedThreadPlanShouldStop( - StructuredData::ObjectSP implementor_sp, Event *event, bool &script_error) { - bool should_stop = true; - StructuredData::Generic *generic = nullptr; - if (implementor_sp) - generic = implementor_sp->GetAsGeneric(); - if (generic) { - Locker py_lock(this, - Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); - should_stop = SWIGBridge::LLDBSWIGPythonCallThreadPlan( - generic->GetValue(), "should_stop", event, script_error); - if (script_error) - return true; - } - return should_stop; -} - -bool ScriptInterpreterPythonImpl::ScriptedThreadPlanIsStale( - StructuredData::ObjectSP implementor_sp, bool &script_error) { - bool is_stale = true; - StructuredData::Generic *generic = nullptr; - if (implementor_sp) - generic = implementor_sp->GetAsGeneric(); - if (generic) { - Locker py_lock(this, - Locker::AcquireLock | Locker::InitSession | 
Locker::NoSTDIN); - is_stale = SWIGBridge::LLDBSWIGPythonCallThreadPlan( - generic->GetValue(), "is_stale", (Event *)nullptr, script_error); - if (script_error) - return true; - } - return is_stale; -} - -lldb::StateType ScriptInterpreterPythonImpl::ScriptedThreadPlanGetRunState( - StructuredData::ObjectSP implementor_sp, bool &script_error) { - bool should_step = false; - StructuredData::Generic *generic = nullptr; - if (implementor_sp) - generic = implementor_sp->GetAsGeneric(); - if (generic) { - Locker py_lock(this, - Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); - should_step = SWIGBridge::LLDBSWIGPythonCallThreadPlan( - generic->GetValue(), "should_step", (Event *)nullptr, script_error); - if (script_error) - should_step = true; - } - if (should_step) - return lldb::eStateStepping; - return lldb::eStateRunning; -} - -bool -ScriptInterpreterPythonImpl::ScriptedThreadPlanGetStopDescription( - StructuredData::ObjectSP implementor_sp, lldb_private::Stream *stream, - bool &script_error) { - StructuredData::Generic *generic = nullptr; - if (implementor_sp) - generic = implementor_sp->GetAsGeneric(); - if (!generic) { - script_error = true; - return false; - } - Locker py_lock(this, - Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); - return SWIGBridge::LLDBSWIGPythonCallThreadPlan( - generic->GetValue(), "stop_description", stream, script_error); -} - - StructuredData::GenericSP ScriptInterpreterPythonImpl::CreateScriptedBreakpointResolver( const char *class_name, const StructuredDataImpl &args_data, diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index fcd21dff612b..fa2354053473 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -77,34 +77,9 @@ class ScriptInterpreterPythonImpl : public 
ScriptInterpreterPython { StructuredData::GenericSP CreateScriptCommandObject(const char *class_name) override; - StructuredData::ObjectSP - CreateScriptedThreadPlan(const char *class_name, - const StructuredDataImpl &args_data, - std::string &error_str, - lldb::ThreadPlanSP thread_plan) override; - StructuredData::ObjectSP CreateStructuredDataFromScriptObject(ScriptObject obj) override; - bool ScriptedThreadPlanExplainsStop(StructuredData::ObjectSP implementor_sp, - Event *event, - bool &script_error) override; - - bool ScriptedThreadPlanShouldStop(StructuredData::ObjectSP implementor_sp, - Event *event, bool &script_error) override; - - bool ScriptedThreadPlanIsStale(StructuredData::ObjectSP implementor_sp, - bool &script_error) override; - - lldb::StateType - ScriptedThreadPlanGetRunState(StructuredData::ObjectSP implementor_sp, - bool &script_error) override; - - bool - ScriptedThreadPlanGetStopDescription(StructuredData::ObjectSP implementor_sp, - lldb_private::Stream *s, - bool &script_error) override; - StructuredData::GenericSP CreateScriptedBreakpointResolver(const char *class_name, const StructuredDataImpl &args_data, @@ -136,6 +111,9 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython { lldb::ScriptedThreadInterfaceSP CreateScriptedThreadInterface() override; + lldb::ScriptedThreadPlanInterfaceSP + CreateScriptedThreadPlanInterface() override; + lldb::OperatingSystemInterfaceSP CreateOperatingSystemInterface() override; StructuredData::ObjectSP diff --git a/lldb/source/Target/ThreadPlanPython.cpp b/lldb/source/Target/ThreadPlanPython.cpp index d6de6b3c3cf0..65d1737c2dc5 100644 --- a/lldb/source/Target/ThreadPlanPython.cpp +++ b/lldb/source/Target/ThreadPlanPython.cpp @@ -10,6 +10,7 @@ #include "lldb/Core/Debugger.h" #include "lldb/Interpreter/CommandInterpreter.h" +#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h" #include "lldb/Interpreter/ScriptInterpreter.h" #include "lldb/Target/Process.h" #include 
"lldb/Target/RegisterContext.h" @@ -32,6 +33,23 @@ ThreadPlanPython::ThreadPlanPython(Thread &thread, const char *class_name, eVoteNoOpinion, eVoteNoOpinion), m_class_name(class_name), m_args_data(args_data), m_did_push(false), m_stop_others(false) { + ScriptInterpreter *interpreter = GetScriptInterpreter(); + if (!interpreter) { + SetPlanComplete(false); + // FIXME: error handling + return; + } + + m_interface = interpreter->CreateScriptedThreadPlanInterface(); + if (!m_interface) { + SetPlanComplete(false); + // FIXME: error handling + // error.SetErrorStringWithFormat( + // "ThreadPlanPython::%s () - ERROR: %s", __FUNCTION__, + // "Script interpreter couldn't create Scripted Thread Plan Interface"); + return; + } + SetIsControllingPlan(true); SetOkayToDiscard(true); SetPrivate(false); @@ -60,13 +78,14 @@ void ThreadPlanPython::DidPush() { // We set up the script side in DidPush, so that it can push other plans in // the constructor, and doesn't have to care about the details of DidPush. 
m_did_push = true; - if (!m_class_name.empty()) { - ScriptInterpreter *script_interp = GetScriptInterpreter(); - if (script_interp) { - m_implementation_sp = script_interp->CreateScriptedThreadPlan( - m_class_name.c_str(), m_args_data, m_error_str, - this->shared_from_this()); - } + if (m_interface) { + auto obj_or_err = m_interface->CreatePluginObject( + m_class_name, this->shared_from_this(), m_args_data); + if (!obj_or_err) { + m_error_str = llvm::toString(obj_or_err.takeError()); + SetPlanComplete(false); + } else + m_implementation_sp = *obj_or_err; } } @@ -77,14 +96,13 @@ bool ThreadPlanPython::ShouldStop(Event *event_ptr) { bool should_stop = true; if (m_implementation_sp) { - ScriptInterpreter *script_interp = GetScriptInterpreter(); - if (script_interp) { - bool script_error; - should_stop = script_interp->ScriptedThreadPlanShouldStop( - m_implementation_sp, event_ptr, script_error); - if (script_error) - SetPlanComplete(false); - } + auto should_stop_or_err = m_interface->ShouldStop(event_ptr); + if (!should_stop_or_err) { + LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), should_stop_or_err.takeError(), + "Can't call ScriptedThreadPlan::ShouldStop."); + SetPlanComplete(false); + } else + should_stop = *should_stop_or_err; } return should_stop; } @@ -96,14 +114,13 @@ bool ThreadPlanPython::IsPlanStale() { bool is_stale = true; if (m_implementation_sp) { - ScriptInterpreter *script_interp = GetScriptInterpreter(); - if (script_interp) { - bool script_error; - is_stale = script_interp->ScriptedThreadPlanIsStale(m_implementation_sp, - script_error); - if (script_error) - SetPlanComplete(false); - } + auto is_stale_or_err = m_interface->IsStale(); + if (!is_stale_or_err) { + LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), is_stale_or_err.takeError(), + "Can't call ScriptedThreadPlan::IsStale."); + SetPlanComplete(false); + } else + is_stale = *is_stale_or_err; } return is_stale; } @@ -115,14 +132,14 @@ bool ThreadPlanPython::DoPlanExplainsStop(Event *event_ptr) { bool 
explains_stop = true; if (m_implementation_sp) { - ScriptInterpreter *script_interp = GetScriptInterpreter(); - if (script_interp) { - bool script_error; - explains_stop = script_interp->ScriptedThreadPlanExplainsStop( - m_implementation_sp, event_ptr, script_error); - if (script_error) - SetPlanComplete(false); - } + auto explains_stop_or_error = m_interface->ExplainsStop(event_ptr); + if (!explains_stop_or_error) { + LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), + explains_stop_or_error.takeError(), + "Can't call ScriptedThreadPlan::ExplainsStop."); + SetPlanComplete(false); + } else + explains_stop = *explains_stop_or_error; } return explains_stop; } @@ -150,14 +167,8 @@ lldb::StateType ThreadPlanPython::GetPlanRunState() { LLDB_LOGF(log, "%s called on Python Thread Plan: %s )", LLVM_PRETTY_FUNCTION, m_class_name.c_str()); lldb::StateType run_state = eStateRunning; - if (m_implementation_sp) { - ScriptInterpreter *script_interp = GetScriptInterpreter(); - if (script_interp) { - bool script_error; - run_state = script_interp->ScriptedThreadPlanGetRunState( - m_implementation_sp, script_error); - } - } + if (m_implementation_sp) + run_state = m_interface->GetRunState(); return run_state; } @@ -168,12 +179,13 @@ void ThreadPlanPython::GetDescription(Stream *s, lldb::DescriptionLevel level) { if (m_implementation_sp) { ScriptInterpreter *script_interp = GetScriptInterpreter(); if (script_interp) { - bool script_error; - bool added_desc = script_interp->ScriptedThreadPlanGetStopDescription( - m_implementation_sp, s, script_error); - if (script_error || !added_desc) + auto desc_or_err = m_interface->GetStopDescription(s); + if (!desc_or_err || !*desc_or_err) { + LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), desc_or_err.takeError(), + "Can't call ScriptedThreadPlan::GetStopDescription."); s->Printf("Python thread plan implemented by class %s.", m_class_name.c_str()); + } } return; } diff --git a/lldb/test/API/functionalities/step_scripted/Steps.py 
b/lldb/test/API/functionalities/step_scripted/Steps.py index 7527607be847..3325dba75365 100644 --- a/lldb/test/API/functionalities/step_scripted/Steps.py +++ b/lldb/test/API/functionalities/step_scripted/Steps.py @@ -47,7 +47,7 @@ def queue_child_thread_plan(self): # This plan does a step-over until a variable changes value. class StepUntil(StepWithChild): - def __init__(self, thread_plan, args_data, dict): + def __init__(self, thread_plan, args_data): self.thread_plan = thread_plan self.frame = thread_plan.GetThread().frames[0] self.target = thread_plan.GetThread().GetProcess().GetTarget() @@ -99,7 +99,7 @@ def stop_description(self, stream): class StepReportsStopOthers: stop_mode_dict = {} - def __init__(self, thread_plan, args_data, dict): + def __init__(self, thread_plan, args_data): self.thread_plan = thread_plan self.key = str(args_data.GetValueForKey("token").GetUnsignedIntegerValue(1000)) diff --git a/lldb/test/API/functionalities/thread_plan/wrap_step_over.py b/lldb/test/API/functionalities/thread_plan/wrap_step_over.py index 802aaf2d3ffd..ebb795abfa0e 100644 --- a/lldb/test/API/functionalities/thread_plan/wrap_step_over.py +++ b/lldb/test/API/functionalities/thread_plan/wrap_step_over.py @@ -2,7 +2,7 @@ class WrapStepOver: - def __init__(self, thread_plan, args_data, dict): + def __init__(self, thread_plan, args_data): self.plan = thread_plan thread = thread_plan.GetThread() target = thread.GetProcess().GetTarget() diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index 23162436d42c..017953b372e3 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -80,26 +80,6 @@ lldb_private::python::SWIGBridge::LLDBSwigPythonCreateCommandObject( return python::PythonObject(); } -python::PythonObject -lldb_private::python::SWIGBridge::LLDBSwigPythonCreateScriptedThreadPlan( - const char 
*python_class_name, const char *session_dictionary_name, - const StructuredDataImpl &args_data, std::string &error_string, - const lldb::ThreadPlanSP &thread_plan_sp) { - return python::PythonObject(); -} - -bool lldb_private::python::SWIGBridge::LLDBSWIGPythonCallThreadPlan( - void *implementor, const char *method_name, Event *event_sp, - bool &got_error) { - return false; -} - -bool lldb_private::python::SWIGBridge::LLDBSWIGPythonCallThreadPlan( - void *implementor, const char *method_name, Stream *event_sp, - bool &got_error) { - return false; -} - python::PythonObject lldb_private::python::SWIGBridge:: LLDBSwigPythonCreateScriptedBreakpointResolver( const char *python_class_name, const char *session_dictionary_name, @@ -154,6 +134,16 @@ lldb_private::python::LLDBSWIGPython_CastPyObjectToSBError(PyObject *data) { return nullptr; } +void * +lldb_private::python::LLDBSWIGPython_CastPyObjectToSBEvent(PyObject *data) { + return nullptr; +} + +void * +lldb_private::python::LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data) { + return nullptr; +} + void * lldb_private::python::LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data) { return nullptr; @@ -311,6 +301,11 @@ lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::ExecutionContextRefSP) { return python::PythonObject(); } +python::PythonObject +lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::ThreadPlanSP) { + return python::PythonObject(); +} + python::PythonObject lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::ProcessSP) { return python::PythonObject(); @@ -320,3 +315,18 @@ python::PythonObject lldb_private::python::SWIGBridge::ToSWIGWrapper( const lldb_private::StructuredDataImpl &) { return python::PythonObject(); } + +python::PythonObject +lldb_private::python::SWIGBridge::ToSWIGWrapper(Event *event) { + return python::PythonObject(); +} + +python::PythonObject +lldb_private::python::SWIGBridge::ToSWIGWrapper(const Stream *stream) { + return python::PythonObject(); +} + +python::PythonObject 
lldb_private::python::SWIGBridge::ToSWIGWrapper( + std::shared_ptr stream_sb) { + return python::PythonObject(); +} From 8f50bcaadced3041a6a132286e3b62ad6fd9cf74 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 23 May 2024 08:27:22 +0000 Subject: [PATCH 006/433] [gn build] Port 781b13538e55 --- llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn index 188c71805f27..9075ada55c0f 100644 --- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn @@ -95,6 +95,7 @@ static_library("Sema") { "SemaTemplateInstantiateDecl.cpp", "SemaTemplateVariadic.cpp", "SemaType.cpp", + "SemaX86.cpp", "TypeLocBuilder.cpp", ] } From ae3f68066c8f282145435880107c1d3dc26ec3b8 Mon Sep 17 00:00:00 2001 From: Med Ismail Bennani Date: Thu, 23 May 2024 01:46:29 -0700 Subject: [PATCH 007/433] Revert "[lldb] Make use of Scripted{Python,}Interface for ScriptedThreadPlan (Reland #70392)" (#93153) Reverts llvm/llvm-project#93149 since it breaks https://lab.llvm.org/buildbot/#/builders/68/builds/74799 --- lldb/bindings/python/python-swigsafecast.swig | 17 +- lldb/bindings/python/python-wrapper.swig | 157 +++++++++++++++--- lldb/include/lldb/API/SBEvent.h | 4 +- lldb/include/lldb/API/SBStream.h | 6 - .../Interfaces/ScriptedInterface.h | 4 +- .../Interfaces/ScriptedThreadPlanInterface.h | 38 ----- .../lldb/Interpreter/ScriptInterpreter.h | 55 ++++-- lldb/include/lldb/Target/ThreadPlanPython.h | 2 - lldb/include/lldb/lldb-forward.h | 3 - lldb/source/Interpreter/ScriptInterpreter.cpp | 13 -- .../Python/Interfaces/CMakeLists.txt | 1 - .../ScriptedPlatformPythonInterface.cpp | 2 - .../ScriptedProcessPythonInterface.cpp | 27 +-- .../Interfaces/ScriptedPythonInterface.cpp | 34 +--- .../Interfaces/ScriptedPythonInterface.h | 29 +--- .../ScriptedThreadPlanPythonInterface.cpp | 105 ------------ 
.../ScriptedThreadPlanPythonInterface.h | 48 ------ .../ScriptedThreadPythonInterface.cpp | 28 +--- .../Python/SWIGPythonBridge.h | 22 ++- .../Python/ScriptInterpreterPython.cpp | 122 +++++++++++++- .../Python/ScriptInterpreterPythonImpl.h | 28 +++- lldb/source/Target/ThreadPlanPython.cpp | 98 +++++------ .../functionalities/step_scripted/Steps.py | 4 +- .../thread_plan/wrap_step_over.py | 2 +- .../Python/PythonTestSuite.cpp | 50 +++--- 25 files changed, 431 insertions(+), 468 deletions(-) delete mode 100644 lldb/include/lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h delete mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp delete mode 100644 lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h diff --git a/lldb/bindings/python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig index 34f8c6f0ff8d..d5ea51487271 100644 --- a/lldb/bindings/python/python-swigsafecast.swig +++ b/lldb/bindings/python/python-swigsafecast.swig @@ -37,6 +37,10 @@ PythonObject SWIGBridge::ToSWIGWrapper(const Status& status) { return ToSWIGHelper(new lldb::SBError(status), SWIGTYPE_p_lldb__SBError); } +PythonObject SWIGBridge::ToSWIGWrapper(std::unique_ptr stream_sb) { + return ToSWIGHelper(stream_sb.release(), SWIGTYPE_p_lldb__SBStream); +} + PythonObject SWIGBridge::ToSWIGWrapper(std::unique_ptr data_sb) { return ToSWIGHelper(data_sb.release(), SWIGTYPE_p_lldb__SBStructuredData); } @@ -111,16 +115,9 @@ SWIGBridge::ToSWIGWrapper(CommandReturnObject &cmd_retobj) { SWIGTYPE_p_lldb__SBCommandReturnObject); } -PythonObject SWIGBridge::ToSWIGWrapper(const Stream *s) { - return ToSWIGHelper(new lldb::SBStream(), SWIGTYPE_p_lldb__SBStream); -} - -PythonObject SWIGBridge::ToSWIGWrapper(std::shared_ptr stream_sb) { - return ToSWIGHelper(stream_sb.get(), SWIGTYPE_p_lldb__SBStream); -} - -PythonObject SWIGBridge::ToSWIGWrapper(Event *event) { - return ToSWIGHelper(new 
lldb::SBEvent(event), SWIGTYPE_p_lldb__SBEvent); +ScopedPythonObject SWIGBridge::ToSWIGWrapper(Event *event) { + return ScopedPythonObject(new lldb::SBEvent(event), + SWIGTYPE_p_lldb__SBEvent); } PythonObject SWIGBridge::ToSWIGWrapper( diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 28ab58f8ce49..1370afc885d4 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -229,6 +229,133 @@ PythonObject lldb_private::python::SWIGBridge::LLDBSwigPythonCreateCommandObject return pfunc(SWIGBridge::ToSWIGWrapper(std::move(debugger_sp)), dict); } +PythonObject lldb_private::python::SWIGBridge::LLDBSwigPythonCreateScriptedThreadPlan( + const char *python_class_name, const char *session_dictionary_name, + const lldb_private::StructuredDataImpl &args_impl, + std::string &error_string, const lldb::ThreadPlanSP &thread_plan_sp) { + if (python_class_name == NULL || python_class_name[0] == '\0' || + !session_dictionary_name) + return PythonObject(); + + PyErr_Cleaner py_err_cleaner(true); + + auto dict = PythonModule::MainModule().ResolveName( + session_dictionary_name); + auto pfunc = PythonObject::ResolveNameWithDictionary( + python_class_name, dict); + + if (!pfunc.IsAllocated()) { + error_string.append("could not find script class: "); + error_string.append(python_class_name); + return PythonObject(); + } + + PythonObject tp_arg = SWIGBridge::ToSWIGWrapper(thread_plan_sp); + + llvm::Expected arg_info = pfunc.GetArgInfo(); + if (!arg_info) { + llvm::handleAllErrors( + arg_info.takeError(), + [&](PythonException &E) { error_string.append(E.ReadBacktrace()); }, + [&](const llvm::ErrorInfoBase &E) { + error_string.append(E.message()); + }); + return PythonObject(); + } + + PythonObject result = {}; + auto args_sb = std::unique_ptr(new lldb::SBStructuredData(args_impl)); + if (arg_info.get().max_positional_args == 2) { + if (args_sb->IsValid()) { + error_string.assign( + "args passed, 
but __init__ does not take an args dictionary"); + return PythonObject(); + } + result = pfunc(tp_arg, dict); + } else if (arg_info.get().max_positional_args >= 3) { + result = pfunc(tp_arg, SWIGBridge::ToSWIGWrapper(std::move(args_sb)), dict); + } else { + error_string.assign("wrong number of arguments in __init__, should be 2 or " + "3 (not including self)"); + return PythonObject(); + } + + // FIXME: At this point we should check that the class we found supports all + // the methods that we need. + + return result; +} + +bool lldb_private::python::SWIGBridge::LLDBSWIGPythonCallThreadPlan( + void *implementer, const char *method_name, lldb_private::Event *event, + bool &got_error) { + got_error = false; + + PyErr_Cleaner py_err_cleaner(false); + PythonObject self(PyRefType::Borrowed, static_cast(implementer)); + auto pfunc = self.ResolveName(method_name); + + if (!pfunc.IsAllocated()) + return false; + + PythonObject result; + if (event != nullptr) { + ScopedPythonObject event_arg = SWIGBridge::ToSWIGWrapper(event); + result = pfunc(event_arg.obj()); + } else + result = pfunc(); + + if (PyErr_Occurred()) { + got_error = true; + printf("Return value was neither false nor true for call to %s.\n", + method_name); + PyErr_Print(); + return false; + } + + if (result.get() == Py_True) + return true; + else if (result.get() == Py_False) + return false; + + // Somebody returned the wrong thing... 
+ got_error = true; + printf("Wrong return value type for call to %s.\n", method_name); + return false; +} + +bool lldb_private::python::SWIGBridge::LLDBSWIGPythonCallThreadPlan( + void *implementer, const char *method_name, lldb_private::Stream *stream, + bool &got_error) { + got_error = false; + + PyErr_Cleaner py_err_cleaner(false); + PythonObject self(PyRefType::Borrowed, static_cast(implementer)); + auto pfunc = self.ResolveName(method_name); + + if (!pfunc.IsAllocated()) + return false; + + auto *sb_stream = new lldb::SBStream(); + PythonObject sb_stream_arg = + SWIGBridge::ToSWIGWrapper(std::unique_ptr(sb_stream)); + + PythonObject result; + result = pfunc(sb_stream_arg); + + if (PyErr_Occurred()) { + printf("Error occured for call to %s.\n", + method_name); + PyErr_Print(); + got_error = true; + return false; + } + if (stream) + stream->PutCString(sb_stream->GetData()); + return true; + +} + PythonObject lldb_private::python::SWIGBridge::LLDBSwigPythonCreateScriptedBreakpointResolver( const char *python_class_name, const char *session_dictionary_name, const StructuredDataImpl &args_impl, @@ -373,8 +500,9 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPythonStopHookCallHandleStop( if (!pfunc.IsAllocated()) return true; - std::shared_ptr sb_stream = std::make_shared(); - PythonObject sb_stream_arg = SWIGBridge::ToSWIGWrapper(sb_stream); + auto *sb_stream = new lldb::SBStream(); + PythonObject sb_stream_arg = + SWIGBridge::ToSWIGWrapper(std::unique_ptr(sb_stream)); PythonObject result = pfunc(SWIGBridge::ToSWIGWrapper(std::move(exc_ctx_sp)), sb_stream_arg); @@ -389,7 +517,6 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPythonStopHookCallHandleStop( // makes an internally help StreamString which I can't interpose, so I // have to copy it over here. 
stream->PutCString(sb_stream->GetData()); - sb_stream_arg.release(); if (result.get() == Py_False) return false; @@ -626,30 +753,6 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBError(PyObject * data return sb_ptr; } -void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBEvent(PyObject * data) { - lldb::SBEvent *sb_ptr = nullptr; - - int valid_cast = - SWIG_ConvertPtr(data, (void **)&sb_ptr, SWIGTYPE_p_lldb__SBEvent, 0); - - if (valid_cast == -1) - return NULL; - - return sb_ptr; -} - -void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBStream(PyObject * data) { - lldb::SBStream *sb_ptr = nullptr; - - int valid_cast = - SWIG_ConvertPtr(data, (void **)&sb_ptr, SWIGTYPE_p_lldb__SBStream, 0); - - if (valid_cast == -1) - return NULL; - - return sb_ptr; -} - void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBValue(PyObject * data) { lldb::SBValue *sb_ptr = NULL; diff --git a/lldb/include/lldb/API/SBEvent.h b/lldb/include/lldb/API/SBEvent.h index 85b401ca8cc1..cc116766e85f 100644 --- a/lldb/include/lldb/API/SBEvent.h +++ b/lldb/include/lldb/API/SBEvent.h @@ -15,7 +15,6 @@ #include namespace lldb_private { -class ScriptInterpreter; namespace python { class SWIGBridge; } @@ -74,12 +73,11 @@ class LLDB_API SBEvent { friend class SBThread; friend class SBWatchpoint; - friend class lldb_private::ScriptInterpreter; friend class lldb_private::python::SWIGBridge; SBEvent(lldb::EventSP &event_sp); - SBEvent(lldb_private::Event *event); + SBEvent(lldb_private::Event *event_sp); lldb::EventSP &GetSP() const; diff --git a/lldb/include/lldb/API/SBStream.h b/lldb/include/lldb/API/SBStream.h index 2db379fe12f0..0e33f05b6991 100644 --- a/lldb/include/lldb/API/SBStream.h +++ b/lldb/include/lldb/API/SBStream.h @@ -13,10 +13,6 @@ #include "lldb/API/SBDefines.h" -namespace lldb_private { -class ScriptInterpreter; -} // namespace lldb_private - namespace lldb { class LLDB_API SBStream { @@ -105,8 +101,6 @@ class LLDB_API SBStream { friend class SBValue; 
friend class SBWatchpoint; - friend class lldb_private::ScriptInterpreter; - lldb_private::Stream *operator->(); lldb_private::Stream *get(); diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h index 69504dbcda5d..9753a916243b 100644 --- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h +++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h @@ -10,6 +10,7 @@ #define LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H #include "lldb/Core/StructuredDataImpl.h" +#include "lldb/Target/ExecutionContext.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/UnimplementedError.h" @@ -51,8 +52,7 @@ class ScriptedInterface { } template - static bool CheckStructuredDataObject(llvm::StringRef caller, T obj, - Status &error) { + bool CheckStructuredDataObject(llvm::StringRef caller, T obj, Status &error) { if (!obj) return ErrorWithMessage(caller, "Null Structured Data object", error); diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h deleted file mode 100644 index 9130f9412cb0..000000000000 --- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h +++ /dev/null @@ -1,38 +0,0 @@ -//===-- ScriptedThreadPlanInterface.h ---------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLDB_INTERPRETER_INTERFACES_SCRIPTEDTHREADPLANINTERFACE_H -#define LLDB_INTERPRETER_INTERFACES_SCRIPTEDTHREADPLANINTERFACE_H - -#include "lldb/lldb-private.h" - -#include "ScriptedInterface.h" - -namespace lldb_private { -class ScriptedThreadPlanInterface : public ScriptedInterface { -public: - virtual llvm::Expected - CreatePluginObject(llvm::StringRef class_name, - lldb::ThreadPlanSP thread_plan_sp, - const StructuredDataImpl &args_sp) = 0; - - virtual llvm::Expected ExplainsStop(Event *event) { return true; } - - virtual llvm::Expected ShouldStop(Event *event) { return true; } - - virtual llvm::Expected IsStale() { return true; }; - - virtual lldb::StateType GetRunState() { return lldb::eStateStepping; } - - virtual llvm::Expected GetStopDescription(lldb_private::Stream *s) { - return true; - } -}; -} // namespace lldb_private - -#endif // LLDB_INTERPRETER_INTERFACES_SCRIPTEDTHREADPLANINTERFACE_H diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index e821a7db2c67..932eaa8b8a4a 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -13,10 +13,8 @@ #include "lldb/API/SBBreakpoint.h" #include "lldb/API/SBData.h" #include "lldb/API/SBError.h" -#include "lldb/API/SBEvent.h" #include "lldb/API/SBLaunchInfo.h" #include "lldb/API/SBMemoryRegionInfo.h" -#include "lldb/API/SBStream.h" #include "lldb/Breakpoint/BreakpointOptions.h" #include "lldb/Core/PluginInterface.h" #include "lldb/Core/SearchFilter.h" @@ -252,6 +250,50 @@ class ScriptInterpreter : public PluginInterface { return lldb::ValueObjectListSP(); } + virtual StructuredData::ObjectSP + CreateScriptedThreadPlan(const char *class_name, + const StructuredDataImpl &args_data, + std::string &error_str, + lldb::ThreadPlanSP 
thread_plan_sp) { + return StructuredData::ObjectSP(); + } + + virtual bool + ScriptedThreadPlanExplainsStop(StructuredData::ObjectSP implementor_sp, + Event *event, bool &script_error) { + script_error = true; + return true; + } + + virtual bool + ScriptedThreadPlanShouldStop(StructuredData::ObjectSP implementor_sp, + Event *event, bool &script_error) { + script_error = true; + return true; + } + + virtual bool + ScriptedThreadPlanIsStale(StructuredData::ObjectSP implementor_sp, + bool &script_error) { + script_error = true; + return true; + } + + virtual lldb::StateType + ScriptedThreadPlanGetRunState(StructuredData::ObjectSP implementor_sp, + bool &script_error) { + script_error = true; + return lldb::eStateStepping; + } + + virtual bool + ScriptedThreadPlanGetStopDescription(StructuredData::ObjectSP implementor_sp, + lldb_private::Stream *stream, + bool &script_error) { + script_error = true; + return false; + } + virtual StructuredData::GenericSP CreateScriptedBreakpointResolver(const char *class_name, const StructuredDataImpl &args_data, @@ -550,11 +592,6 @@ class ScriptInterpreter : public PluginInterface { return {}; } - virtual lldb::ScriptedThreadPlanInterfaceSP - CreateScriptedThreadPlanInterface() { - return {}; - } - virtual lldb::OperatingSystemInterfaceSP CreateOperatingSystemInterface() { return {}; } @@ -573,10 +610,6 @@ class ScriptInterpreter : public PluginInterface { Status GetStatusFromSBError(const lldb::SBError &error) const; - Event *GetOpaqueTypeFromSBEvent(const lldb::SBEvent &event) const; - - Stream *GetOpaqueTypeFromSBStream(const lldb::SBStream &stream) const; - lldb::BreakpointSP GetOpaqueTypeFromSBBreakpoint(const lldb::SBBreakpoint &breakpoint) const; diff --git a/lldb/include/lldb/Target/ThreadPlanPython.h b/lldb/include/lldb/Target/ThreadPlanPython.h index da106faf951d..64854d66b8f2 100644 --- a/lldb/include/lldb/Target/ThreadPlanPython.h +++ b/lldb/include/lldb/Target/ThreadPlanPython.h @@ -13,7 +13,6 @@ #include #include 
"lldb/Core/StructuredDataImpl.h" -#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h" #include "lldb/Target/Process.h" #include "lldb/Target/StopInfo.h" #include "lldb/Target/Target.h" @@ -71,7 +70,6 @@ class ThreadPlanPython : public ThreadPlan { StreamString m_stop_description; // Cache the stop description here bool m_did_push; bool m_stop_others; - lldb::ScriptedThreadPlanInterfaceSP m_interface; ThreadPlanPython(const ThreadPlanPython &) = delete; const ThreadPlanPython &operator=(const ThreadPlanPython &) = delete; diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index e2b24819bce9..10ba921b9dac 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -187,7 +187,6 @@ class ScriptedMetadata; class ScriptedPlatformInterface; class ScriptedProcessInterface; class ScriptedThreadInterface; -class ScriptedThreadPlanInterface; class ScriptedSyntheticChildren; class SearchFilter; class Section; @@ -404,8 +403,6 @@ typedef std::unique_ptr ScriptedProcessInterfaceUP; typedef std::shared_ptr ScriptedThreadInterfaceSP; -typedef std::shared_ptr - ScriptedThreadPlanInterfaceSP; typedef std::shared_ptr SectionSP; typedef std::unique_ptr SectionListUP; typedef std::weak_ptr SectionWP; diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 75b2a39a8d11..8dd499ce819a 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -101,19 +101,6 @@ ScriptInterpreter::GetStatusFromSBError(const lldb::SBError &error) const { return Status(); } -Event * -ScriptInterpreter::GetOpaqueTypeFromSBEvent(const lldb::SBEvent &event) const { - return event.m_opaque_ptr; -} - -Stream *ScriptInterpreter::GetOpaqueTypeFromSBStream( - const lldb::SBStream &stream) const { - if (stream.m_opaque_up) - return const_cast(stream).m_opaque_up.get(); - - return nullptr; -} - std::optional 
ScriptInterpreter::GetOpaqueTypeFromSBMemoryRegionInfo( const lldb::SBMemoryRegionInfo &mem_region) const { diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt index c60e4bb503a3..b22abc49c92a 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt @@ -24,7 +24,6 @@ add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces ScriptedPythonInterface.cpp ScriptedProcessPythonInterface.cpp ScriptedThreadPythonInterface.cpp - ScriptedThreadPlanPythonInterface.cpp ScriptedPlatformPythonInterface.cpp LINK_LIBS diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp index 6e93bec80056..9ba4731032bd 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp @@ -20,8 +20,6 @@ #include "../ScriptInterpreterPythonImpl.h" #include "ScriptedPlatformPythonInterface.h" -#include "lldb/Target/ExecutionContext.h" - using namespace lldb; using namespace lldb_private; using namespace lldb_private::python; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp index 313c597ce48f..e86b34d6b930 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedProcessPythonInterface.cpp @@ -49,8 +49,7 @@ StructuredData::DictionarySP ScriptedProcessPythonInterface::GetCapabilities() { StructuredData::DictionarySP dict = Dispatch("get_capabilities", 
error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) return {}; return dict; @@ -91,8 +90,7 @@ StructuredData::DictionarySP ScriptedProcessPythonInterface::GetThreadsInfo() { StructuredData::DictionarySP dict = Dispatch("get_threads_info", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) return {}; return dict; @@ -108,8 +106,7 @@ bool ScriptedProcessPythonInterface::CreateBreakpoint(lldb::addr_t addr, if (py_error.Fail()) error = py_error; - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return {}; return obj->GetBooleanValue(); @@ -134,8 +131,7 @@ lldb::offset_t ScriptedProcessPythonInterface::WriteMemoryAtAddress( StructuredData::ObjectSP obj = Dispatch("write_memory_at_address", py_error, addr, data_sp, error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return LLDB_INVALID_OFFSET; // If there was an error on the python call, surface it to the user. 
@@ -150,8 +146,7 @@ StructuredData::ArraySP ScriptedProcessPythonInterface::GetLoadedImages() { StructuredData::ArraySP array = Dispatch("get_loaded_images", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, array, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, array, error)) return {}; return array; @@ -161,8 +156,7 @@ lldb::pid_t ScriptedProcessPythonInterface::GetProcessID() { Status error; StructuredData::ObjectSP obj = Dispatch("get_process_id", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return LLDB_INVALID_PROCESS_ID; return obj->GetUnsignedIntegerValue(LLDB_INVALID_PROCESS_ID); @@ -172,8 +166,7 @@ bool ScriptedProcessPythonInterface::IsAlive() { Status error; StructuredData::ObjectSP obj = Dispatch("is_alive", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return {}; return obj->GetBooleanValue(); @@ -184,8 +177,7 @@ ScriptedProcessPythonInterface::GetScriptedThreadPluginName() { Status error; StructuredData::ObjectSP obj = Dispatch("get_scripted_thread_plugin", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return {}; return obj->GetStringValue().str(); @@ -201,8 +193,7 @@ StructuredData::DictionarySP ScriptedProcessPythonInterface::GetMetadata() { StructuredData::DictionarySP dict = Dispatch("get_process_metadata", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) return {}; return dict; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp 
b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp index 7d072212676e..6f22503b279c 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp @@ -26,15 +26,6 @@ ScriptedPythonInterface::ScriptedPythonInterface( ScriptInterpreterPythonImpl &interpreter) : ScriptedInterface(), m_interpreter(interpreter) {} -template <> -void ScriptedPythonInterface::ReverseTransform( - lldb_private::Stream *&original_arg, python::PythonObject transformed_arg, - Status &error) { - Stream *s = ExtractValueFromPythonObject(transformed_arg, error); - *original_arg = *s; - original_arg->PutCString(static_cast(s)->GetData()); -} - template <> StructuredData::ArraySP ScriptedPythonInterface::ExtractValueFromPythonObject( @@ -57,33 +48,12 @@ Status ScriptedPythonInterface::ExtractValueFromPythonObject( if (lldb::SBError *sb_error = reinterpret_cast( python::LLDBSWIGPython_CastPyObjectToSBError(p.get()))) return m_interpreter.GetStatusFromSBError(*sb_error); - error.SetErrorString("Couldn't cast lldb::SBError to lldb::Status."); + else + error.SetErrorString("Couldn't cast lldb::SBError to lldb::Status."); return {}; } -template <> -Event *ScriptedPythonInterface::ExtractValueFromPythonObject( - python::PythonObject &p, Status &error) { - if (lldb::SBEvent *sb_event = reinterpret_cast( - python::LLDBSWIGPython_CastPyObjectToSBEvent(p.get()))) - return m_interpreter.GetOpaqueTypeFromSBEvent(*sb_event); - error.SetErrorString("Couldn't cast lldb::SBEvent to lldb_private::Event."); - - return nullptr; -} - -template <> -Stream *ScriptedPythonInterface::ExtractValueFromPythonObject( - python::PythonObject &p, Status &error) { - if (lldb::SBStream *sb_stream = reinterpret_cast( - python::LLDBSWIGPython_CastPyObjectToSBStream(p.get()))) - return m_interpreter.GetOpaqueTypeFromSBStream(*sb_stream); - error.SetErrorString("Couldn't cast 
lldb::SBStream to lldb_private::Stream."); - - return nullptr; -} - template <> lldb::DataExtractorSP ScriptedPythonInterface::ExtractValueFromPythonObject( diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h index 062bf1fcff4a..163659234466 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h @@ -115,7 +115,7 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { PythonObject::ResolveNameWithDictionary( class_name, dict); if (!init.IsAllocated()) - return create_error(llvm::formatv("Could not find script class: {0}", + return create_error(llvm::formatv("Could not find script class: %s", class_name.data())); std::tuple original_args = std::forward_as_tuple(args...); @@ -248,11 +248,8 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { (PyObject *)m_object_instance_sp->GetValue()); if (!implementor.IsAllocated()) - return llvm::is_contained(GetAbstractMethods(), method_name) - ? 
ErrorWithMessage(caller_signature, - "Python implementor not allocated.", - error) - : T{}; + return ErrorWithMessage(caller_signature, + "Python implementor not allocated.", error); std::tuple original_args = std::forward_as_tuple(args...); auto transformed_args = TransformArgs(original_args); @@ -325,10 +322,6 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { return python::SWIGBridge::ToSWIGWrapper(arg); } - python::PythonObject Transform(lldb::ThreadPlanSP arg) { - return python::SWIGBridge::ToSWIGWrapper(arg); - } - python::PythonObject Transform(lldb::ProcessAttachInfoSP arg) { return python::SWIGBridge::ToSWIGWrapper(arg); } @@ -337,14 +330,6 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { return python::SWIGBridge::ToSWIGWrapper(arg); } - python::PythonObject Transform(Event *arg) { - return python::SWIGBridge::ToSWIGWrapper(arg); - } - - python::PythonObject Transform(Stream *arg) { - return python::SWIGBridge::ToSWIGWrapper(arg); - } - python::PythonObject Transform(lldb::DataExtractorSP arg) { return python::SWIGBridge::ToSWIGWrapper(arg); } @@ -442,14 +427,6 @@ template <> Status ScriptedPythonInterface::ExtractValueFromPythonObject( python::PythonObject &p, Status &error); -template <> -Event *ScriptedPythonInterface::ExtractValueFromPythonObject( - python::PythonObject &p, Status &error); - -template <> -Stream *ScriptedPythonInterface::ExtractValueFromPythonObject( - python::PythonObject &p, Status &error); - template <> lldb::BreakpointSP ScriptedPythonInterface::ExtractValueFromPythonObject( diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp deleted file mode 100644 index b7e475812f22..000000000000 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp +++ /dev/null @@ -1,105 +0,0 @@ -//===-- 
ScriptedThreadPlanPythonInterface.cpp -----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "lldb/Host/Config.h" -#include "lldb/Utility/Log.h" -#include "lldb/lldb-enumerations.h" - -#if LLDB_ENABLE_PYTHON - -// LLDB Python header must be included first -#include "../lldb-python.h" - -#include "../SWIGPythonBridge.h" -#include "../ScriptInterpreterPythonImpl.h" -#include "ScriptedThreadPlanPythonInterface.h" - -using namespace lldb; -using namespace lldb_private; -using namespace lldb_private::python; - -ScriptedThreadPlanPythonInterface::ScriptedThreadPlanPythonInterface( - ScriptInterpreterPythonImpl &interpreter) - : ScriptedThreadPlanInterface(), ScriptedPythonInterface(interpreter) {} - -llvm::Expected -ScriptedThreadPlanPythonInterface::CreatePluginObject( - const llvm::StringRef class_name, lldb::ThreadPlanSP thread_plan_sp, - const StructuredDataImpl &args_sp) { - return ScriptedPythonInterface::CreatePluginObject(class_name, nullptr, - thread_plan_sp, args_sp); -} - -llvm::Expected -ScriptedThreadPlanPythonInterface::ExplainsStop(Event *event) { - Status error; - StructuredData::ObjectSP obj = Dispatch("explains_stop", error, event); - - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) { - if (!obj) - return false; - return error.ToError(); - } - - return obj->GetBooleanValue(); -} - -llvm::Expected -ScriptedThreadPlanPythonInterface::ShouldStop(Event *event) { - Status error; - StructuredData::ObjectSP obj = Dispatch("should_stop", error, event); - - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) { - if (!obj) - return false; - return error.ToError(); - } - - return obj->GetBooleanValue(); -} - 
-llvm::Expected ScriptedThreadPlanPythonInterface::IsStale() { - Status error; - StructuredData::ObjectSP obj = Dispatch("is_stale", error); - - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) { - if (!obj) - return false; - return error.ToError(); - } - - return obj->GetBooleanValue(); -} - -lldb::StateType ScriptedThreadPlanPythonInterface::GetRunState() { - Status error; - StructuredData::ObjectSP obj = Dispatch("should_step", error); - - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) - return lldb::eStateStepping; - - return static_cast(obj->GetUnsignedIntegerValue( - static_cast(lldb::eStateStepping))); -} - -llvm::Expected -ScriptedThreadPlanPythonInterface::GetStopDescription(lldb_private::Stream *s) { - Status error; - Dispatch("stop_description", error, s); - - if (error.Fail()) - return error.ToError(); - - return true; -} - -#endif diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h deleted file mode 100644 index 33f086786c47..000000000000 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h +++ /dev/null @@ -1,48 +0,0 @@ -//===-- ScriptedThreadPlanPythonInterface.h ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H -#define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H - -#include "lldb/Host/Config.h" - -#if LLDB_ENABLE_PYTHON - -#include "ScriptedPythonInterface.h" -#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h" -#include - -namespace lldb_private { -class ScriptedThreadPlanPythonInterface : public ScriptedThreadPlanInterface, - public ScriptedPythonInterface { -public: - ScriptedThreadPlanPythonInterface(ScriptInterpreterPythonImpl &interpreter); - - llvm::Expected - CreatePluginObject(const llvm::StringRef class_name, - lldb::ThreadPlanSP thread_plan_sp, - const StructuredDataImpl &args_sp) override; - - llvm::SmallVector GetAbstractMethods() const override { - return {}; - } - - llvm::Expected ExplainsStop(Event *event) override; - - llvm::Expected ShouldStop(Event *event) override; - - llvm::Expected IsStale() override; - - lldb::StateType GetRunState() override; - - llvm::Expected GetStopDescription(lldb_private::Stream *s) override; -}; -} // namespace lldb_private - -#endif // LLDB_ENABLE_PYTHON -#endif // LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp index 8af89d761764..18e268527eb2 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPythonInterface.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "lldb/Host/Config.h" -#include 
"lldb/Target/ExecutionContext.h" #include "lldb/Utility/Log.h" #include "lldb/lldb-enumerations.h" @@ -45,8 +44,7 @@ lldb::tid_t ScriptedThreadPythonInterface::GetThreadID() { Status error; StructuredData::ObjectSP obj = Dispatch("get_thread_id", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return LLDB_INVALID_THREAD_ID; return obj->GetUnsignedIntegerValue(LLDB_INVALID_THREAD_ID); @@ -56,8 +54,7 @@ std::optional ScriptedThreadPythonInterface::GetName() { Status error; StructuredData::ObjectSP obj = Dispatch("get_name", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return {}; return obj->GetStringValue().str(); @@ -67,8 +64,7 @@ lldb::StateType ScriptedThreadPythonInterface::GetState() { Status error; StructuredData::ObjectSP obj = Dispatch("get_state", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return eStateInvalid; return static_cast(obj->GetUnsignedIntegerValue(eStateInvalid)); @@ -78,8 +74,7 @@ std::optional ScriptedThreadPythonInterface::GetQueue() { Status error; StructuredData::ObjectSP obj = Dispatch("get_queue", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return {}; return obj->GetStringValue().str(); @@ -90,8 +85,7 @@ StructuredData::DictionarySP ScriptedThreadPythonInterface::GetStopReason() { StructuredData::DictionarySP dict = Dispatch("get_stop_reason", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) return {}; return dict; @@ -102,8 +96,7 @@ 
StructuredData::ArraySP ScriptedThreadPythonInterface::GetStackFrames() { StructuredData::ArraySP arr = Dispatch("get_stackframes", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, arr, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, arr, error)) return {}; return arr; @@ -114,8 +107,7 @@ StructuredData::DictionarySP ScriptedThreadPythonInterface::GetRegisterInfo() { StructuredData::DictionarySP dict = Dispatch("get_register_info", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, dict, error)) return {}; return dict; @@ -125,8 +117,7 @@ std::optional ScriptedThreadPythonInterface::GetRegisterContext() { Status error; StructuredData::ObjectSP obj = Dispatch("get_register_context", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, error)) return {}; return obj->GetAsString()->GetValue().str(); @@ -137,8 +128,7 @@ StructuredData::ArraySP ScriptedThreadPythonInterface::GetExtendedInfo() { StructuredData::ArraySP arr = Dispatch("get_extended_info", error); - if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, arr, - error)) + if (!CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, arr, error)) return {}; return arr; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 95eb5a782097..c1a11b9134d6 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -97,14 +97,12 @@ class SWIGBridge { static PythonObject ToSWIGWrapper(lldb::ExecutionContextRefSP ctx_sp); static PythonObject ToSWIGWrapper(const TypeSummaryOptions &summary_options); static PythonObject ToSWIGWrapper(const SymbolContext &sym_ctx); - static 
PythonObject ToSWIGWrapper(const Stream *stream); - static PythonObject ToSWIGWrapper(std::shared_ptr stream_sb); - static PythonObject ToSWIGWrapper(Event *event); static PythonObject ToSWIGWrapper(lldb::ProcessAttachInfoSP attach_info_sp); static PythonObject ToSWIGWrapper(lldb::ProcessLaunchInfoSP launch_info_sp); static PythonObject ToSWIGWrapper(lldb::DataExtractorSP data_extractor_sp); + static PythonObject ToSWIGWrapper(std::unique_ptr stream_sb); static PythonObject ToSWIGWrapper(std::unique_ptr data_sb); static PythonObject @@ -114,6 +112,7 @@ class SWIGBridge { static python::ScopedPythonObject ToSWIGWrapper(CommandReturnObject &cmd_retobj); + static python::ScopedPythonObject ToSWIGWrapper(Event *event); // These prototypes are the Pythonic implementations of the required // callbacks. Although these are scripting-language specific, their definition // depends on the public API. @@ -148,6 +147,21 @@ class SWIGBridge { const char *session_dictionary_name, lldb::DebuggerSP debugger_sp); + static python::PythonObject LLDBSwigPythonCreateScriptedThreadPlan( + const char *python_class_name, const char *session_dictionary_name, + const StructuredDataImpl &args_data, std::string &error_string, + const lldb::ThreadPlanSP &thread_plan_sp); + + static bool LLDBSWIGPythonCallThreadPlan(void *implementor, + const char *method_name, + lldb_private::Event *event_sp, + bool &got_error); + + static bool LLDBSWIGPythonCallThreadPlan(void *implementor, + const char *method_name, + lldb_private::Stream *stream, + bool &got_error); + static python::PythonObject LLDBSwigPythonCreateScriptedBreakpointResolver( const char *python_class_name, const char *session_dictionary_name, const StructuredDataImpl &args, const lldb::BreakpointSP &bkpt_sp); @@ -255,8 +269,6 @@ void *LLDBSWIGPython_CastPyObjectToSBBreakpoint(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBAttachInfo(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBLaunchInfo(PyObject *data); void 
*LLDBSWIGPython_CastPyObjectToSBError(PyObject *data); -void *LLDBSWIGPython_CastPyObjectToSBEvent(PyObject *data); -void *LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBMemoryRegionInfo(PyObject *data); } // namespace python diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 58ef8f674f72..ce52f3595247 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -17,7 +17,6 @@ #include "Interfaces/OperatingSystemPythonInterface.h" #include "Interfaces/ScriptedPlatformPythonInterface.h" #include "Interfaces/ScriptedProcessPythonInterface.h" -#include "Interfaces/ScriptedThreadPlanPythonInterface.h" #include "Interfaces/ScriptedThreadPythonInterface.h" #include "PythonDataObjects.h" #include "PythonReadline.h" @@ -1538,11 +1537,6 @@ ScriptInterpreterPythonImpl::CreateScriptedThreadInterface() { return std::make_shared(*this); } -ScriptedThreadPlanInterfaceSP -ScriptInterpreterPythonImpl::CreateScriptedThreadPlanInterface() { - return std::make_shared(*this); -} - OperatingSystemInterfaceSP ScriptInterpreterPythonImpl::CreateOperatingSystemInterface() { return std::make_shared(*this); @@ -1559,6 +1553,122 @@ ScriptInterpreterPythonImpl::CreateStructuredDataFromScriptObject( return py_obj.CreateStructuredObject(); } +StructuredData::ObjectSP ScriptInterpreterPythonImpl::CreateScriptedThreadPlan( + const char *class_name, const StructuredDataImpl &args_data, + std::string &error_str, lldb::ThreadPlanSP thread_plan_sp) { + if (class_name == nullptr || class_name[0] == '\0') + return StructuredData::ObjectSP(); + + if (!thread_plan_sp.get()) + return {}; + + Debugger &debugger = thread_plan_sp->GetTarget().GetDebugger(); + ScriptInterpreterPythonImpl 
*python_interpreter = + GetPythonInterpreter(debugger); + + if (!python_interpreter) + return {}; + + Locker py_lock(this, + Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); + PythonObject ret_val = SWIGBridge::LLDBSwigPythonCreateScriptedThreadPlan( + class_name, python_interpreter->m_dictionary_name.c_str(), args_data, + error_str, thread_plan_sp); + if (!ret_val) + return {}; + + return StructuredData::ObjectSP( + new StructuredPythonObject(std::move(ret_val))); +} + +bool ScriptInterpreterPythonImpl::ScriptedThreadPlanExplainsStop( + StructuredData::ObjectSP implementor_sp, Event *event, bool &script_error) { + bool explains_stop = true; + StructuredData::Generic *generic = nullptr; + if (implementor_sp) + generic = implementor_sp->GetAsGeneric(); + if (generic) { + Locker py_lock(this, + Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); + explains_stop = SWIGBridge::LLDBSWIGPythonCallThreadPlan( + generic->GetValue(), "explains_stop", event, script_error); + if (script_error) + return true; + } + return explains_stop; +} + +bool ScriptInterpreterPythonImpl::ScriptedThreadPlanShouldStop( + StructuredData::ObjectSP implementor_sp, Event *event, bool &script_error) { + bool should_stop = true; + StructuredData::Generic *generic = nullptr; + if (implementor_sp) + generic = implementor_sp->GetAsGeneric(); + if (generic) { + Locker py_lock(this, + Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); + should_stop = SWIGBridge::LLDBSWIGPythonCallThreadPlan( + generic->GetValue(), "should_stop", event, script_error); + if (script_error) + return true; + } + return should_stop; +} + +bool ScriptInterpreterPythonImpl::ScriptedThreadPlanIsStale( + StructuredData::ObjectSP implementor_sp, bool &script_error) { + bool is_stale = true; + StructuredData::Generic *generic = nullptr; + if (implementor_sp) + generic = implementor_sp->GetAsGeneric(); + if (generic) { + Locker py_lock(this, + Locker::AcquireLock | Locker::InitSession | 
Locker::NoSTDIN); + is_stale = SWIGBridge::LLDBSWIGPythonCallThreadPlan( + generic->GetValue(), "is_stale", (Event *)nullptr, script_error); + if (script_error) + return true; + } + return is_stale; +} + +lldb::StateType ScriptInterpreterPythonImpl::ScriptedThreadPlanGetRunState( + StructuredData::ObjectSP implementor_sp, bool &script_error) { + bool should_step = false; + StructuredData::Generic *generic = nullptr; + if (implementor_sp) + generic = implementor_sp->GetAsGeneric(); + if (generic) { + Locker py_lock(this, + Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); + should_step = SWIGBridge::LLDBSWIGPythonCallThreadPlan( + generic->GetValue(), "should_step", (Event *)nullptr, script_error); + if (script_error) + should_step = true; + } + if (should_step) + return lldb::eStateStepping; + return lldb::eStateRunning; +} + +bool +ScriptInterpreterPythonImpl::ScriptedThreadPlanGetStopDescription( + StructuredData::ObjectSP implementor_sp, lldb_private::Stream *stream, + bool &script_error) { + StructuredData::Generic *generic = nullptr; + if (implementor_sp) + generic = implementor_sp->GetAsGeneric(); + if (!generic) { + script_error = true; + return false; + } + Locker py_lock(this, + Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); + return SWIGBridge::LLDBSWIGPythonCallThreadPlan( + generic->GetValue(), "stop_description", stream, script_error); +} + + StructuredData::GenericSP ScriptInterpreterPythonImpl::CreateScriptedBreakpointResolver( const char *class_name, const StructuredDataImpl &args_data, diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index fa2354053473..fcd21dff612b 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -77,9 +77,34 @@ class ScriptInterpreterPythonImpl : public 
ScriptInterpreterPython { StructuredData::GenericSP CreateScriptCommandObject(const char *class_name) override; + StructuredData::ObjectSP + CreateScriptedThreadPlan(const char *class_name, + const StructuredDataImpl &args_data, + std::string &error_str, + lldb::ThreadPlanSP thread_plan) override; + StructuredData::ObjectSP CreateStructuredDataFromScriptObject(ScriptObject obj) override; + bool ScriptedThreadPlanExplainsStop(StructuredData::ObjectSP implementor_sp, + Event *event, + bool &script_error) override; + + bool ScriptedThreadPlanShouldStop(StructuredData::ObjectSP implementor_sp, + Event *event, bool &script_error) override; + + bool ScriptedThreadPlanIsStale(StructuredData::ObjectSP implementor_sp, + bool &script_error) override; + + lldb::StateType + ScriptedThreadPlanGetRunState(StructuredData::ObjectSP implementor_sp, + bool &script_error) override; + + bool + ScriptedThreadPlanGetStopDescription(StructuredData::ObjectSP implementor_sp, + lldb_private::Stream *s, + bool &script_error) override; + StructuredData::GenericSP CreateScriptedBreakpointResolver(const char *class_name, const StructuredDataImpl &args_data, @@ -111,9 +136,6 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython { lldb::ScriptedThreadInterfaceSP CreateScriptedThreadInterface() override; - lldb::ScriptedThreadPlanInterfaceSP - CreateScriptedThreadPlanInterface() override; - lldb::OperatingSystemInterfaceSP CreateOperatingSystemInterface() override; StructuredData::ObjectSP diff --git a/lldb/source/Target/ThreadPlanPython.cpp b/lldb/source/Target/ThreadPlanPython.cpp index 65d1737c2dc5..d6de6b3c3cf0 100644 --- a/lldb/source/Target/ThreadPlanPython.cpp +++ b/lldb/source/Target/ThreadPlanPython.cpp @@ -10,7 +10,6 @@ #include "lldb/Core/Debugger.h" #include "lldb/Interpreter/CommandInterpreter.h" -#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h" #include "lldb/Interpreter/ScriptInterpreter.h" #include "lldb/Target/Process.h" #include 
"lldb/Target/RegisterContext.h" @@ -33,23 +32,6 @@ ThreadPlanPython::ThreadPlanPython(Thread &thread, const char *class_name, eVoteNoOpinion, eVoteNoOpinion), m_class_name(class_name), m_args_data(args_data), m_did_push(false), m_stop_others(false) { - ScriptInterpreter *interpreter = GetScriptInterpreter(); - if (!interpreter) { - SetPlanComplete(false); - // FIXME: error handling - return; - } - - m_interface = interpreter->CreateScriptedThreadPlanInterface(); - if (!m_interface) { - SetPlanComplete(false); - // FIXME: error handling - // error.SetErrorStringWithFormat( - // "ThreadPlanPython::%s () - ERROR: %s", __FUNCTION__, - // "Script interpreter couldn't create Scripted Thread Plan Interface"); - return; - } - SetIsControllingPlan(true); SetOkayToDiscard(true); SetPrivate(false); @@ -78,14 +60,13 @@ void ThreadPlanPython::DidPush() { // We set up the script side in DidPush, so that it can push other plans in // the constructor, and doesn't have to care about the details of DidPush. 
m_did_push = true; - if (m_interface) { - auto obj_or_err = m_interface->CreatePluginObject( - m_class_name, this->shared_from_this(), m_args_data); - if (!obj_or_err) { - m_error_str = llvm::toString(obj_or_err.takeError()); - SetPlanComplete(false); - } else - m_implementation_sp = *obj_or_err; + if (!m_class_name.empty()) { + ScriptInterpreter *script_interp = GetScriptInterpreter(); + if (script_interp) { + m_implementation_sp = script_interp->CreateScriptedThreadPlan( + m_class_name.c_str(), m_args_data, m_error_str, + this->shared_from_this()); + } } } @@ -96,13 +77,14 @@ bool ThreadPlanPython::ShouldStop(Event *event_ptr) { bool should_stop = true; if (m_implementation_sp) { - auto should_stop_or_err = m_interface->ShouldStop(event_ptr); - if (!should_stop_or_err) { - LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), should_stop_or_err.takeError(), - "Can't call ScriptedThreadPlan::ShouldStop."); - SetPlanComplete(false); - } else - should_stop = *should_stop_or_err; + ScriptInterpreter *script_interp = GetScriptInterpreter(); + if (script_interp) { + bool script_error; + should_stop = script_interp->ScriptedThreadPlanShouldStop( + m_implementation_sp, event_ptr, script_error); + if (script_error) + SetPlanComplete(false); + } } return should_stop; } @@ -114,13 +96,14 @@ bool ThreadPlanPython::IsPlanStale() { bool is_stale = true; if (m_implementation_sp) { - auto is_stale_or_err = m_interface->IsStale(); - if (!is_stale_or_err) { - LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), is_stale_or_err.takeError(), - "Can't call ScriptedThreadPlan::IsStale."); - SetPlanComplete(false); - } else - is_stale = *is_stale_or_err; + ScriptInterpreter *script_interp = GetScriptInterpreter(); + if (script_interp) { + bool script_error; + is_stale = script_interp->ScriptedThreadPlanIsStale(m_implementation_sp, + script_error); + if (script_error) + SetPlanComplete(false); + } } return is_stale; } @@ -132,14 +115,14 @@ bool ThreadPlanPython::DoPlanExplainsStop(Event *event_ptr) { bool 
explains_stop = true; if (m_implementation_sp) { - auto explains_stop_or_error = m_interface->ExplainsStop(event_ptr); - if (!explains_stop_or_error) { - LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), - explains_stop_or_error.takeError(), - "Can't call ScriptedThreadPlan::ExplainsStop."); - SetPlanComplete(false); - } else - explains_stop = *explains_stop_or_error; + ScriptInterpreter *script_interp = GetScriptInterpreter(); + if (script_interp) { + bool script_error; + explains_stop = script_interp->ScriptedThreadPlanExplainsStop( + m_implementation_sp, event_ptr, script_error); + if (script_error) + SetPlanComplete(false); + } } return explains_stop; } @@ -167,8 +150,14 @@ lldb::StateType ThreadPlanPython::GetPlanRunState() { LLDB_LOGF(log, "%s called on Python Thread Plan: %s )", LLVM_PRETTY_FUNCTION, m_class_name.c_str()); lldb::StateType run_state = eStateRunning; - if (m_implementation_sp) - run_state = m_interface->GetRunState(); + if (m_implementation_sp) { + ScriptInterpreter *script_interp = GetScriptInterpreter(); + if (script_interp) { + bool script_error; + run_state = script_interp->ScriptedThreadPlanGetRunState( + m_implementation_sp, script_error); + } + } return run_state; } @@ -179,13 +168,12 @@ void ThreadPlanPython::GetDescription(Stream *s, lldb::DescriptionLevel level) { if (m_implementation_sp) { ScriptInterpreter *script_interp = GetScriptInterpreter(); if (script_interp) { - auto desc_or_err = m_interface->GetStopDescription(s); - if (!desc_or_err || !*desc_or_err) { - LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), desc_or_err.takeError(), - "Can't call ScriptedThreadPlan::GetStopDescription."); + bool script_error; + bool added_desc = script_interp->ScriptedThreadPlanGetStopDescription( + m_implementation_sp, s, script_error); + if (script_error || !added_desc) s->Printf("Python thread plan implemented by class %s.", m_class_name.c_str()); - } } return; } diff --git a/lldb/test/API/functionalities/step_scripted/Steps.py 
b/lldb/test/API/functionalities/step_scripted/Steps.py index 3325dba75365..7527607be847 100644 --- a/lldb/test/API/functionalities/step_scripted/Steps.py +++ b/lldb/test/API/functionalities/step_scripted/Steps.py @@ -47,7 +47,7 @@ def queue_child_thread_plan(self): # This plan does a step-over until a variable changes value. class StepUntil(StepWithChild): - def __init__(self, thread_plan, args_data): + def __init__(self, thread_plan, args_data, dict): self.thread_plan = thread_plan self.frame = thread_plan.GetThread().frames[0] self.target = thread_plan.GetThread().GetProcess().GetTarget() @@ -99,7 +99,7 @@ def stop_description(self, stream): class StepReportsStopOthers: stop_mode_dict = {} - def __init__(self, thread_plan, args_data): + def __init__(self, thread_plan, args_data, dict): self.thread_plan = thread_plan self.key = str(args_data.GetValueForKey("token").GetUnsignedIntegerValue(1000)) diff --git a/lldb/test/API/functionalities/thread_plan/wrap_step_over.py b/lldb/test/API/functionalities/thread_plan/wrap_step_over.py index ebb795abfa0e..802aaf2d3ffd 100644 --- a/lldb/test/API/functionalities/thread_plan/wrap_step_over.py +++ b/lldb/test/API/functionalities/thread_plan/wrap_step_over.py @@ -2,7 +2,7 @@ class WrapStepOver: - def __init__(self, thread_plan, args_data): + def __init__(self, thread_plan, args_data, dict): self.plan = thread_plan thread = thread_plan.GetThread() target = thread.GetProcess().GetTarget() diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index 017953b372e3..23162436d42c 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -80,6 +80,26 @@ lldb_private::python::SWIGBridge::LLDBSwigPythonCreateCommandObject( return python::PythonObject(); } +python::PythonObject +lldb_private::python::SWIGBridge::LLDBSwigPythonCreateScriptedThreadPlan( + const char 
*python_class_name, const char *session_dictionary_name, + const StructuredDataImpl &args_data, std::string &error_string, + const lldb::ThreadPlanSP &thread_plan_sp) { + return python::PythonObject(); +} + +bool lldb_private::python::SWIGBridge::LLDBSWIGPythonCallThreadPlan( + void *implementor, const char *method_name, Event *event_sp, + bool &got_error) { + return false; +} + +bool lldb_private::python::SWIGBridge::LLDBSWIGPythonCallThreadPlan( + void *implementor, const char *method_name, Stream *event_sp, + bool &got_error) { + return false; +} + python::PythonObject lldb_private::python::SWIGBridge:: LLDBSwigPythonCreateScriptedBreakpointResolver( const char *python_class_name, const char *session_dictionary_name, @@ -134,16 +154,6 @@ lldb_private::python::LLDBSWIGPython_CastPyObjectToSBError(PyObject *data) { return nullptr; } -void * -lldb_private::python::LLDBSWIGPython_CastPyObjectToSBEvent(PyObject *data) { - return nullptr; -} - -void * -lldb_private::python::LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data) { - return nullptr; -} - void * lldb_private::python::LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data) { return nullptr; @@ -301,11 +311,6 @@ lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::ExecutionContextRefSP) { return python::PythonObject(); } -python::PythonObject -lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::ThreadPlanSP) { - return python::PythonObject(); -} - python::PythonObject lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::ProcessSP) { return python::PythonObject(); @@ -315,18 +320,3 @@ python::PythonObject lldb_private::python::SWIGBridge::ToSWIGWrapper( const lldb_private::StructuredDataImpl &) { return python::PythonObject(); } - -python::PythonObject -lldb_private::python::SWIGBridge::ToSWIGWrapper(Event *event) { - return python::PythonObject(); -} - -python::PythonObject -lldb_private::python::SWIGBridge::ToSWIGWrapper(const Stream *stream) { - return python::PythonObject(); -} - -python::PythonObject 
lldb_private::python::SWIGBridge::ToSWIGWrapper( - std::shared_ptr stream_sb) { - return python::PythonObject(); -} From bf4d99e16789dd711eb61b36ce92b8519f450dd5 Mon Sep 17 00:00:00 2001 From: Mubashar Ahmad Date: Thu, 23 May 2024 09:59:47 +0100 Subject: [PATCH 008/433] [mlir][vector] Add deinterleave operation to vector dialect (#92409) The deinterleave operation constructs two vectors from a single input vector. The first result vector contains the elements from even indexes of the input, and the second contains elements from odd indexes. This is the inverse of a `vector.interleave` operation. Each output's trailing dimension is half of the size of the input vector's trailing dimension. This operation requires the input vector to have a rank > 0 and an even number of elements in its trailing dimension. The operation supports scalable vectors. Example: ```mlir %0, %1 = vector.deinterleave %a : vector<8xi8> -> vector<4xi8> %2, %3 = vector.deinterleave %b : vector<2x8xi8> -> vector<2x4xi8> %4, %5 = vector.deinterleave %c : vector<2x8x4xi8> -> vector<2x8x2xi8> %6, %7 = vector.deinterleave %d : vector<[8]xf32> -> vector<[4]xf32> %8, %9 = vector.deinterleave %e : vector<2x[6]xf64> -> vector<2x[3]xf64> %10, %11 = vector.deinterleave %f : vector<2x4x[6]xf64> -> vector<2x4x[3]xf64> ``` --- .../mlir/Dialect/Vector/IR/VectorOps.td | 80 +++++++++++++++++++ mlir/test/Dialect/Vector/invalid.mlir | 56 +++++++++++++ mlir/test/Dialect/Vector/ops.mlir | 42 ++++++++++ 3 files changed, 178 insertions(+) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 332b5ad08ced..2bb7540ef0b0 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -543,6 +543,86 @@ def Vector_InterleaveOp : }]; } +class ResultIsHalfSourceVectorType : TypesMatchWith< + "the trailing dimension of the results is half the width of source trailing dimension", + "source", result, + [{ + 
[&]() -> ::mlir::VectorType { + auto vectorType = ::llvm::cast($_self); + ::mlir::VectorType::Builder builder(vectorType); + auto lastDim = vectorType.getRank() - 1; + auto newDimSize = vectorType.getDimSize(lastDim) / 2;; + if (newDimSize <= 0) + return vectorType; // (invalid input type) + return builder.setDim(lastDim, newDimSize); + }() + }] +>; + +def SourceVectorEvenElementCount : PredOpTrait< + "the trailing dimension of the source vector has an even number of elements", + CPred<[{ + [&](){ + auto srcVec = getSourceVectorType(); + return srcVec.getDimSize(srcVec.getRank() - 1) % 2 == 0; + }() + }]> +>; + +def Vector_DeinterleaveOp : + Vector_Op<"deinterleave", [Pure, + SourceVectorEvenElementCount, + ResultIsHalfSourceVectorType<"res1">, + AllTypesMatch<["res1", "res2"]> + ]> { + let summary = "constructs two vectors by deinterleaving an input vector"; + let description = [{ + The deinterleave operation constructs two vectors from a single input + vector. The first result vector contains the elements from even indexes + of the input, and the second contains elements from odd indexes. This is + the inverse of a `vector.interleave` operation. + + Each output's trailing dimension is half of the size of the input + vector's trailing dimension. This operation requires the input vector + to have a rank > 0 and an even number of elements in its trailing + dimension. + + The operation supports scalable vectors. 
+ + Example: + ```mlir + %0, %1 = vector.deinterleave %a + : vector<8xi8> -> vector<4xi8> + %2, %3 = vector.deinterleave %b + : vector<2x8xi8> -> vector<2x4xi8> + %4, %5 = vector.deinterleave %c + : vector<2x8x4xi8> -> vector<2x8x2xi8> + %6, %7 = vector.deinterleave %d + : vector<[8]xf32> -> vector<[4]xf32> + %8, %9 = vector.deinterleave %e + : vector<2x[6]xf64> -> vector<2x[3]xf64> + %10, %11 = vector.deinterleave %f + : vector<2x4x[6]xf64> -> vector<2x4x[3]xf64> + ``` + }]; + + let arguments = (ins AnyVector:$source); + let results = (outs AnyVector:$res1, AnyVector:$res2); + + let assemblyFormat = [{ + $source attr-dict `:` type($source) `->` type($res1) + }]; + + let extraClassDeclaration = [{ + VectorType getSourceVectorType() { + return ::llvm::cast(getSource().getType()); + } + VectorType getResultVectorType() { + return ::llvm::cast(getRes1().getType()); + } + }]; + } + def Vector_ExtractElementOp : Vector_Op<"extractelement", [Pure, TypesMatchWith<"result type matches element type of vector operand", diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index c9f7e9c6e2fb..1516f51fe145 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1798,3 +1798,59 @@ func.func @invalid_outerproduct1(%src : memref) { // expected-error @+1 {{'vector.outerproduct' op expected 1-d vector for operand #1}} %op = vector.outerproduct %0, %1 : vector<[4]x[4]xf32>, vector<[4]xf32> } + +// ----- + +func.func @deinterleave_zero_dim_fail(%vec : vector) { + // expected-error @+1 {{'vector.deinterleave' op operand #0 must be vector of any type values, but got 'vector}} + %0, %1 = vector.deinterleave %vec : vector -> vector + return +} + +// ----- + +func.func @deinterleave_one_dim_fail(%vec : vector<1xf32>) { + // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the source vector has an even number of elements}} + %0, %1 = vector.deinterleave %vec : 
vector<1xf32> -> vector<1xf32> + return +} + +// ----- + +func.func @deinterleave_oversized_output_fail(%vec : vector<4xf32>) { + // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the results is half the width of source trailing dimension}} + %0, %1 = "vector.deinterleave" (%vec) : (vector<4xf32>) -> (vector<8xf32>, vector<8xf32>) + return +} + +// ----- + +func.func @deinterleave_output_dim_size_mismatch(%vec : vector<4xf32>) { + // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the results is half the width of source trailing dimension}} + %0, %1 = "vector.deinterleave" (%vec) : (vector<4xf32>) -> (vector<4xf32>, vector<2xf32>) + return +} + +// ----- + +func.func @deinterleave_n_dim_rank_fail(%vec : vector<2x3x4xf32>) { + // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the results is half the width of source trailing dimension}} + %0, %1 = "vector.deinterleave" (%vec) : (vector<2x3x4xf32>) -> (vector<2x3x4xf32>, vector<2x3x2xf32>) + return +} + +// ----- + +func.func @deinterleave_scalable_dim_size_fail(%vec : vector<2x[4]xf32>) { + // expected-error @+1 {{'vector.deinterleave' op failed to verify that all of {res1, res2} have same type}} + %0, %1 = "vector.deinterleave" (%vec) : (vector<2x[4]xf32>) -> (vector<2x[2]xf32>, vector<2x[1]xf32>) + return +} + +// ----- + +func.func @deinterleave_scalable_rank_fail(%vec : vector<2x[4]xf32>) { + // expected-error @+1 {{'vector.deinterleave' op failed to verify that all of {res1, res2} have same type}} + %0, %1 = "vector.deinterleave" (%vec) : (vector<2x[4]xf32>) -> (vector<2x[2]xf32>, vector<[2]xf32>) + return +} diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir index 79a80be4f8b2..9d8101d3eee9 100644 --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -1116,3 +1116,45 @@ func.func @interleave_2d_scalable(%a: 
vector<2x[2]xf64>, %b: vector<2x[2]xf64>) %0 = vector.interleave %a, %b : vector<2x[2]xf64> return %0 : vector<2x[4]xf64> } + +// CHECK-LABEL: @deinterleave_1d +func.func @deinterleave_1d(%arg: vector<4xf32>) -> (vector<2xf32>, vector<2xf32>) { + // CHECK: vector.deinterleave %{{.*}} : vector<4xf32> -> vector<2xf32> + %0, %1 = vector.deinterleave %arg : vector<4xf32> -> vector<2xf32> + return %0, %1 : vector<2xf32>, vector<2xf32> +} + +// CHECK-LABEL: @deinterleave_1d_scalable +func.func @deinterleave_1d_scalable(%arg: vector<[4]xf32>) -> (vector<[2]xf32>, vector<[2]xf32>) { + // CHECK: vector.deinterleave %{{.*}} : vector<[4]xf32> -> vector<[2]xf32> + %0, %1 = vector.deinterleave %arg : vector<[4]xf32> -> vector<[2]xf32> + return %0, %1 : vector<[2]xf32>, vector<[2]xf32> +} + +// CHECK-LABEL: @deinterleave_2d +func.func @deinterleave_2d(%arg: vector<3x4xf32>) -> (vector<3x2xf32>, vector<3x2xf32>) { + // CHECK: vector.deinterleave %{{.*}} : vector<3x4xf32> -> vector<3x2xf32> + %0, %1 = vector.deinterleave %arg : vector<3x4xf32> -> vector<3x2xf32> + return %0, %1 : vector<3x2xf32>, vector<3x2xf32> +} + +// CHECK-LABEL: @deinterleave_2d_scalable +func.func @deinterleave_2d_scalable(%arg: vector<3x[4]xf32>) -> (vector<3x[2]xf32>, vector<3x[2]xf32>) { + // CHECK: vector.deinterleave %{{.*}} : vector<3x[4]xf32> -> vector<3x[2]xf32> + %0, %1 = vector.deinterleave %arg : vector<3x[4]xf32> -> vector<3x[2]xf32> + return %0, %1 : vector<3x[2]xf32>, vector<3x[2]xf32> +} + +// CHECK-LABEL: @deinterleave_nd +func.func @deinterleave_nd(%arg: vector<2x3x4x6xf32>) -> (vector<2x3x4x3xf32>, vector<2x3x4x3xf32>) { + // CHECK: vector.deinterleave %{{.*}} : vector<2x3x4x6xf32> -> vector<2x3x4x3xf32> + %0, %1 = vector.deinterleave %arg : vector<2x3x4x6xf32> -> vector<2x3x4x3xf32> + return %0, %1 : vector<2x3x4x3xf32>, vector<2x3x4x3xf32> +} + +// CHECK-LABEL: @deinterleave_nd_scalable +func.func @deinterleave_nd_scalable(%arg:vector<2x3x4x[6]xf32>) -> (vector<2x3x4x[3]xf32>, 
vector<2x3x4x[3]xf32>) { + // CHECK: vector.deinterleave %{{.*}} : vector<2x3x4x[6]xf32> -> vector<2x3x4x[3]xf32> + %0, %1 = vector.deinterleave %arg : vector<2x3x4x[6]xf32> -> vector<2x3x4x[3]xf32> + return %0, %1 : vector<2x3x4x[3]xf32>, vector<2x3x4x[3]xf32> +} From 8930ba98e01bc66949e482b396f8389d64388359 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Thu, 23 May 2024 10:09:22 +0100 Subject: [PATCH 009/433] [clang][FMV] Allow declaration of function versions in namespaces. (#93044) Fixes the following bug: namespace Name { int __attribute((target_version("default"))) foo() { return 0; } } namespace Name { int __attribute((target_version("sve"))) foo() { return 1; } } int bar() { return Name::foo(); } error: redefinition of 'foo' int __attribute((target_version("sve"))) foo() { return 1; } note: previous definition is here int __attribute((target_version("default"))) foo() { return 0; } While fixing this I also found that in the absence of default version declaration, the one we implicitly create has incorrect mangling if we are in a namespace: namespace OtherName { int __attribute((target_version("sve"))) foo() { return 2; } } int baz() { return OtherName::foo(); } In this example instead of creating a declaration for the symbol @_ZN9OtherName3fooEv.default we are creating one for the symbol @_Z3foov.default (the namespace mangling prefix is omitted). This has now been fixed. 
--- clang/lib/CodeGen/CodeGenModule.cpp | 2 +- clang/lib/Sema/SemaDecl.cpp | 4 +- clang/test/CodeGenCXX/fmv-namespace.cpp | 93 +++++++++++++++++++++++++ clang/test/Sema/fmv-namespace.cpp | 12 ++++ 4 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 clang/test/CodeGenCXX/fmv-namespace.cpp create mode 100644 clang/test/Sema/fmv-namespace.cpp diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 60ef28a0effa..e4774a587707 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -4150,7 +4150,7 @@ llvm::GlobalValue::LinkageTypes getMultiversionLinkage(CodeGenModule &CGM, } static FunctionDecl *createDefaultTargetVersionFrom(const FunctionDecl *FD) { - DeclContext *DeclCtx = FD->getASTContext().getTranslationUnitDecl(); + auto *DeclCtx = const_cast(FD->getDeclContext()); TypeSourceInfo *TInfo = FD->getTypeSourceInfo(); StorageClass SC = FD->getStorageClass(); DeclarationName Name = FD->getNameInfo().getName(); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index b5c3a27ab06e..2a87b26f17a2 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -11868,8 +11868,8 @@ static bool CheckMultiVersionFunction(Sema &S, FunctionDecl *NewFD, return false; if (!OldDecl || !OldDecl->getAsFunction() || - OldDecl->getDeclContext()->getRedeclContext() != - NewFD->getDeclContext()->getRedeclContext()) { + !OldDecl->getDeclContext()->getRedeclContext()->Equals( + NewFD->getDeclContext()->getRedeclContext())) { // If there's no previous declaration, AND this isn't attempting to cause // multiversioning, this isn't an error condition. 
if (MVKind == MultiVersionKind::None) diff --git a/clang/test/CodeGenCXX/fmv-namespace.cpp b/clang/test/CodeGenCXX/fmv-namespace.cpp new file mode 100644 index 000000000000..5bcd0da06eeb --- /dev/null +++ b/clang/test/CodeGenCXX/fmv-namespace.cpp @@ -0,0 +1,93 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 +// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s + +namespace Name { +int __attribute((target_version("default"))) foo() { return 0; } +} + +namespace Name { +int __attribute((target_version("sve"))) foo() { return 1; } +} + +int bar() { return Name::foo(); } + +namespace OtherName { +int __attribute((target_version("sve"))) foo() { return 2; } +} + +int baz() { return OtherName::foo(); } + +//. +// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } +// CHECK: @_ZN4Name3fooEv.ifunc = weak_odr alias i32 (), ptr @_ZN4Name3fooEv +// CHECK: @_ZN9OtherName3fooEv.ifunc = weak_odr alias i32 (), ptr @_ZN9OtherName3fooEv +// CHECK: @_ZN4Name3fooEv = weak_odr ifunc i32 (), ptr @_ZN4Name3fooEv.resolver +// CHECK: @_ZN9OtherName3fooEv = weak_odr ifunc i32 (), ptr @_ZN9OtherName3fooEv.resolver +//. 
+// CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv.default( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv._Msve( +// CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret i32 1 +// +// +// CHECK-LABEL: define dso_local noundef i32 @_Z3barv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN4Name3fooEv() +// CHECK-NEXT: ret i32 [[CALL]] +// +// +// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: +// CHECK-NEXT: ret ptr @_ZN4Name3fooEv._Msve +// CHECK: [[RESOLVER_ELSE]]: +// CHECK-NEXT: ret ptr @_ZN4Name3fooEv.default +// +// +// CHECK-LABEL: define dso_local noundef i32 @_ZN9OtherName3fooEv._Msve( +// CHECK-SAME: ) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret i32 2 +// +// +// CHECK-LABEL: define dso_local noundef i32 @_Z3bazv( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN9OtherName3fooEv() +// CHECK-NEXT: ret i32 [[CALL]] +// +// +// CHECK-LABEL: define weak_odr ptr @_ZN9OtherName3fooEv.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq 
i64 [[TMP1]], 1073741824 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: +// CHECK-NEXT: ret ptr @_ZN9OtherName3fooEv._Msve +// CHECK: [[RESOLVER_ELSE]]: +// CHECK-NEXT: ret ptr @_ZN9OtherName3fooEv.default +// +//. +// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } +// CHECK: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/Sema/fmv-namespace.cpp b/clang/test/Sema/fmv-namespace.cpp new file mode 100644 index 000000000000..1c12fd66cf24 --- /dev/null +++ b/clang/test/Sema/fmv-namespace.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify %s +// expected-no-diagnostics + +namespace Name { +int __attribute((target_version("default"))) foo() { return 0; } +} + +namespace Name { +int __attribute((target_version("sve"))) foo() { return 1; } +} + +int bar() { return Name::foo(); } From a2824632cba8e7d98a5cbf9acb3ca5d9960c95cb Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 23 May 2024 11:23:07 +0200 Subject: [PATCH 010/433] [lldb/DWARF] Make sure bad abbreviation codes do not crash lldb (#93006) We currently cannot represent abbreviation codes with more than 16 bits, and we were lldb-asserting if we ever ran into one. 
While I haven't seen any real DWARF with these kinds of abbreviations, it is possible to hit this with handcrafted evil dwarf, due to some sort of corruption, or just bugs (the addition of PeekDIEName makes these bugs more likely, as the function blindly dereferences offsets within the debug info section). Missing abbreviations were already reporting an error. This patch turns large abbreviations into an error as well, and adds a test for both cases. --- .../SymbolFile/DWARF/DWARFDebugInfoEntry.cpp | 40 ++++++++-------- .../SymbolFile/DWARF/DWARFDebugInfoEntry.h | 2 +- .../Plugins/SymbolFile/DWARF/DWARFUnit.cpp | 6 +-- .../DWARF/x86/invalid_abbreviation.s | 47 +++++++++++++++++++ 4 files changed, 70 insertions(+), 25 deletions(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp index 1b0fefedf983..688a287a0650 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include "llvm/Support/LEB128.h" @@ -41,13 +42,23 @@ extern int g_verbose; // Extract a debug info entry for a given DWARFUnit from the data // starting at the offset in offset_ptr bool DWARFDebugInfoEntry::Extract(const DWARFDataExtractor &data, - const DWARFUnit *cu, + const DWARFUnit &unit, lldb::offset_t *offset_ptr) { m_offset = *offset_ptr; + auto report_error = [&](const char *fmt, const auto &...vals) { + unit.GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError( + "[{0:x16}]: {1}, please file a bug and " + "attach the file at the start of this error message", + static_cast(m_offset), llvm::formatv(fmt, vals...)); + *offset_ptr = std::numeric_limits::max(); + return false; + }; + m_parent_idx = 0; m_sibling_idx = 0; const uint64_t abbr_idx = data.GetULEB128(offset_ptr); - 
lldbassert(abbr_idx <= UINT16_MAX); + if (abbr_idx > std::numeric_limits::max()) + return report_error("abbreviation code {0} too big", abbr_idx); m_abbr_idx = abbr_idx; if (m_abbr_idx == 0) { @@ -56,31 +67,18 @@ bool DWARFDebugInfoEntry::Extract(const DWARFDataExtractor &data, return true; // NULL debug tag entry } - const auto *abbrevDecl = GetAbbreviationDeclarationPtr(cu); - if (abbrevDecl == nullptr) { - cu->GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError( - "[{0:x16}]: invalid abbreviation code {1}, " - "please file a bug and " - "attach the file at the start of this error message", - (uint64_t)m_offset, (unsigned)abbr_idx); - // WE can't parse anymore if the DWARF is borked... - *offset_ptr = UINT32_MAX; - return false; - } + const auto *abbrevDecl = GetAbbreviationDeclarationPtr(&unit); + if (abbrevDecl == nullptr) + return report_error("invalid abbreviation code {0}", abbr_idx); + m_tag = abbrevDecl->getTag(); m_has_children = abbrevDecl->hasChildren(); // Skip all data in the .debug_info or .debug_types for the attributes for (const auto &attribute : abbrevDecl->attributes()) { - if (DWARFFormValue::SkipValue(attribute.Form, data, offset_ptr, cu)) + if (DWARFFormValue::SkipValue(attribute.Form, data, offset_ptr, &unit)) continue; - cu->GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError( - "[{0:x16}]: Unsupported DW_FORM_{1:x}, please file a bug " - "and " - "attach the file at the start of this error message", - (uint64_t)m_offset, (unsigned)attribute.Form); - *offset_ptr = m_offset; - return false; + return report_error("Unsupported DW_FORM_{1:x}", attribute.Form); } return true; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h index c19fa7428549..6773b00e8206 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h @@ -49,7 +49,7 @@ class DWARFDebugInfoEntry { 
void BuildFunctionAddressRangeTable(DWARFUnit *cu, DWARFDebugAranges *debug_aranges) const; - bool Extract(const DWARFDataExtractor &data, const DWARFUnit *cu, + bool Extract(const DWARFDataExtractor &data, const DWARFUnit &cu, lldb::offset_t *offset_ptr); using Recurse = DWARFBaseDIE::Recurse; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp index 3a57ec970b07..66a762bf9b68 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp @@ -63,7 +63,7 @@ void DWARFUnit::ExtractUnitDIENoDwoIfNeeded() { // parse const DWARFDataExtractor &data = GetData(); if (offset < GetNextUnitOffset() && - m_first_die.Extract(data, this, &offset)) { + m_first_die.Extract(data, *this, &offset)) { AddUnitDIE(m_first_die); return; } @@ -242,7 +242,7 @@ void DWARFUnit::ExtractDIEsRWLocked() { die_index_stack.reserve(32); die_index_stack.push_back(0); bool prev_die_had_children = false; - while (offset < next_cu_offset && die.Extract(data, this, &offset)) { + while (offset < next_cu_offset && die.Extract(data, *this, &offset)) { const bool null_die = die.IsNULL(); if (depth == 0) { assert(m_die_array.empty() && "Compile unit DIE already added"); @@ -670,7 +670,7 @@ DWARFUnit::GetDIE(dw_offset_t die_offset) { llvm::StringRef DWARFUnit::PeekDIEName(dw_offset_t die_offset) { DWARFDebugInfoEntry die; - if (!die.Extract(GetData(), this, &die_offset)) + if (!die.Extract(GetData(), *this, &die_offset)) return llvm::StringRef(); // Does die contain a DW_AT_Name? 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s b/lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s new file mode 100644 index 000000000000..3f32c037aeb2 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s @@ -0,0 +1,47 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t +# RUN: %lldb %t \ +# RUN: -o exit 2>&1 | FileCheck %s + +# CHECK-DAG: error: {{.*}} [0x0000000000000022]: abbreviation code 65536 too big, please file a bug and attach the file at the start of this error message +# CHECK-DAG: error: {{.*}} [0x0000000000000048]: invalid abbreviation code 47, please file a bug and attach the file at the start of this error message + + + .section .debug_abbrev,"",@progbits + .uleb128 65535 # Largest representable Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .uleb128 65535 # DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .uleb128 65536 # Unrepresentable abbreviation + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + + .section .debug_info,"",@progbits +.Lcu_begin1: + .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit +.Ldebug_info_start1: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .uleb128 65535 # DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .byte 47 # Missing abbreviation + .byte 0 # End Of Children Mark +.Ldebug_info_end1: From 8a3537bfefa295e12ddbdb59cb8f76281ce302a0 Mon Sep 17 00:00:00 2001 From: csstormq Date: Thu, 23 May 2024 17:27:07 +0800 Subject: [PATCH 011/433] [llvm][ScheduleDAG] SUnit::biasCriticalPath() does not find the critical path consistently (#93001) Patch co-authored by AtariDreams (gfunni234@gmail.com). Fixes #38037. [AMDGPU] Update test results to fix build (#92982) --- llvm/lib/CodeGen/ScheduleDAG.cpp | 4 +- llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 395 ++-- llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 395 ++-- llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 1592 ++++++++--------- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 1592 ++++++++--------- llvm/test/CodeGen/AMDGPU/shl.ll | 216 +-- .../test/CodeGen/X86/misched-critical-path.ll | 35 + 7 files changed, 2115 insertions(+), 2114 deletions(-) create mode 100644 llvm/test/CodeGen/X86/misched-critical-path.ll diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp index de8e6f63794d..8d9a5041fc2f 100644 --- a/llvm/lib/CodeGen/ScheduleDAG.cpp +++ b/llvm/lib/CodeGen/ScheduleDAG.cpp @@ -331,8 +331,10 @@ void SUnit::biasCriticalPath() { unsigned MaxDepth = BestI->getSUnit()->getDepth(); for (SUnit::pred_iterator I = std::next(BestI), E = Preds.end(); I != E; ++I) { - if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth) + if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth) { + MaxDepth = I->getSUnit()->getDepth(); BestI = I; + } } if (BestI != Preds.begin()) std::swap(*Preds.begin(), *BestI); diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 64063f65e288..04ef30bd26aa 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -253,25 +253,25 @@ define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) 
%out, float %in) { ; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y, ; EG-NEXT: 8388608(1.175494e-38), -150(nan) ; EG-NEXT: ADD_INT T0.X, T0.W, literal.x, -; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W, -; EG-NEXT: AND_INT T0.Z, PS, literal.z, +; EG-NEXT: AND_INT T0.Y, PS, literal.y, +; EG-NEXT: SUB_INT T0.Z, literal.z, T0.W, ; EG-NEXT: NOT_INT T0.W, PS, ; EG-NEXT: LSHR * T3.W, PV.W, 1, -; EG-NEXT: -127(nan), 150(2.101948e-43) -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) ; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T1.Y, T1.W, PV.Z, -; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122 -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT T1.Y, PV.Z, literal.x, +; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.W, PV.Z, +; EG-NEXT: LSHL T0.W, T1.W, PV.Y, +; EG-NEXT: AND_INT * T1.W, T2.W, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0, -; EG-NEXT: CNDE_INT T0.W, PV.Z, PV.X, PV.Y, +; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0, +; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W, ; EG-NEXT: SETGT_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.Z, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T0.W, PS, PV.Y, PV.Z, +; EG-NEXT: CNDE_INT T1.Z, PS, 0.0, PV.W, +; EG-NEXT: CNDE_INT T0.W, PS, PV.Z, PV.Y, ; EG-NEXT: ASHR * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T0.W, PV.W, PS, @@ -364,79 +364,78 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; ; EG-LABEL: fp_to_sint_v2i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 74, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: ; 
EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W, -; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W, -; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z, -; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44) +; EG-NEXT: BFE_UINT T0.Z, KC0[3].X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T0.W, KC0[2].W, literal.x, PV.W, +; EG-NEXT: AND_INT * T1.Z, KC0[2].W, literal.y, +; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) +; EG-NEXT: ADD_INT T1.W, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, PV.Z, literal.x, ; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: SUB_INT T0.X, literal.x, PV.W, -; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W, -; EG-NEXT: AND_INT T1.Z, PS, literal.y, -; EG-NEXT: OR_INT T3.W, PV.Z, literal.z, +; EG-NEXT: AND_INT T0.X, PS, literal.x, +; EG-NEXT: AND_INT T0.Y, PV.W, literal.x, +; EG-NEXT: OR_INT T1.Z, T1.Z, literal.y, +; EG-NEXT: SUB_INT T3.W, literal.z, T0.W, ; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w, -; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44) -; EG-NEXT: 8388608(1.175494e-38), 8388607(1.175494e-38) +; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38) +; EG-NEXT: 150(2.101948e-43), 8388607(1.175494e-38) ; EG-NEXT: OR_INT T1.X, PS, literal.x, -; EG-NEXT: LSHL T1.Y, PV.W, PV.Z, -; EG-NEXT: AND_INT T0.Z, T2.W, literal.y, -; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.W, PV.Y, -; EG-NEXT: AND_INT * T5.W, PV.Y, literal.y, +; EG-NEXT: AND_INT T1.Y, PV.W, literal.y, +; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.Z, PV.W, +; EG-NEXT: LSHL T3.W, PV.Z, PV.Y, +; EG-NEXT: AND_INT * T4.W, T1.W, literal.y, ; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0, -; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x, -; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X, -; EG-NEXT: AND_INT * T5.W, T0.X, literal.y, -; EG-NEXT: -150(nan), 
32(4.484155e-44) +; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T2.Z, PV.Y, PV.Z, 0.0, +; EG-NEXT: LSHL T5.W, PV.X, T0.X, +; EG-NEXT: AND_INT * T6.W, T2.W, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0, -; EG-NEXT: NOT_INT T2.Y, T2.W, -; EG-NEXT: AND_INT T2.Z, PV.Z, literal.x, -; EG-NEXT: NOT_INT T2.W, PV.Z, -; EG-NEXT: LSHR * T4.W, T1.X, 1, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T3.X, T3.W, 1, -; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W, -; EG-NEXT: LSHL T0.W, T1.X, PV.Z, -; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y, +; EG-NEXT: NOT_INT T1.Y, T1.W, +; EG-NEXT: SUB_INT T3.Z, literal.x, T0.Z, +; EG-NEXT: NOT_INT T1.W, T2.W, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR * T2.W, T1.X, 1, +; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T2.X, T1.Z, 1, +; EG-NEXT: ADD_INT T2.Y, T0.Z, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, PS, PV.W, +; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.X, PV.Z, +; EG-NEXT: AND_INT * T2.W, PV.Z, literal.y, ; EG-NEXT: -127(nan), 32(4.484155e-44) ; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W, -; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y, -; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y, +; EG-NEXT: CNDE_INT T3.Y, T6.W, PV.Z, T5.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGT_INT T0.Z, PV.Y, literal.x, +; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, PV.X, T1.Y, +; EG-NEXT: ADD_INT * T0.W, T0.W, literal.y, ; EG-NEXT: 23(3.222986e-44), -127(nan) -; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y, +; EG-NEXT: CNDE_INT T2.X, T4.W, PV.W, T3.W, ; EG-NEXT: SETGT_INT T1.Y, PS, literal.x, -; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y, -; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X, +; EG-NEXT: CNDE_INT T1.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T0.X, ; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y, ; EG-NEXT: 
23(3.222986e-44), 31(4.344025e-44) ; EG-NEXT: XOR_INT T0.X, PV.W, PS, -; EG-NEXT: XOR_INT T2.Y, PV.Z, PS, +; EG-NEXT: XOR_INT T3.Y, PV.Z, PS, ; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X, -; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Y, T2.Z, T0.Y, ; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T0.Y, PV.W, PS, ; EG-NEXT: XOR_INT T0.Z, PV.Z, PS, -; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W, +; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W, ; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W, ; EG-NEXT: SUB_INT T1.Y, PV.W, PS, -; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y, -; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W, +; EG-NEXT: SETGT_INT T1.Z, 0.0, T2.Y, +; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W, ; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W, ; EG-NEXT: SUB_INT T0.Z, PV.W, PS, -; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W, +; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W, ; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0, ; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0, ; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W, @@ -567,170 +566,168 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % ; ; EG-LABEL: fp_to_sint_v4i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1 +; EG-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 54, @106, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 6: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W, -; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y, +; EG-NEXT: BFE_UINT T1.W, KC0[3].Z, literal.x, PV.W, +; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y, ; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) -; 
EG-NEXT: OR_INT T0.Z, PS, literal.x, -; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W, -; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z, -; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44) -; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x, -; EG-NEXT: AND_INT T1.Z, PS, literal.y, -; EG-NEXT: NOT_INT T4.W, PS, -; EG-NEXT: LSHR * T5.W, PV.Z, 1, -; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: OR_INT T2.W, PS, literal.x, +; EG-NEXT: ADD_INT * T3.W, PV.W, literal.y, +; EG-NEXT: 8388608(1.175494e-38), -150(nan) ; EG-NEXT: ADD_INT T0.X, T1.W, literal.x, -; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W, -; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201 -; EG-NEXT: LSHL T3.W, T0.Z, PV.Z, -; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W, -; EG-NEXT: -127(nan), 32(4.484155e-44) -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.X, PS, literal.x, -; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS, -; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y, -; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W, -; EG-NEXT: SETGT_INT * T4.W, PV.X, literal.z, +; EG-NEXT: BFE_UINT T0.Y, KC0[4].X, literal.y, T0.W, +; EG-NEXT: AND_INT T0.Z, PS, literal.z, +; EG-NEXT: NOT_INT T4.W, PS, +; EG-NEXT: LSHR * T5.W, PV.W, 1, +; EG-NEXT: -127(nan), 23(3.222986e-44) +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W, +; EG-NEXT: AND_INT T1.Y, T3.W, literal.x, +; EG-NEXT: LSHL T0.Z, T2.W, PV.Z, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T3.W, KC0[4].X, literal.y, +; EG-NEXT: ADD_INT * T4.W, PV.Y, literal.z, ; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38) +; EG-NEXT: -150(nan), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Y, PS, literal.x, +; EG-NEXT: OR_INT T1.Z, PV.W, literal.y, +; EG-NEXT: CNDE_INT T3.W, PV.Y, PV.X, PV.Z, +; EG-NEXT: SETGT_INT * T5.W, T0.X, literal.z, +; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T2.X, PS, 0.0, PV.W, -; EG-NEXT: OR_INT T1.Y, PV.Z, 
literal.x, -; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y, -; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0, -; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) -; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS, -; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x, -; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x, -; EG-NEXT: NOT_INT T1.W, PV.Z, -; EG-NEXT: LSHR * T3.W, PV.Y, 1, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z, -; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y, -; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y, -; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W, -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.X, T0.Z, literal.x, -; EG-NEXT: AND_INT T4.Y, PS, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y, -; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y, -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: SUB_INT T2.X, PV.W, PS, -; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0, -; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0, -; EG-NEXT: CNDE_INT T1.W, PV.X, T3.X, T3.Y, BS:VEC_021/SCL_122 -; EG-NEXT: SETGT_INT * T2.W, T0.Y, literal.x, +; EG-NEXT: CNDE_INT T3.Y, PS, 0.0, PV.W, +; EG-NEXT: SUB_INT T2.Z, literal.x, T1.W, +; EG-NEXT: LSHL T1.W, PV.Z, PV.Y, +; EG-NEXT: AND_INT * T3.W, T4.W, literal.y, +; EG-NEXT: 150(2.101948e-43), 32(4.484155e-44) +; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, +; EG-NEXT: AND_INT T2.Y, PV.Z, literal.x, +; EG-NEXT: SUB_INT T3.Z, literal.y, T0.Y, +; EG-NEXT: NOT_INT T4.W, T4.W, +; EG-NEXT: LSHR * T6.W, T1.Z, 1, +; EG-NEXT: 32(4.484155e-44), 150(2.101948e-43) +; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, T2.W, T2.Z, +; EG-NEXT: ADD_INT T0.Y, T0.Y, literal.x, +; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W, +; EG-NEXT: BIT_ALIGN_INT T2.W, 0.0, T1.Z, PV.Z, +; EG-NEXT: AND_INT * T4.W, PV.Z, literal.y, +; EG-NEXT: -127(nan), 32(4.484155e-44) +; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T4.Y, T3.W, 
PV.Z, T1.W, +; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, +; EG-NEXT: CNDE_INT T1.W, T1.Y, T0.Z, 0.0, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, PV.X, 0.0, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W, -; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y, -; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z, -; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z, -; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) +; EG-NEXT: CNDE_INT T2.X, T5.W, PS, PV.W, +; EG-NEXT: ASHR T1.Y, KC0[3].Z, literal.x, +; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X, +; EG-NEXT: ASHR * T2.W, KC0[4].X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W, -; EG-NEXT: XOR_INT T1.Y, PV.W, PS, +; EG-NEXT: XOR_INT T2.Y, PV.W, PS, ; EG-NEXT: XOR_INT T0.Z, PV.Z, PS, -; EG-NEXT: OR_INT T0.W, PV.Y, literal.y, -; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X, -; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38) +; EG-NEXT: XOR_INT T1.W, PV.X, PV.Y, +; EG-NEXT: XOR_INT * T3.W, T3.Y, PV.Y, +; EG-NEXT: SUB_INT T3.Y, PS, T1.Y, +; EG-NEXT: SUBB_UINT T1.Z, PV.W, T1.Y, +; EG-NEXT: SUB_INT T3.W, PV.Z, T2.W, +; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T2.W, +; EG-NEXT: SUB_INT T4.Y, PV.W, PS, +; EG-NEXT: SUB_INT T0.Z, PV.Y, PV.Z, +; EG-NEXT: BFE_UINT T3.W, KC0[3].Y, literal.x, T0.W, +; EG-NEXT: AND_INT * T4.W, KC0[3].Y, literal.y, +; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) +; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X, +; EG-NEXT: ADD_INT T3.Y, PV.W, literal.x, +; EG-NEXT: OR_INT T1.Z, PS, literal.y, +; EG-NEXT: BFE_UINT T0.W, KC0[3].W, literal.z, T0.W, +; EG-NEXT: ADD_INT * T4.W, PV.W, literal.w, +; EG-NEXT: -127(nan), 8388608(1.175494e-38) +; EG-NEXT: 23(3.222986e-44), -150(nan) +; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x, +; EG-NEXT: ADD_INT T5.Y, PV.W, literal.y, +; EG-NEXT: SUB_INT T2.Z, literal.z, T3.W, +; EG-NEXT: NOT_INT T3.W, PS, +; EG-NEXT: LSHR * T5.W, 
PV.Z, 1, +; EG-NEXT: 8388607(1.175494e-38), -150(nan) ; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x, -; EG-NEXT: AND_INT T3.Y, PS, literal.y, -; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS, -; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W, -; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W, -; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44) -; EG-NEXT: SUB_INT T5.X, PV.W, PS, -; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y, -; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0, -; EG-NEXT: OR_INT T1.W, PV.X, literal.x, -; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) -; EG-NEXT: ADD_INT T4.X, T3.X, literal.x, -; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X, -; EG-NEXT: AND_INT T2.Z, PS, literal.z, -; EG-NEXT: NOT_INT T4.W, PS, -; EG-NEXT: LSHR * T5.W, PV.W, 1, -; EG-NEXT: -127(nan), 150(2.101948e-43) -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T4.Y, T1.W, PV.Z, -; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122 -; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x, +; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, PS, PV.W, +; EG-NEXT: AND_INT T6.Y, PV.Z, literal.x, +; EG-NEXT: AND_INT T3.Z, PV.Y, literal.y, +; EG-NEXT: OR_INT T3.W, PV.X, literal.z, +; EG-NEXT: AND_INT * T5.W, T4.W, literal.y, +; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) +; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, T1.Z, T2.Z, +; EG-NEXT: LSHL T7.Y, T1.Z, PS, +; EG-NEXT: AND_INT T1.Z, T4.W, literal.x, +; EG-NEXT: LSHL T4.W, PV.W, PV.Z, +; EG-NEXT: AND_INT * T5.W, T5.Y, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T6.X, T1.X, literal.x, -; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0, -; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 108: -; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y, -; EG-NEXT: SETGT_INT * 
T3.W, T4.X, literal.x, +; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T8.Y, PV.Z, PV.Y, 0.0, +; EG-NEXT: CNDE_INT * T2.Z, T6.Y, PV.X, 0.0, +; EG-NEXT: ALU clause starting at 106: +; EG-NEXT: CNDE_INT T6.W, T1.Z, T2.X, T7.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SETGT_INT * T7.W, T3.Y, literal.x, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z, -; EG-NEXT: AND_INT T2.Z, T6.X, literal.x, -; EG-NEXT: NOT_INT T1.W, T6.X, -; EG-NEXT: LSHR * T3.W, T0.W, 1, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x, -; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y, -; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W, -; EG-NEXT: LSHL T0.W, T0.W, PV.Z, -; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, +; EG-NEXT: CNDE_INT T1.X, PS, 0.0, PV.W, +; EG-NEXT: CNDE_INT T6.Y, PS, T2.Z, T8.Y, +; EG-NEXT: SUB_INT T1.Z, literal.x, T0.W, +; EG-NEXT: NOT_INT T6.W, T5.Y, +; EG-NEXT: LSHR * T7.W, T3.W, 1, +; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) +; EG-NEXT: ASHR T2.X, KC0[3].Y, literal.x, +; EG-NEXT: ADD_INT T5.Y, T0.W, literal.y, +; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W, +; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T3.W, PV.Z, +; EG-NEXT: AND_INT * T3.W, PV.Z, literal.z, ; EG-NEXT: 31(4.344025e-44), -127(nan) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W, -; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x, -; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X, -; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X, +; EG-NEXT: CNDE_INT T4.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T7.Y, T5.W, PV.Z, T4.W, +; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, T6.Y, PV.X, +; EG-NEXT: XOR_INT * T3.W, T1.X, PV.X, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: SUB_INT T3.X, PS, T7.X, -; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X, -; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y, -; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X, -; EG-NEXT: 
ASHR * T3.W, KC0[3].W, literal.x, +; EG-NEXT: SUB_INT T1.X, PS, T2.X, +; EG-NEXT: SUBB_UINT T6.Y, PV.W, T2.X, +; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T3.W, PV.Z, PV.X, T3.X, +; EG-NEXT: ASHR * T4.W, KC0[3].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: XOR_INT T1.X, PV.W, PS, -; EG-NEXT: XOR_INT T5.Y, PV.Z, PS, -; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y, -; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122 -; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0, -; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X, +; EG-NEXT: XOR_INT T3.X, PV.W, PS, +; EG-NEXT: XOR_INT T7.Y, PV.Z, PS, +; EG-NEXT: SUB_INT T1.Z, PV.X, PV.Y, +; EG-NEXT: SETGT_INT T3.W, 0.0, T3.Y, +; EG-NEXT: CNDE_INT * T6.W, T0.X, T0.Z, 0.0, +; EG-NEXT: SETGT_INT T1.X, 0.0, T0.Y, ; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0, -; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W, -; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W, -; EG-NEXT: SUB_INT T3.X, PV.W, PS, -; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y, -; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0, -; EG-NEXT: SUB_INT T0.W, T0.W, T7.X, BS:VEC_021/SCL_122 -; EG-NEXT: CNDE_INT * T4.W, PV.X, T2.X, 0.0, -; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0, -; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0, -; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y, -; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, +; EG-NEXT: SUB_INT T0.Z, T1.W, T1.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T1.W, PV.Y, T4.W, +; EG-NEXT: SUBB_UINT * T5.W, PV.X, T4.W, +; EG-NEXT: SUB_INT T4.X, PV.W, PS, +; EG-NEXT: SETGT_INT T0.Y, 0.0, T5.Y, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T6.Z, T0.X, PV.Z, 0.0, +; EG-NEXT: SUB_INT T0.W, T0.W, T2.X, +; EG-NEXT: CNDE_INT * T1.W, PV.X, T4.Y, 0.0, +; EG-NEXT: CNDE_INT T6.X, T3.W, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.X, 0.0, +; EG-NEXT: SUB_INT T0.W, T2.Y, T2.W, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0, -; EG-NEXT: SUB_INT * T0.W, 
T1.X, T3.W, BS:VEC_120/SCL_212 -; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Z, T1.X, PV.W, 0.0, +; EG-NEXT: SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT T1.X, T0.Y, PV.W, 0.0, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR * T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T2.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %conv = fptosi <4 x float> %x to <4 x i64> store <4 x i64> %conv, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index 5170f9c76db2..5abf82aa1aab 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -200,25 +200,25 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x ; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y, ; EG-NEXT: 8388608(1.175494e-38), -150(nan) ; EG-NEXT: ADD_INT T0.X, T0.W, literal.x, -; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W, -; EG-NEXT: AND_INT T0.Z, PS, literal.z, +; EG-NEXT: AND_INT T0.Y, PS, literal.y, +; EG-NEXT: SUB_INT T0.Z, literal.z, T0.W, ; EG-NEXT: NOT_INT T0.W, PS, ; EG-NEXT: LSHR * T3.W, PV.W, 1, -; EG-NEXT: -127(nan), 150(2.101948e-43) -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) ; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T1.Y, T1.W, PV.Z, -; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122 -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT T1.Y, PV.Z, literal.x, +; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.W, PV.Z, +; EG-NEXT: LSHL T0.W, T1.W, PV.Y, +; EG-NEXT: AND_INT * T1.W, T2.W, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0, -; EG-NEXT: CNDE_INT T0.W, PV.Z, PV.X, PV.Y, +; 
EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0, +; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W, ; EG-NEXT: SETGT_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T0.Z, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T0.W, PS, PV.Y, PV.Z, +; EG-NEXT: CNDE_INT T1.Z, PS, 0.0, PV.W, +; EG-NEXT: CNDE_INT T0.W, PS, PV.Z, PV.Y, ; EG-NEXT: ASHR * T1.W, KC0[2].Z, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T0.W, PV.W, PS, @@ -288,79 +288,78 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; ; EG-LABEL: fp_to_uint_v2f32_to_v2i64: ; EG: ; %bb.0: -; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 74, @4, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W, -; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W, -; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z, -; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44) +; EG-NEXT: BFE_UINT T0.Z, KC0[3].X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T0.W, KC0[2].W, literal.x, PV.W, +; EG-NEXT: AND_INT * T1.Z, KC0[2].W, literal.y, +; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) +; EG-NEXT: ADD_INT T1.W, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, PV.Z, literal.x, ; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: SUB_INT T0.X, literal.x, PV.W, -; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W, -; EG-NEXT: AND_INT T1.Z, PS, literal.y, -; EG-NEXT: OR_INT T3.W, PV.Z, literal.z, +; EG-NEXT: AND_INT T0.X, PS, literal.x, +; EG-NEXT: AND_INT T0.Y, PV.W, literal.x, +; EG-NEXT: OR_INT T1.Z, T1.Z, literal.y, +; EG-NEXT: SUB_INT T3.W, literal.z, T0.W, ; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w, -; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44) -; 
EG-NEXT: 8388608(1.175494e-38), 8388607(1.175494e-38) +; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38) +; EG-NEXT: 150(2.101948e-43), 8388607(1.175494e-38) ; EG-NEXT: OR_INT T1.X, PS, literal.x, -; EG-NEXT: LSHL T1.Y, PV.W, PV.Z, -; EG-NEXT: AND_INT T0.Z, T2.W, literal.y, -; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.W, PV.Y, -; EG-NEXT: AND_INT * T5.W, PV.Y, literal.y, +; EG-NEXT: AND_INT T1.Y, PV.W, literal.y, +; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.Z, PV.W, +; EG-NEXT: LSHL T3.W, PV.Z, PV.Y, +; EG-NEXT: AND_INT * T4.W, T1.W, literal.y, ; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44) -; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0, -; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x, -; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X, -; EG-NEXT: AND_INT * T5.W, T0.X, literal.y, -; EG-NEXT: -150(nan), 32(4.484155e-44) +; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T2.Z, PV.Y, PV.Z, 0.0, +; EG-NEXT: LSHL T5.W, PV.X, T0.X, +; EG-NEXT: AND_INT * T6.W, T2.W, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0, -; EG-NEXT: NOT_INT T2.Y, T2.W, -; EG-NEXT: AND_INT T2.Z, PV.Z, literal.x, -; EG-NEXT: NOT_INT T2.W, PV.Z, -; EG-NEXT: LSHR * T4.W, T1.X, 1, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T3.X, T3.W, 1, -; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W, -; EG-NEXT: LSHL T0.W, T1.X, PV.Z, -; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y, +; EG-NEXT: NOT_INT T1.Y, T1.W, +; EG-NEXT: SUB_INT T3.Z, literal.x, T0.Z, +; EG-NEXT: NOT_INT T1.W, T2.W, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR * T2.W, T1.X, 1, +; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) +; EG-NEXT: LSHR T2.X, T1.Z, 1, +; EG-NEXT: ADD_INT T2.Y, T0.Z, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, PS, PV.W, +; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.X, PV.Z, +; EG-NEXT: AND_INT * T2.W, PV.Z, literal.y, ; EG-NEXT: -127(nan), 32(4.484155e-44) ; 
EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W, -; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y, -; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y, +; EG-NEXT: CNDE_INT T3.Y, T6.W, PV.Z, T5.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGT_INT T0.Z, PV.Y, literal.x, +; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, PV.X, T1.Y, +; EG-NEXT: ADD_INT * T0.W, T0.W, literal.y, ; EG-NEXT: 23(3.222986e-44), -127(nan) -; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y, +; EG-NEXT: CNDE_INT T2.X, T4.W, PV.W, T3.W, ; EG-NEXT: SETGT_INT T1.Y, PS, literal.x, -; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y, -; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X, +; EG-NEXT: CNDE_INT T1.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T0.X, ; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y, ; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44) ; EG-NEXT: XOR_INT T0.X, PV.W, PS, -; EG-NEXT: XOR_INT T2.Y, PV.Z, PS, +; EG-NEXT: XOR_INT T3.Y, PV.Z, PS, ; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X, -; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Y, T2.Z, T0.Y, ; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: XOR_INT T0.Y, PV.W, PS, ; EG-NEXT: XOR_INT T0.Z, PV.Z, PS, -; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W, +; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W, ; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W, ; EG-NEXT: SUB_INT T1.Y, PV.W, PS, -; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y, -; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W, +; EG-NEXT: SETGT_INT T1.Z, 0.0, T2.Y, +; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W, ; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W, ; EG-NEXT: SUB_INT T0.Z, PV.W, PS, -; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W, +; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W, ; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0, ; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0, ; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W, @@ -449,170 +448,168 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x ; ; EG-LABEL: fp_to_uint_v4f32_to_v4i64: ; 
EG: ; %bb.0: -; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1 +; EG-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 54, @106, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 6: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W, -; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y, +; EG-NEXT: BFE_UINT T1.W, KC0[3].Z, literal.x, PV.W, +; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y, ; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) -; EG-NEXT: OR_INT T0.Z, PS, literal.x, -; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W, -; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z, -; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44) -; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x, -; EG-NEXT: AND_INT T1.Z, PS, literal.y, -; EG-NEXT: NOT_INT T4.W, PS, -; EG-NEXT: LSHR * T5.W, PV.Z, 1, -; EG-NEXT: -127(nan), 31(4.344025e-44) +; EG-NEXT: OR_INT T2.W, PS, literal.x, +; EG-NEXT: ADD_INT * T3.W, PV.W, literal.y, +; EG-NEXT: 8388608(1.175494e-38), -150(nan) ; EG-NEXT: ADD_INT T0.X, T1.W, literal.x, -; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W, -; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201 -; EG-NEXT: LSHL T3.W, T0.Z, PV.Z, -; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W, -; EG-NEXT: -127(nan), 32(4.484155e-44) -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.X, PS, literal.x, -; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS, -; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y, -; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W, -; EG-NEXT: SETGT_INT * T4.W, PV.X, literal.z, +; EG-NEXT: BFE_UINT T0.Y, KC0[4].X, literal.y, T0.W, +; EG-NEXT: AND_INT T0.Z, PS, 
literal.z, +; EG-NEXT: NOT_INT T4.W, PS, +; EG-NEXT: LSHR * T5.W, PV.W, 1, +; EG-NEXT: -127(nan), 23(3.222986e-44) +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W, +; EG-NEXT: AND_INT T1.Y, T3.W, literal.x, +; EG-NEXT: LSHL T0.Z, T2.W, PV.Z, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T3.W, KC0[4].X, literal.y, +; EG-NEXT: ADD_INT * T4.W, PV.Y, literal.z, ; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38) +; EG-NEXT: -150(nan), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Y, PS, literal.x, +; EG-NEXT: OR_INT T1.Z, PV.W, literal.y, +; EG-NEXT: CNDE_INT T3.W, PV.Y, PV.X, PV.Z, +; EG-NEXT: SETGT_INT * T5.W, T0.X, literal.z, +; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T2.X, PS, 0.0, PV.W, -; EG-NEXT: OR_INT T1.Y, PV.Z, literal.x, -; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y, -; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0, -; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) -; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS, -; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x, -; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x, -; EG-NEXT: NOT_INT T1.W, PV.Z, -; EG-NEXT: LSHR * T3.W, PV.Y, 1, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z, -; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y, -; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y, -; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W, -; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.X, T0.Z, literal.x, -; EG-NEXT: AND_INT T4.Y, PS, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y, -; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y, -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: SUB_INT T2.X, PV.W, PS, -; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0, -; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0, -; EG-NEXT: CNDE_INT T1.W, PV.X, T3.X, T3.Y, BS:VEC_021/SCL_122 -; EG-NEXT: SETGT_INT 
* T2.W, T0.Y, literal.x, +; EG-NEXT: CNDE_INT T3.Y, PS, 0.0, PV.W, +; EG-NEXT: SUB_INT T2.Z, literal.x, T1.W, +; EG-NEXT: LSHL T1.W, PV.Z, PV.Y, +; EG-NEXT: AND_INT * T3.W, T4.W, literal.y, +; EG-NEXT: 150(2.101948e-43), 32(4.484155e-44) +; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, +; EG-NEXT: AND_INT T2.Y, PV.Z, literal.x, +; EG-NEXT: SUB_INT T3.Z, literal.y, T0.Y, +; EG-NEXT: NOT_INT T4.W, T4.W, +; EG-NEXT: LSHR * T6.W, T1.Z, 1, +; EG-NEXT: 32(4.484155e-44), 150(2.101948e-43) +; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, T2.W, T2.Z, +; EG-NEXT: ADD_INT T0.Y, T0.Y, literal.x, +; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W, +; EG-NEXT: BIT_ALIGN_INT T2.W, 0.0, T1.Z, PV.Z, +; EG-NEXT: AND_INT * T4.W, PV.Z, literal.y, +; EG-NEXT: -127(nan), 32(4.484155e-44) +; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T4.Y, T3.W, PV.Z, T1.W, +; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, +; EG-NEXT: CNDE_INT T1.W, T1.Y, T0.Z, 0.0, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, PV.X, 0.0, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W, -; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y, -; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z, -; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z, -; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) +; EG-NEXT: CNDE_INT T2.X, T5.W, PS, PV.W, +; EG-NEXT: ASHR T1.Y, KC0[3].Z, literal.x, +; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X, +; EG-NEXT: ASHR * T2.W, KC0[4].X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W, -; EG-NEXT: XOR_INT T1.Y, PV.W, PS, +; EG-NEXT: XOR_INT T2.Y, PV.W, PS, ; EG-NEXT: XOR_INT T0.Z, PV.Z, PS, -; EG-NEXT: OR_INT T0.W, PV.Y, literal.y, -; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X, -; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38) +; EG-NEXT: XOR_INT T1.W, PV.X, PV.Y, +; EG-NEXT: XOR_INT * T3.W, T3.Y, PV.Y, +; EG-NEXT: SUB_INT T3.Y, PS, T1.Y, +; EG-NEXT: 
SUBB_UINT T1.Z, PV.W, T1.Y, +; EG-NEXT: SUB_INT T3.W, PV.Z, T2.W, +; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T2.W, +; EG-NEXT: SUB_INT T4.Y, PV.W, PS, +; EG-NEXT: SUB_INT T0.Z, PV.Y, PV.Z, +; EG-NEXT: BFE_UINT T3.W, KC0[3].Y, literal.x, T0.W, +; EG-NEXT: AND_INT * T4.W, KC0[3].Y, literal.y, +; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38) +; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X, +; EG-NEXT: ADD_INT T3.Y, PV.W, literal.x, +; EG-NEXT: OR_INT T1.Z, PS, literal.y, +; EG-NEXT: BFE_UINT T0.W, KC0[3].W, literal.z, T0.W, +; EG-NEXT: ADD_INT * T4.W, PV.W, literal.w, +; EG-NEXT: -127(nan), 8388608(1.175494e-38) +; EG-NEXT: 23(3.222986e-44), -150(nan) +; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x, +; EG-NEXT: ADD_INT T5.Y, PV.W, literal.y, +; EG-NEXT: SUB_INT T2.Z, literal.z, T3.W, +; EG-NEXT: NOT_INT T3.W, PS, +; EG-NEXT: LSHR * T5.W, PV.Z, 1, +; EG-NEXT: 8388607(1.175494e-38), -150(nan) ; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x, -; EG-NEXT: AND_INT T3.Y, PS, literal.y, -; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS, -; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W, -; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W, -; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44) -; EG-NEXT: SUB_INT T5.X, PV.W, PS, -; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y, -; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0, -; EG-NEXT: OR_INT T1.W, PV.X, literal.x, -; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y, -; EG-NEXT: 8388608(1.175494e-38), -150(nan) -; EG-NEXT: ADD_INT T4.X, T3.X, literal.x, -; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X, -; EG-NEXT: AND_INT T2.Z, PS, literal.z, -; EG-NEXT: NOT_INT T4.W, PS, -; EG-NEXT: LSHR * T5.W, PV.W, 1, -; EG-NEXT: -127(nan), 150(2.101948e-43) -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W, -; EG-NEXT: LSHL T4.Y, T1.W, PV.Z, -; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122 -; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x, +; EG-NEXT: 
BIT_ALIGN_INT T2.X, 0.0, PS, PV.W, +; EG-NEXT: AND_INT T6.Y, PV.Z, literal.x, +; EG-NEXT: AND_INT T3.Z, PV.Y, literal.y, +; EG-NEXT: OR_INT T3.W, PV.X, literal.z, +; EG-NEXT: AND_INT * T5.W, T4.W, literal.y, +; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) +; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, T1.Z, T2.Z, +; EG-NEXT: LSHL T7.Y, T1.Z, PS, +; EG-NEXT: AND_INT T1.Z, T4.W, literal.x, +; EG-NEXT: LSHL T4.W, PV.W, PV.Z, +; EG-NEXT: AND_INT * T5.W, T5.Y, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T6.X, T1.X, literal.x, -; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0, -; EG-NEXT: -150(nan), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 108: -; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y, -; EG-NEXT: SETGT_INT * T3.W, T4.X, literal.x, +; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T8.Y, PV.Z, PV.Y, 0.0, +; EG-NEXT: CNDE_INT * T2.Z, T6.Y, PV.X, 0.0, +; EG-NEXT: ALU clause starting at 106: +; EG-NEXT: CNDE_INT T6.W, T1.Z, T2.X, T7.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SETGT_INT * T7.W, T3.Y, literal.x, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W, -; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z, -; EG-NEXT: AND_INT T2.Z, T6.X, literal.x, -; EG-NEXT: NOT_INT T1.W, T6.X, -; EG-NEXT: LSHR * T3.W, T0.W, 1, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x, -; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y, -; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W, -; EG-NEXT: LSHL T0.W, T0.W, PV.Z, -; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, +; EG-NEXT: CNDE_INT T1.X, PS, 0.0, PV.W, +; EG-NEXT: CNDE_INT T6.Y, PS, T2.Z, T8.Y, +; EG-NEXT: SUB_INT T1.Z, literal.x, T0.W, +; EG-NEXT: NOT_INT T6.W, T5.Y, +; EG-NEXT: LSHR * T7.W, T3.W, 1, +; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00) +; EG-NEXT: ASHR T2.X, KC0[3].Y, literal.x, +; EG-NEXT: ADD_INT T5.Y, T0.W, literal.y, +; EG-NEXT: BIT_ALIGN_INT T2.Z, 
0.0, PS, PV.W, +; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T3.W, PV.Z, +; EG-NEXT: AND_INT * T3.W, PV.Z, literal.z, ; EG-NEXT: 31(4.344025e-44), -127(nan) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0, -; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W, -; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x, -; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X, -; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X, +; EG-NEXT: CNDE_INT T4.X, PS, PV.W, 0.0, +; EG-NEXT: CNDE_INT T7.Y, T5.W, PV.Z, T4.W, +; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, T6.Y, PV.X, +; EG-NEXT: XOR_INT * T3.W, T1.X, PV.X, ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: SUB_INT T3.X, PS, T7.X, -; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X, -; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y, -; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X, -; EG-NEXT: ASHR * T3.W, KC0[3].W, literal.x, +; EG-NEXT: SUB_INT T1.X, PS, T2.X, +; EG-NEXT: SUBB_UINT T6.Y, PV.W, T2.X, +; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y, +; EG-NEXT: CNDE_INT T3.W, PV.Z, PV.X, T3.X, +; EG-NEXT: ASHR * T4.W, KC0[3].W, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: XOR_INT T1.X, PV.W, PS, -; EG-NEXT: XOR_INT T5.Y, PV.Z, PS, -; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y, -; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122 -; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0, -; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X, +; EG-NEXT: XOR_INT T3.X, PV.W, PS, +; EG-NEXT: XOR_INT T7.Y, PV.Z, PS, +; EG-NEXT: SUB_INT T1.Z, PV.X, PV.Y, +; EG-NEXT: SETGT_INT T3.W, 0.0, T3.Y, +; EG-NEXT: CNDE_INT * T6.W, T0.X, T0.Z, 0.0, +; EG-NEXT: SETGT_INT T1.X, 0.0, T0.Y, ; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0, -; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122 -; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W, -; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W, -; EG-NEXT: SUB_INT T3.X, PV.W, PS, -; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y, -; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0, -; EG-NEXT: SUB_INT T0.W, T0.W, T7.X, BS:VEC_021/SCL_122 -; EG-NEXT: CNDE_INT * T4.W, 
PV.X, T2.X, 0.0, -; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0, -; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0, -; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y, -; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, +; EG-NEXT: SUB_INT T0.Z, T1.W, T1.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T1.W, PV.Y, T4.W, +; EG-NEXT: SUBB_UINT * T5.W, PV.X, T4.W, +; EG-NEXT: SUB_INT T4.X, PV.W, PS, +; EG-NEXT: SETGT_INT T0.Y, 0.0, T5.Y, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T6.Z, T0.X, PV.Z, 0.0, +; EG-NEXT: SUB_INT T0.W, T0.W, T2.X, +; EG-NEXT: CNDE_INT * T1.W, PV.X, T4.Y, 0.0, +; EG-NEXT: CNDE_INT T6.X, T3.W, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.X, 0.0, +; EG-NEXT: SUB_INT T0.W, T2.Y, T2.W, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0, -; EG-NEXT: SUB_INT * T0.W, T1.X, T3.W, BS:VEC_120/SCL_212 -; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0, +; EG-NEXT: CNDE_INT T1.Z, T1.X, PV.W, 0.0, +; EG-NEXT: SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT T1.X, T0.Y, PV.W, 0.0, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR * T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T2.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %conv = fptoui <4 x float> %x to <4 x i64> store <4 x i64> %conv, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 7a0450761e1f..3a867879bb80 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -228,23 +228,23 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z, ; R600-NEXT: -127(nan), 254(3.559298e-43) ; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x, -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y, +; R600-NEXT: MUL_IEEE T3.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T0.Y, T1.X, literal.y, ; 
R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z, ; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X, ; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z, -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38) ; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00) ; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W, -; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W, -; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.x, +; R600-NEXT: MUL_IEEE T3.W, PV.Y, literal.x, +; R600-NEXT: CNDE_INT * T0.W, T0.W, PV.X, T2.W, ; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T1.Z, T1.Y, T3.X, PS, -; R600-NEXT: CNDE_INT T0.W, T1.W, PV.W, T1.X, +; R600-NEXT: CNDE_INT T1.Z, T1.W, PS, T1.X, +; R600-NEXT: CNDE_INT T0.W, T1.Y, T0.Y, PV.W, ; R600-NEXT: LSHL * T1.W, PV.Z, literal.x, ; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00) ; R600-NEXT: ADD_INT T1.W, PS, literal.x, -; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.W, PV.Z, +; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.Z, PV.W, ; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) ; R600-NEXT: MUL_IEEE T0.W, PS, PV.W, ; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].Z, @@ -258,65 +258,63 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; CM-LABEL: s_exp_f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 64, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 62, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: ; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, ; CM-NEXT: -4096(nan), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, ; CM-NEXT: ADD * T1.W, KC0[2].Z, -PV.W, -; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; CM-NEXT: RNDNE * T2.W, PV.Z, -; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; CM-NEXT: TRUNC T2.Z, PV.W, +; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, +; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y, +; CM-NEXT: 
967029397(3.122284e-04), 1069064192(1.442383e+00) +; CM-NEXT: RNDNE T1.Z, PV.W, ; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z, ; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W, -; CM-NEXT: ADD T0.Z, T0.Z, -T2.W, -; CM-NEXT: FLT_TO_INT * T0.W, PV.Z, +; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W, +; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212 ; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x, -; CM-NEXT: ADD * T1.W, PV.Z, PV.Y, +; CM-NEXT: TRUNC T1.Z, T1.Z, +; CM-NEXT: ADD * T0.W, PV.W, PV.Z, +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: FLT_TO_INT T0.Z, T1.Z, +; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y, +; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -330(nan) ; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; CM-NEXT: EXP_IEEE T0.X, T1.W, -; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x, -; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y, -; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z, -; CM-NEXT: 2130706432(1.701412e+38), -254(nan) -; CM-NEXT: -330(nan), 0(0.000000e+00) -; CM-NEXT: ADD_INT T1.X, T0.W, literal.x, -; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y, -; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w, -; CM-NEXT: -127(nan), 204(2.858649e-43) +; CM-NEXT: ADD_INT T1.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w, +; CM-NEXT: -254(nan), 204(2.858649e-43) ; CM-NEXT: 102(1.429324e-43), -229(nan) -; 
CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y, -; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z, -; CM-NEXT: 254(3.559298e-43), -127(nan) -; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W, -; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z, -; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y, -; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43) +; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x, +; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y, +; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, +; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x, +; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z, +; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X, +; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43) ; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W, -; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x, +; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W, ; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T0.Y, T2.X, T0.Y, PV.W, -; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.X, -; CM-NEXT: LSHL * T1.W, PV.Y, literal.x, +; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.W, T0.X, +; CM-NEXT: CNDE_INT T0.Z, T2.Y, T3.X, PV.Z, +; CM-NEXT: LSHL * T0.W, PV.Y, literal.x, ; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) ; CM-NEXT: ADD_INT T1.Z, PV.W, literal.x, -; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Z, PV.Y, +; CM-NEXT: CNDE_INT * T0.W, T3.W, PV.Y, PV.Z, ; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) ; CM-NEXT: MUL_IEEE T0.Z, PV.W, PV.Z, ; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, @@ -610,105 +608,105 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; R600-NEXT: AND_INT * T0.W, KC0[3].X, literal.x, ; R600-NEXT: 
-4096(nan), 0(0.000000e+00) ; R600-NEXT: ADD * T1.W, KC0[3].X, -PV.W, -; R600-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.y, -; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.z, -; R600-NEXT: -4096(nan), 967029397(3.122284e-04) -; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; R600-NEXT: RNDNE T1.Z, PS, +; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x, +; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y, +; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) +; R600-NEXT: RNDNE T0.Z, PS, ; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W, -; R600-NEXT: ADD * T2.W, KC0[2].W, -PV.Z, -; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: MUL_IEEE T2.Z, T0.Z, literal.y, +; R600-NEXT: AND_INT * T2.W, KC0[2].W, literal.y, +; R600-NEXT: 1069064192(1.442383e+00), -4096(nan) +; R600-NEXT: ADD T1.Z, KC0[2].W, -PS, ; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W, ; R600-NEXT: ADD * T1.W, T3.W, -PV.Z, +; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) +; R600-NEXT: ADD T2.Z, PS, PV.W, +; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x, +; R600-NEXT: MUL_IEEE * T1.W, T2.W, literal.y, ; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) -; R600-NEXT: ADD T3.Z, PS, PV.W, -; R600-NEXT: RNDNE T0.W, PV.Z, -; R600-NEXT: MULADD_IEEE * T1.W, T2.W, literal.x, PV.Y, BS:VEC_021/SCL_122 -; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; R600-NEXT: TRUNC T0.Y, T1.Z, -; R600-NEXT: MULADD_IEEE T0.Z, T0.Z, literal.x, PS, BS:VEC_120/SCL_212 -; R600-NEXT: ADD T1.W, T2.Z, -PV.W, BS:VEC_201 +; R600-NEXT: RNDNE T0.Y, PS, +; R600-NEXT: MULADD_IEEE T1.Z, T1.Z, literal.x, PV.W, +; R600-NEXT: TRUNC T0.W, T0.Z, BS:VEC_120/SCL_212 ; R600-NEXT: EXP_IEEE * T0.X, PV.Z, -; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; R600-NEXT: ADD T0.Z, PV.W, PV.Z, -; R600-NEXT: FLT_TO_INT T1.W, PV.Y, -; R600-NEXT: MUL_IEEE * T2.W, PS, literal.x, -; R600-NEXT: 2130706432(1.701412e+38), 
0(0.000000e+00) -; R600-NEXT: MUL_IEEE T1.Z, PS, literal.x, -; R600-NEXT: SETGT_UINT T3.W, PV.W, literal.y, -; R600-NEXT: EXP_IEEE * T0.Y, PV.Z, -; R600-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43) -; R600-NEXT: CNDE_INT T1.X, PV.W, T2.W, PV.Z, -; R600-NEXT: MUL_IEEE T1.Y, PS, literal.x, -; R600-NEXT: MAX_INT T0.Z, T1.W, literal.y, -; R600-NEXT: MIN_INT T2.W, T1.W, literal.z, -; R600-NEXT: TRUNC * T0.W, T0.W, +; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) +; R600-NEXT: FLT_TO_INT T1.Y, PV.W, +; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x, +; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.y, PV.Z, +; R600-NEXT: ADD * T1.W, T1.W, -PV.Y, +; R600-NEXT: 209715200(1.972152e-31), 967029397(3.122284e-04) +; R600-NEXT: ADD T1.Z, PS, PV.W, +; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x, +; R600-NEXT: SETGT_UINT * T1.W, PV.Y, literal.y, +; R600-NEXT: 209715200(1.972152e-31), -229(nan) +; R600-NEXT: CNDE_INT T0.Z, PS, PV.W, T0.Z, +; R600-NEXT: SETGT_INT T0.W, T1.Y, literal.x, +; R600-NEXT: EXP_IEEE * T1.X, PV.Z, +; R600-NEXT: -127(nan), 0(0.000000e+00) +; R600-NEXT: CNDE_INT T0.Z, PV.W, PV.Z, T0.X, +; R600-NEXT: MAX_INT T2.W, T1.Y, literal.x, +; R600-NEXT: MUL_IEEE * T3.W, PS, literal.y, +; R600-NEXT: -330(nan), 209715200(1.972152e-31) +; R600-NEXT: MUL_IEEE T2.X, PS, literal.x, +; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y, +; R600-NEXT: ADD_INT T1.Z, T1.Y, literal.z, +; R600-NEXT: MIN_INT T2.W, T1.Y, literal.w, +; R600-NEXT: TRUNC * T4.W, T0.Y, +; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43) +; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43) +; R600-NEXT: FLT_TO_INT T3.X, PS, +; R600-NEXT: ADD_INT T0.Y, PV.W, literal.x, +; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y, +; R600-NEXT: SETGT_UINT T2.W, T1.Y, literal.z, +; R600-NEXT: CNDE_INT * T1.W, T1.W, PV.Y, PV.Z, +; R600-NEXT: -254(nan), -127(nan) +; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T4.X, T1.X, literal.x, +; R600-NEXT: MUL_IEEE T2.Y, T0.X, literal.x, BS:VEC_120/SCL_212 +; R600-NEXT: 
CNDE_INT T1.Z, T0.W, PS, T1.Y, +; R600-NEXT: CNDE_INT T0.W, PV.W, PV.Z, PV.Y, +; R600-NEXT: MAX_INT * T1.W, PV.X, literal.y, ; R600-NEXT: 2130706432(1.701412e+38), -330(nan) -; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; R600-NEXT: FLT_TO_INT T2.X, PS, -; R600-NEXT: ADD_INT T2.Y, PV.W, literal.x, -; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y, -; R600-NEXT: ADD_INT T0.W, T1.W, literal.z, -; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.w, -; R600-NEXT: -254(nan), 204(2.858649e-43) -; R600-NEXT: 102(1.429324e-43), -229(nan) -; R600-NEXT: ADD_INT T3.X, T1.W, literal.x, -; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W, -; R600-NEXT: SETGT_INT T0.Z, T1.W, literal.x, -; R600-NEXT: MUL_IEEE T0.W, T0.X, literal.y, -; R600-NEXT: MUL_IEEE * T4.W, T0.Y, literal.y, -; R600-NEXT: -127(nan), 209715200(1.972152e-31) -; R600-NEXT: MUL_IEEE T4.X, PS, literal.x, -; R600-NEXT: MUL_IEEE T4.Y, PV.W, literal.x, -; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, T1.W, -; R600-NEXT: CNDE_INT T3.W, T3.W, PV.X, T2.Y, -; R600-NEXT: MAX_INT * T5.W, T2.X, literal.y, -; R600-NEXT: 209715200(1.972152e-31), -330(nan) -; R600-NEXT: SETGT_INT T3.X, T1.W, literal.x, -; R600-NEXT: ADD_INT T2.Y, PS, literal.y, -; R600-NEXT: ADD_INT T2.Z, T2.X, literal.z, -; R600-NEXT: SETGT_UINT * T1.W, T2.X, literal.w, +; R600-NEXT: SETGT_INT T0.X, T1.Y, literal.x, +; R600-NEXT: ADD_INT T0.Y, PS, literal.y, +; R600-NEXT: ADD_INT T2.Z, T3.X, literal.z, +; R600-NEXT: SETGT_UINT * T1.W, T3.X, literal.w, ; R600-NEXT: 127(1.779649e-43), 204(2.858649e-43) ; R600-NEXT: 102(1.429324e-43), -229(nan) -; R600-NEXT: MIN_INT * T5.W, T2.X, literal.x, +; R600-NEXT: MIN_INT * T4.W, T3.X, literal.x, ; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00) ; R600-NEXT: ADD_INT T5.X, PV.W, literal.x, -; R600-NEXT: ADD_INT T3.Y, T2.X, literal.y, -; R600-NEXT: SETGT_UINT T3.Z, T2.X, literal.z, -; R600-NEXT: CNDE_INT T5.W, T1.W, T2.Y, T2.Z, -; R600-NEXT: SETGT_INT * T6.W, T2.X, literal.y, +; R600-NEXT: ADD_INT T1.Y, T3.X, literal.y, +; R600-NEXT: SETGT_UINT 
T3.Z, T3.X, literal.z, +; R600-NEXT: CNDE_INT T4.W, T1.W, T0.Y, T2.Z, +; R600-NEXT: SETGT_INT * T5.W, T3.X, literal.y, ; R600-NEXT: -254(nan), -127(nan) ; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T2.X, -; R600-NEXT: CNDE_INT T2.Y, PV.Z, PV.Y, PV.X, -; R600-NEXT: SETGT_INT T2.Z, T2.X, literal.x, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE_INT T3.W, T3.X, T1.Z, T3.W, BS:VEC_021/SCL_122 -; R600-NEXT: CNDE_INT * T0.W, T2.W, T4.Y, T0.W, -; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T0.X, T0.Z, PS, T0.X, -; R600-NEXT: LSHL T3.Y, PV.W, literal.x, -; R600-NEXT: CNDE_INT T0.Z, PV.Z, PV.X, PV.Y, -; R600-NEXT: CNDE_INT T0.W, T1.W, T4.X, T4.W, -; R600-NEXT: MUL_IEEE * T1.W, T1.Y, literal.y, +; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T3.X, +; R600-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, PV.X, +; R600-NEXT: SETGT_INT T2.Z, T3.X, literal.x, +; R600-NEXT: CNDE_INT T0.W, T0.X, T1.Z, T0.W, BS:VEC_120/SCL_212 +; R600-NEXT: MUL_IEEE * T4.W, T2.Y, literal.y, +; R600-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38) +; R600-NEXT: CNDE_INT T3.X, T2.W, T2.Y, PS, BS:VEC_120/SCL_212 +; R600-NEXT: LSHL T1.Y, PV.W, literal.x, +; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y, +; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y, +; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W, ; R600-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38) -; R600-NEXT: CNDE_INT T2.X, T3.Z, T1.Y, PS, -; R600-NEXT: CNDE_INT T0.Y, T6.W, PV.W, T0.Y, -; R600-NEXT: LSHL T0.Z, PV.Z, literal.x, +; R600-NEXT: CNDE_INT T1.X, T5.W, PS, T1.X, BS:VEC_021/SCL_122 +; R600-NEXT: CNDE_INT T0.Y, T3.Z, T4.X, PV.W, BS:VEC_201 +; R600-NEXT: LSHL T1.Z, PV.Z, literal.x, ; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y, -; R600-NEXT: CNDE_INT * T1.W, T3.X, PV.X, T1.X, +; R600-NEXT: CNDE_INT * T1.W, T0.X, T0.Z, PV.X, ; R600-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00) ; R600-NEXT: MUL_IEEE T1.Y, PS, PV.W, -; R600-NEXT: SETGT T1.Z, literal.x, KC0[3].X, +; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].X, ; 
R600-NEXT: ADD_INT * T0.W, PV.Z, literal.y, ; R600-NEXT: -1026650416(-1.032789e+02), 1065353216(1.000000e+00) ; R600-NEXT: ALU clause starting at 101: -; R600-NEXT: CNDE_INT * T1.W, T2.Z, T0.Y, T2.X, +; R600-NEXT: CNDE_INT * T1.W, T2.Z, T1.X, T0.Y, ; R600-NEXT: MUL_IEEE T0.Y, PV.W, T0.W, -; R600-NEXT: SETGT T0.Z, literal.x, KC0[2].W, -; R600-NEXT: CNDE T0.W, T1.Z, T1.Y, 0.0, +; R600-NEXT: SETGT T1.Z, literal.x, KC0[2].W, +; R600-NEXT: CNDE T0.W, T0.Z, T1.Y, 0.0, ; R600-NEXT: SETGT * T1.W, KC0[3].X, literal.y, ; R600-NEXT: -1026650416(-1.032789e+02), 1118925336(8.872284e+01) ; R600-NEXT: CNDE T1.Y, PS, PV.W, literal.x, @@ -721,118 +719,116 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; CM-LABEL: s_exp_v2f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 100, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: ALU 18, @105, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 98, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 18, @103, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: ALU clause starting at 4: ; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, ; CM-NEXT: -4096(nan), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, ; CM-NEXT: ADD * T1.W, KC0[2].W, -PV.W, +; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y, +; CM-NEXT: AND_INT * T2.W, KC0[3].X, literal.z, +; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) +; CM-NEXT: -4096(nan), 0(0.000000e+00) +; CM-NEXT: ADD T1.Y, KC0[3].X, -PV.W, +; CM-NEXT: RNDNE T1.Z, PV.Z, +; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Y, ; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; CM-NEXT: RNDNE * T2.W, PV.Z, -; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; CM-NEXT: TRUNC T0.Y, PV.W, -; CM-NEXT: AND_INT T2.Z, KC0[3].X, literal.x, -; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.y, PV.Z, -; CM-NEXT: -4096(nan), 1069064192(1.442383e+00) ; CM-NEXT: MULADD_IEEE T0.X, T0.W, 
literal.x, PV.W, -; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y, -; CM-NEXT: FLT_TO_INT T1.Z, PV.Y, -; CM-NEXT: ADD * T0.W, KC0[3].X, -PV.Z, +; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z, +; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212 ; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) -; CM-NEXT: ADD T1.X, T0.Z, -T2.W, -; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, -; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y, -; CM-NEXT: RNDNE * T1.W, PV.Y, -; CM-NEXT: 967029397(3.122284e-04), -330(nan) -; CM-NEXT: TRUNC T2.X, PV.W, -; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x, -; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.Y, -; CM-NEXT: ADD * T0.W, PV.X, T0.X, -; CM-NEXT: 204(2.858649e-43), 1069064192(1.442383e+00) -; CM-NEXT: EXP_IEEE T0.X, T0.W, -; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, -; CM-NEXT: ADD_INT T1.X, T1.Z, literal.x, -; CM-NEXT: MULADD_IEEE T0.Y, T2.Z, literal.y, T0.Z, BS:VEC_102/SCL_221 -; CM-NEXT: ADD T0.Z, T1.Y, -T1.W, -; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z, -; CM-NEXT: 102(1.429324e-43), 967029397(3.122284e-04) -; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_UINT T3.X, T1.Z, literal.x, -; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y, -; CM-NEXT: SETGT_UINT T2.Z, T1.Z, literal.z, -; CM-NEXT: ADD * T1.W, PV.Z, PV.Y, -; CM-NEXT: -229(nan), 2130706432(1.701412e+38) -; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; CM-NEXT: TRUNC T1.X, T1.Z, +; CM-NEXT: RNDNE T2.Y, PV.W, +; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z, +; CM-NEXT: ADD * T1.W, PV.Y, PV.X, +; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T1.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z, +; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212 +; CM-NEXT: FLT_TO_INT T0.Z, T1.X, 
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y, +; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31) +; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x, +; CM-NEXT: SETGT_UINT T1.Y, PV.Z, literal.y, +; CM-NEXT: TRUNC T1.Z, T2.Y, +; CM-NEXT: ADD * T1.W, PV.Y, PV.X, +; CM-NEXT: 209715200(1.972152e-31), -229(nan) ; CM-NEXT: EXP_IEEE T0.X (MASKED), T1.W, ; CM-NEXT: EXP_IEEE T0.Y, T1.W, ; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, ; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: CNDE_INT T4.X, T2.Z, T0.W, T1.Y, -; CM-NEXT: CNDE_INT T1.Y, T3.X, T2.Y, T1.X, -; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212 -; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x, -; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_INT T1.X, T1.Z, literal.x, -; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y, -; CM-NEXT: MUL_IEEE T3.Z, PV.W, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w, -; CM-NEXT: -127(nan), 209715200(1.972152e-31) -; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T2.X, PV.W, T0.W, PV.Z, +; CM-NEXT: FLT_TO_INT T2.X, T1.Z, +; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x, +; CM-NEXT: CNDE_INT T1.Z, T1.Y, T1.X, T0.W, +; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: 209715200(1.972152e-31), -127(nan) +; CM-NEXT: CNDE_INT T1.X, PV.W, PV.Z, T0.X, ; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x, -; CM-NEXT: CNDE_INT T3.Z, PV.X, T1.Y, T1.Z, -; CM-NEXT: MAX_INT * T0.W, T0.Z, literal.y, -; CM-NEXT: 209715200(1.972152e-31), -330(nan) -; CM-NEXT: ADD_INT T5.X, PV.W, literal.x, -; CM-NEXT: ADD_INT T1.Y, T0.Z, literal.y, -; CM-NEXT: SETGT_UINT T4.Z, T0.Z, literal.z, -; CM-NEXT: MUL_IEEE * T0.W, T0.Y, literal.w, +; CM-NEXT: SETGT_UINT T1.Z, PV.X, literal.y, +; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -229(nan) +; CM-NEXT: -330(nan), 0(0.000000e+00) +; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T4.Y, T0.Z, literal.y, +; CM-NEXT: CNDE_INT T2.Z, PV.Z, PV.Y, T2.Y, +; CM-NEXT: 
SETGT_INT * T1.W, T2.X, literal.z, ; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) -; CM-NEXT: -229(nan), 209715200(1.972152e-31) -; CM-NEXT: MUL_IEEE T6.X, PV.W, literal.x, -; CM-NEXT: MIN_INT T4.Y, T0.Z, literal.y, -; CM-NEXT: CNDE_INT T5.Z, PV.Z, PV.X, PV.Y, -; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.z, -; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z, -; CM-NEXT: MIN_INT T1.Y, T1.Z, literal.x, -; CM-NEXT: ADD_INT T5.Z, PV.Y, literal.y, -; CM-NEXT: ADD_INT * T3.W, T0.Z, literal.z, BS:VEC_120/SCL_212 -; CM-NEXT: 381(5.338947e-43), -254(nan) ; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T7.X, T1.W, PV.W, PV.Z, -; CM-NEXT: SETGT_INT T4.Y, T0.Z, literal.x, -; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y, -; CM-NEXT: ADD_INT * T1.W, T1.Z, literal.z, BS:VEC_120/SCL_212 +; CM-NEXT: CNDE_INT T4.X, PV.W, PV.Z, T0.Y, +; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.x, +; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, PV.Y, +; CM-NEXT: 2130706432(1.701412e+38), -330(nan) +; CM-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.Z, +; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.x, +; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y, +; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z, +; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) +; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y, +; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z, +; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z, +; CM-NEXT: -254(nan), -127(nan) +; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T5.X, T0.Y, literal.x, +; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T2.X, +; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X, +; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43) +; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x, +; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y, +; CM-NEXT: ADD_INT T2.Z, 
T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w, ; CM-NEXT: 127(1.779649e-43), -254(nan) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T8.X, T2.Z, PV.W, PV.Z, -; CM-NEXT: SETGT_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: CNDE_INT T0.Z, PV.Y, T5.X, PV.X, -; CM-NEXT: CNDE_INT * T0.W, T4.Z, T6.X, T0.W, BS:VEC_201 -; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T5.X, T2.W, PV.W, T0.Y, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, PV.Y, +; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Z, PV.X, T0.Y, T1.Z, +; CM-NEXT: MUL_IEEE * T1.W, T5.X, literal.y, +; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T5.X, T3.Z, T5.X, PV.W, ; CM-NEXT: LSHL T0.Y, PV.Z, literal.x, -; CM-NEXT: CNDE_INT T0.Z, PV.Y, T3.Z, PV.X, -; CM-NEXT: CNDE_INT * T0.W, T3.X, T3.Y, T2.Y, BS:VEC_201 -; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T0.X, T1.X, PV.W, T0.X, +; CM-NEXT: CNDE_INT T0.Z, PV.Y, T0.X, PV.X, BS:VEC_021/SCL_122 +; CM-NEXT: MUL_IEEE * T1.W, T2.Y, literal.y, +; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T0.X, T0.W, T2.Y, PV.W, ; CM-NEXT: LSHL T2.Y, PV.Z, literal.x, ; CM-NEXT: ADD_INT * T0.Z, PV.Y, literal.y, ; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00) -; CM-NEXT: ALU clause starting at 105: -; CM-NEXT: CNDE_INT * T0.W, T4.Y, T5.X, T2.X, -; CM-NEXT: MUL_IEEE T1.X, PV.W, T0.Z, +; CM-NEXT: ALU clause starting at 103: +; CM-NEXT: CNDE_INT * T0.W, T2.X, T4.X, T5.X, +; CM-NEXT: MUL_IEEE T2.X, PV.W, T0.Z, ; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].X, ; CM-NEXT: ADD_INT T0.Z, T2.Y, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T1.Y, T0.X, T4.X, BS:VEC_120/SCL_212 +; CM-NEXT: CNDE_INT * T0.W, T1.Y, T1.X, T0.X, BS:VEC_120/SCL_212 ; CM-NEXT: -1026650416(-1.032789e+02), 1065353216(1.000000e+00) ; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, ; CM-NEXT: SETGT T1.Y, literal.x, KC0[2].W, @@ -1215,8 +1211,8 @@ define 
amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; R600-LABEL: s_exp_v3f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 100, @6, KC0[CB0:0-32], KC1[] -; R600-NEXT: ALU 69, @107, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 69, @106, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END @@ -1224,69 +1220,68 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; R600-NEXT: ALU clause starting at 6: ; R600-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x, ; R600-NEXT: -4096(nan), 0(0.000000e+00) -; R600-NEXT: ADD T1.W, KC0[3].Y, -PV.W, -; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x, +; R600-NEXT: MUL_IEEE T1.W, PV.W, literal.x, +; R600-NEXT: ADD * T2.W, KC0[3].Y, -PV.W, ; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; R600-NEXT: RNDNE T3.W, PS, -; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x, +; R600-NEXT: RNDNE * T3.W, PV.W, +; R600-NEXT: TRUNC T4.W, PV.W, +; R600-NEXT: MUL_IEEE * T5.W, T2.W, literal.x, ; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS, -; R600-NEXT: TRUNC * T4.W, PV.W, +; R600-NEXT: MULADD_IEEE T2.W, T2.W, literal.x, PS, +; R600-NEXT: FLT_TO_INT * T4.W, PV.W, ; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; R600-NEXT: FLT_TO_INT T0.Z, PS, -; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W, -; R600-NEXT: ADD * T1.W, T2.W, -T3.W, -; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; R600-NEXT: ADD T0.W, PS, PV.W, -; R600-NEXT: MAX_INT * T1.W, PV.Z, literal.x, -; R600-NEXT: -330(nan), 0(0.000000e+00) -; R600-NEXT: ADD_INT T0.Y, PS, literal.x, -; R600-NEXT: ADD_INT T1.Z, T0.Z, literal.y, -; R600-NEXT: SETGT_UINT T1.W, T0.Z, literal.z, -; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: MAX_INT T0.Z, PS, literal.x, +; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.W, +; R600-NEXT: ADD * T1.W, 
T1.W, -T3.W, +; R600-NEXT: -330(nan), 967029397(3.122284e-04) +; R600-NEXT: ADD T0.Y, PS, PV.W, +; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.x, +; R600-NEXT: ADD_INT T0.W, T4.W, literal.y, +; R600-NEXT: SETGT_UINT * T1.W, T4.W, literal.z, ; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43) ; R600-NEXT: -229(nan), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, -; R600-NEXT: SETGT_INT T0.W, T0.Z, literal.x, -; R600-NEXT: MUL_IEEE * T2.W, PS, literal.y, -; R600-NEXT: -127(nan), 209715200(1.972152e-31) -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z, -; R600-NEXT: MIN_INT T3.W, T0.Z, literal.y, -; R600-NEXT: AND_INT * T4.W, KC0[3].W, literal.z, -; R600-NEXT: 209715200(1.972152e-31), 381(5.338947e-43) -; R600-NEXT: -4096(nan), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T1.X, T0.X, literal.x, -; R600-NEXT: ADD T1.Y, KC0[3].W, -PS, -; R600-NEXT: ADD_INT T2.Z, PV.W, literal.y, -; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z, -; R600-NEXT: SETGT_UINT * T5.W, T0.Z, literal.w, -; R600-NEXT: 2130706432(1.701412e+38), -254(nan) +; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W, +; R600-NEXT: SETGT_INT T0.W, T4.W, literal.x, +; R600-NEXT: EXP_IEEE * T0.X, PV.Y, +; R600-NEXT: -127(nan), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T1.X, PS, literal.x, +; R600-NEXT: CNDE_INT T0.Y, PV.W, PV.Z, T4.W, +; R600-NEXT: MIN_INT T0.Z, T4.W, literal.y, +; R600-NEXT: AND_INT T2.W, KC0[3].W, literal.z, +; R600-NEXT: MUL_IEEE * T3.W, PS, literal.w, +; R600-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43) +; R600-NEXT: -4096(nan), 209715200(1.972152e-31) +; R600-NEXT: MUL_IEEE T2.X, PS, literal.x, +; R600-NEXT: ADD T1.Y, KC0[3].W, -PV.W, +; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y, +; R600-NEXT: ADD_INT T5.W, T4.W, literal.z, +; R600-NEXT: SETGT_UINT * T6.W, T4.W, literal.w, +; R600-NEXT: 209715200(1.972152e-31), -254(nan) ; R600-NEXT: -127(nan), 254(3.559298e-43) -; R600-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Z, -; R600-NEXT: SETGT_INT T2.Y, T0.Z, literal.x, +; 
R600-NEXT: CNDE_INT T3.X, PS, PV.W, PV.Z, +; R600-NEXT: SETGT_INT T2.Y, T4.W, literal.x, ; R600-NEXT: MUL_IEEE T0.Z, PV.Y, literal.y, -; R600-NEXT: MUL_IEEE T3.W, T4.W, literal.z, -; R600-NEXT: MUL_IEEE * T6.W, PV.X, literal.w, +; R600-NEXT: MUL_IEEE * T4.W, T2.W, literal.z, BS:VEC_120/SCL_212 ; R600-NEXT: 127(1.779649e-43), 967029397(3.122284e-04) -; R600-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38) -; R600-NEXT: CNDE_INT T1.X, T5.W, T1.X, PS, BS:VEC_120/SCL_212 -; R600-NEXT: RNDNE T3.Y, PV.W, -; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z, -; R600-NEXT: CNDE_INT T5.W, PV.Y, T1.Z, PV.X, -; R600-NEXT: CNDE_INT * T1.W, T1.W, T0.Y, T2.W, ; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T0.X, T0.W, PS, T0.X, +; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W, +; R600-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.X, BS:VEC_021/SCL_122 +; R600-NEXT: RNDNE T3.Y, T4.W, BS:VEC_120/SCL_212 +; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, T0.Z, +; R600-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T3.X, BS:VEC_120/SCL_212 +; R600-NEXT: MUL_IEEE * T1.W, T1.X, literal.y, +; R600-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38) +; R600-NEXT: CNDE_INT T1.X, T6.W, T1.X, PS, ; R600-NEXT: LSHL T0.Y, PV.W, literal.x, ; R600-NEXT: AND_INT T1.Z, KC0[3].Z, literal.y, -; R600-NEXT: MULADD_IEEE T0.W, T4.W, literal.z, PV.Z, BS:VEC_120/SCL_212 -; R600-NEXT: ADD * T1.W, T3.W, -PV.Y, +; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.z, PV.Z, BS:VEC_120/SCL_212 +; R600-NEXT: ADD * T1.W, T4.W, -PV.Y, ; R600-NEXT: 23(3.222986e-44), -4096(nan) ; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) ; R600-NEXT: ADD T1.Y, PS, PV.W, ; R600-NEXT: MUL_IEEE T0.Z, PV.Z, literal.x, ; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y, -; R600-NEXT: CNDE_INT * T1.W, T2.Y, PV.X, T1.X, +; R600-NEXT: CNDE_INT * T1.W, T2.Y, T0.X, PV.X, ; R600-NEXT: 1069064192(1.442383e+00), 1065353216(1.000000e+00) ; R600-NEXT: MUL_IEEE T0.X, PS, PV.W, ; R600-NEXT: ADD T0.Y, KC0[3].Z, -T1.Z, @@ -1300,12 
+1295,12 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; R600-NEXT: MUL_IEEE * T1.W, PS, literal.z, ; R600-NEXT: -1026650416(-1.032789e+02), 967029397(3.122284e-04) ; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x, -; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y, +; R600-NEXT: MUL_IEEE T3.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T2.Y, T1.X, literal.y, ; R600-NEXT: MULADD_IEEE T4.Z, T0.Y, literal.z, PV.W, ; R600-NEXT: FLT_TO_INT T0.W, PV.Z, ; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.w, -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38) ; R600-NEXT: 1069064192(1.442383e+00), 381(5.338947e-43) ; R600-NEXT: ADD_INT T4.X, PS, literal.x, ; R600-NEXT: MAX_INT T0.Y, PV.W, literal.y, @@ -1323,7 +1318,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; R600-NEXT: 102(1.429324e-43), -229(nan) ; R600-NEXT: ADD_INT * T6.X, T0.W, literal.x, ; R600-NEXT: -127(nan), 0(0.000000e+00) -; R600-NEXT: ALU clause starting at 107: +; R600-NEXT: ALU clause starting at 106: ; R600-NEXT: SETGT_UINT T0.Y, T0.W, literal.x, ; R600-NEXT: CNDE_INT T0.Z, T3.W, T0.Z, T2.W, BS:VEC_102/SCL_221 ; R600-NEXT: SETGT_INT T2.W, T0.W, literal.y, @@ -1339,25 +1334,25 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; R600-NEXT: SETGT_UINT T5.X, T1.Y, literal.x, ; R600-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W, ; R600-NEXT: MAX_INT T0.Z, T1.Y, literal.y, -; R600-NEXT: MUL_IEEE T4.W, T1.Z, literal.z, -; R600-NEXT: MUL_IEEE * T5.W, PV.Y, literal.w, +; R600-NEXT: MUL_IEEE T4.W, PV.Y, literal.z, +; R600-NEXT: MUL_IEEE * T5.W, T1.Z, literal.w, ; R600-NEXT: 254(3.559298e-43), -330(nan) -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) -; R600-NEXT: CNDE_INT T6.X, T3.W, PS, T3.Y, BS:VEC_021/SCL_122 -; R600-NEXT: MUL_IEEE T3.Y, PV.W, literal.x, +; R600-NEXT: 209715200(1.972152e-31), 
2130706432(1.701412e+38) +; R600-NEXT: MUL_IEEE T6.X, PS, literal.x, +; R600-NEXT: CNDE_INT T3.Y, T3.W, PV.W, T3.Y, BS:VEC_021/SCL_122 ; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y, ; R600-NEXT: ADD_INT T3.W, T1.Y, literal.z, -; R600-NEXT: SETGT_UINT * T5.W, T1.Y, literal.w, +; R600-NEXT: SETGT_UINT * T4.W, T1.Y, literal.w, ; R600-NEXT: 2130706432(1.701412e+38), 204(2.858649e-43) ; R600-NEXT: 102(1.429324e-43), -229(nan) ; R600-NEXT: CNDE_INT T8.X, PS, PV.Z, PV.W, ; R600-NEXT: SETGT_INT T5.Y, T1.Y, literal.x, -; R600-NEXT: CNDE_INT T0.Z, T0.Y, T4.W, PV.Y, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE_INT T2.W, T2.W, PV.X, T1.Z, +; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, T1.Z, +; R600-NEXT: CNDE_INT T2.W, T0.Y, T5.W, PV.X, BS:VEC_120/SCL_212 ; R600-NEXT: LSHL * T3.W, T4.Y, literal.y, ; R600-NEXT: -127(nan), 23(3.222986e-44) ; R600-NEXT: ADD_INT T6.X, PS, literal.x, -; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.W, PV.Z, +; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.Z, PV.W, ; R600-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T1.Y, ; R600-NEXT: CNDE_INT T0.W, T5.X, T7.X, T4.X, ; R600-NEXT: SETGT_INT * T2.W, T1.Y, literal.y, @@ -1365,18 +1360,18 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; R600-NEXT: CNDE_INT T4.X, PS, PV.Z, PV.W, ; R600-NEXT: MUL_IEEE T0.Y, PV.Y, PV.X, ; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].Z, -; R600-NEXT: CNDE_INT T0.W, T5.W, T2.Y, T1.W, -; R600-NEXT: MUL_IEEE * T1.W, T3.X, literal.y, +; R600-NEXT: MUL_IEEE T0.W, T2.Y, literal.y, +; R600-NEXT: CNDE_INT * T1.W, T4.W, T3.X, T1.W, ; R600-NEXT: -1026650416(-1.032789e+02), 2130706432(1.701412e+38) -; R600-NEXT: CNDE_INT T3.X, T5.X, T3.X, PS, -; R600-NEXT: CNDE_INT T1.Y, T5.Y, PV.W, T1.X, +; R600-NEXT: CNDE_INT T1.X, T5.Y, PS, T1.X, +; R600-NEXT: CNDE_INT T1.Y, T5.X, T2.Y, PV.W, ; R600-NEXT: CNDE T0.Z, PV.Z, PV.Y, 0.0, ; R600-NEXT: SETGT T0.W, KC0[3].Z, literal.x, ; R600-NEXT: LSHL * T1.W, PV.X, literal.y, ; R600-NEXT: 1118925336(8.872284e+01), 23(3.222986e-44) -; R600-NEXT: ADD_INT T1.X, PS, 
literal.x, +; R600-NEXT: ADD_INT T3.X, PS, literal.x, ; R600-NEXT: CNDE T0.Y, PV.W, PV.Z, literal.y, -; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, PV.X, +; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.X, PV.Y, ; R600-NEXT: CNDE T0.W, T2.X, T0.X, 0.0, ; R600-NEXT: SETGT * T1.W, KC0[3].Y, literal.z, ; R600-NEXT: 1065353216(1.000000e+00), 2139095040(INF) @@ -1397,197 +1392,193 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; CM-LABEL: s_exp_v3f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 102, @6, KC0[CB0:0-32], KC1[] -; CM-NEXT: ALU 80, @109, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X +; CM-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 77, @108, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T3.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 6: ; CM-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x, ; CM-NEXT: -4096(nan), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, ; CM-NEXT: ADD * T1.W, KC0[3].Y, -PV.W, -; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; CM-NEXT: RNDNE * T2.W, PV.Z, -; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; CM-NEXT: TRUNC T2.Z, PV.W, +; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, +; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y, +; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) +; CM-NEXT: RNDNE T1.Z, PV.W, ; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z, ; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W, -; CM-NEXT: ADD T0.Z, T0.Z, -T2.W, -; CM-NEXT: FLT_TO_INT * T0.W, PV.Z, +; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W, +; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212 ; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x, -; CM-NEXT: ADD * T1.W, PV.Z, PV.Y, +; 
CM-NEXT: TRUNC T1.Z, T1.Z, +; CM-NEXT: ADD * T0.W, PV.W, PV.Z, +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: FLT_TO_INT T0.Z, T1.Z, +; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y, +; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -330(nan) ; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; CM-NEXT: EXP_IEEE T0.X, T1.W, -; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x, -; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y, -; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z, -; CM-NEXT: 2130706432(1.701412e+38), -254(nan) -; CM-NEXT: -330(nan), 0(0.000000e+00) -; CM-NEXT: ADD_INT T1.X, T0.W, literal.x, -; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y, -; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w, -; CM-NEXT: -127(nan), 204(2.858649e-43) +; CM-NEXT: ADD_INT T1.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w, +; CM-NEXT: -254(nan), 204(2.858649e-43) ; CM-NEXT: 102(1.429324e-43), -229(nan) -; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y, -; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z, -; CM-NEXT: 254(3.559298e-43), -127(nan) -; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W, -; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z, -; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y, -; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43) +; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x, +; 
CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y, +; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, +; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x, +; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z, +; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X, +; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43) ; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W, -; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x, +; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W, ; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T1.X, T2.X, T0.Y, PV.W, -; CM-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, T0.X, +; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X, +; CM-NEXT: CNDE_INT T0.Y, T2.Y, T3.X, PV.Z, ; CM-NEXT: LSHL T0.Z, PV.Y, literal.x, -; CM-NEXT: AND_INT * T1.W, KC0[3].Z, literal.y, +; CM-NEXT: AND_INT * T0.W, KC0[3].Z, literal.y, ; CM-NEXT: 23(3.222986e-44), -4096(nan) -; CM-NEXT: MUL_IEEE T0.X, PV.W, literal.x, ; CM-NEXT: ADD T1.Y, KC0[3].Z, -PV.W, -; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Y, PV.X, -; CM-NEXT: 1069064192(1.442383e+00), 1065353216(1.000000e+00) -; CM-NEXT: MUL_IEEE T0.Y, PV.W, PV.Z, -; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x, -; CM-NEXT: RNDNE * T0.W, PV.X, -; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) +; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.x, +; CM-NEXT: CNDE_INT * T1.W, T3.W, PV.X, PV.Y, +; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, +; CM-NEXT: MUL_IEEE T0.Y, PV.Y, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y, +; CM-NEXT: AND_INT * T1.W, KC0[3].W, literal.z, +; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) +; CM-NEXT: -4096(nan), 0(0.000000e+00) ; CM-NEXT: SETGT T1.X, literal.x, KC0[3].Y, -; CM-NEXT: TRUNC T2.Y, PV.W, -; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.y, -; 
CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.z, PV.Z, -; CM-NEXT: -1026650416(-1.032789e+02), -4096(nan) -; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, PV.W, -; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y, -; CM-NEXT: FLT_TO_INT T0.Z, PV.Y, -; CM-NEXT: ADD * T1.W, KC0[3].W, -PV.Z, +; CM-NEXT: ADD T2.Y, KC0[3].W, -PV.W, +; CM-NEXT: RNDNE T1.Z, PV.Z, +; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.y, PV.Y, +; CM-NEXT: -1026650416(-1.032789e+02), 1069064192(1.442383e+00) +; CM-NEXT: MULADD_IEEE T2.X, T0.W, literal.x, PV.W, +; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z, +; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, T1.W, literal.y, BS:VEC_120/SCL_212 ; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) -; CM-NEXT: ADD T0.X, T0.X, -T0.W, -; CM-NEXT: MUL_IEEE T2.Y, PV.W, literal.x, -; CM-NEXT: MAX_INT T2.Z, PV.Z, literal.y, -; CM-NEXT: RNDNE * T0.W, PV.Y, -; CM-NEXT: 967029397(3.122284e-04), -330(nan) -; CM-NEXT: TRUNC T3.X, PV.W, -; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.x, -; CM-NEXT: MULADD_IEEE T2.Z, T1.W, literal.y, PV.Y, -; CM-NEXT: ADD * T1.W, PV.X, T2.X, -; CM-NEXT: 204(2.858649e-43), 1069064192(1.442383e+00) -; CM-NEXT: EXP_IEEE T0.X, T1.W, -; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x, -; CM-NEXT: MULADD_IEEE T2.Y, T1.Z, literal.y, T2.Z, BS:VEC_102/SCL_221 -; CM-NEXT: ADD T1.Z, T1.Y, -T0.W, -; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z, -; CM-NEXT: 102(1.429324e-43), 967029397(3.122284e-04) -; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_UINT T4.X, T0.Z, literal.x, -; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y, -; CM-NEXT: SETGT_UINT T2.Z, T0.Z, literal.z, -; CM-NEXT: ADD * T1.W, PV.Z, PV.Y, -; CM-NEXT: -229(nan), 2130706432(1.701412e+38) -; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; CM-NEXT: TRUNC T3.X, T1.Z, +; CM-NEXT: RNDNE T1.Y, PV.W, +; 
CM-NEXT: MULADD_IEEE T0.Z, T2.Y, literal.x, PV.Z, +; CM-NEXT: ADD * T2.W, PV.Y, PV.X, +; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X (MASKED), T2.W, +; CM-NEXT: EXP_IEEE T0.Y, T2.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T2.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T2.W, +; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, T0.Z, +; CM-NEXT: ADD T2.Y, T0.W, -T1.Y, BS:VEC_120/SCL_212 +; CM-NEXT: FLT_TO_INT T0.Z, T3.X, +; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.y, +; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31) +; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x, +; CM-NEXT: SETGT_UINT T3.Y, PV.Z, literal.y, +; CM-NEXT: TRUNC T1.Z, T1.Y, +; CM-NEXT: ADD * T1.W, PV.Y, PV.X, +; CM-NEXT: 209715200(1.972152e-31), -229(nan) ; CM-NEXT: EXP_IEEE T1.X (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T1.Z, T1.W, +; CM-NEXT: EXP_IEEE T1.Y, T1.W, +; CM-NEXT: EXP_IEEE T1.Z (MASKED), T1.W, ; CM-NEXT: EXP_IEEE * T1.W (MASKED), T1.W, -; CM-NEXT: ALU clause starting at 109: -; CM-NEXT: CNDE_INT T5.X, T2.Z, T0.W, T1.Y, -; CM-NEXT: CNDE_INT T1.Y, T4.X, T3.Y, T2.X, -; CM-NEXT: FLT_TO_INT T3.Z, T3.X, BS:VEC_120/SCL_212 -; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_INT T2.X, T0.Z, literal.x, -; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y, -; CM-NEXT: MUL_IEEE T4.Z, PV.W, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w, -; CM-NEXT: -127(nan), 209715200(1.972152e-31) -; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T3.X, PV.W, T0.W, PV.Z, -; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x, -; CM-NEXT: CNDE_INT T4.Z, PV.X, T1.Y, T0.Z, -; CM-NEXT: MAX_INT * T0.W, T3.Z, literal.y, -; CM-NEXT: 209715200(1.972152e-31), -330(nan) -; CM-NEXT: ADD_INT T6.X, PV.W, literal.x, -; CM-NEXT: ADD_INT T1.Y, T3.Z, literal.y, -; CM-NEXT: SETGT_UINT T5.Z, T3.Z, literal.z, -; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.w, BS:VEC_120/SCL_212 +; 
CM-NEXT: FLT_TO_INT T2.X, T1.Z, +; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x, +; CM-NEXT: CNDE_INT T1.Z, T3.Y, T3.X, T0.W, +; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: 209715200(1.972152e-31), -127(nan) +; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.Y, +; CM-NEXT: MUL_IEEE * T4.Y, PV.Y, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: ALU clause starting at 108: +; CM-NEXT: SETGT_UINT T1.Z, T2.X, literal.x, +; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.y, +; CM-NEXT: -229(nan), -330(nan) +; CM-NEXT: ADD_INT T4.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T5.Y, T0.Z, literal.y, +; CM-NEXT: CNDE_INT T2.Z, PV.Z, T4.Y, T2.Y, +; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z, ; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) -; CM-NEXT: -229(nan), 209715200(1.972152e-31) -; CM-NEXT: MUL_IEEE T7.X, PV.W, literal.x, -; CM-NEXT: MIN_INT T4.Y, T3.Z, literal.y, -; CM-NEXT: CNDE_INT T6.Z, PV.Z, PV.X, PV.Y, -; CM-NEXT: SETGT_INT * T2.W, T3.Z, literal.z, -; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43) ; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, T3.Z, -; CM-NEXT: MIN_INT T1.Y, T0.Z, literal.x, -; CM-NEXT: ADD_INT T6.Z, PV.Y, literal.y, -; CM-NEXT: ADD_INT * T3.W, T3.Z, literal.z, BS:VEC_120/SCL_212 -; CM-NEXT: 381(5.338947e-43), -254(nan) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T8.X, T1.W, PV.W, PV.Z, -; CM-NEXT: SETGT_INT T4.Y, T3.Z, literal.x, -; CM-NEXT: ADD_INT T3.Z, PV.Y, literal.y, -; CM-NEXT: ADD_INT * T1.W, T0.Z, literal.z, BS:VEC_120/SCL_212 +; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T1.Y, +; CM-NEXT: MUL_IEEE T0.Y, T0.Y, literal.x, +; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y, +; CM-NEXT: CNDE_INT * T2.W, T3.Y, PV.X, PV.Y, BS:VEC_120/SCL_212 +; CM-NEXT: 2130706432(1.701412e+38), -330(nan) +; CM-NEXT: CNDE_INT T4.X, T0.W, PV.W, T0.Z, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x, +; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y, +; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z, +; CM-NEXT: 
204(2.858649e-43), 102(1.429324e-43) +; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) +; CM-NEXT: ADD_INT T6.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y, +; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z, +; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z, +; CM-NEXT: -254(nan), -127(nan) +; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x, +; CM-NEXT: CNDE_INT T1.Y, T1.W, PV.W, T2.X, +; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X, +; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43) +; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x, +; CM-NEXT: ADD_INT T2.Y, PV.W, literal.y, +; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w, ; CM-NEXT: 127(1.779649e-43), -254(nan) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T9.X, T2.Z, PV.W, PV.Z, -; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X, -; CM-NEXT: CNDE_INT * T0.W, T5.Z, T7.X, T0.W, BS:VEC_201 -; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T6.X, T2.W, PV.W, T1.Z, -; CM-NEXT: LSHL T5.Y, PV.Z, literal.x, -; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.Z, PV.X, -; CM-NEXT: CNDE_INT * T0.W, T4.X, T3.Y, T2.Y, -; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T0.X, T2.X, PV.W, T0.X, -; CM-NEXT: LSHL T2.Y, PV.Z, literal.x, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y, +; CM-NEXT: SETGT_INT T2.Y, T0.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.Y, T1.Z, +; CM-NEXT: MUL_IEEE * T1.W, T7.X, literal.y, +; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T7.X, T3.Z, T7.X, PV.W, +; CM-NEXT: LSHL T1.Y, PV.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.X, PV.X, BS:VEC_021/SCL_122 +; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.y, +; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T4.X, T0.W, T0.Y, PV.W, +; CM-NEXT: LSHL T0.Y, 
PV.Z, literal.x, ; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T4.Y, PV.X, T3.X, BS:VEC_021/SCL_122 +; CM-NEXT: CNDE_INT * T0.W, T2.X, T5.X, PV.X, ; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00) ; CM-NEXT: MUL_IEEE T2.X, PV.W, PV.Z, -; CM-NEXT: SETGT T3.Y, literal.x, KC0[3].W, +; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].W, ; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T1.Y, PV.X, T5.X, +; CM-NEXT: CNDE_INT * T0.W, T2.Y, T3.X, PV.X, ; CM-NEXT: -1026650416(-1.032789e+02), 1065353216(1.000000e+00) -; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, -; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].Z, +; CM-NEXT: MUL_IEEE T3.X, PV.W, PV.Z, +; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].Z, ; CM-NEXT: CNDE T0.Z, PV.Y, PV.X, 0.0, ; CM-NEXT: SETGT * T0.W, KC0[3].W, literal.y, ; CM-NEXT: -1026650416(-1.032789e+02), 1118925336(8.872284e+01) ; CM-NEXT: CNDE T2.X, PV.W, PV.Z, literal.x, -; CM-NEXT: CNDE T1.Y, PV.Y, PV.X, 0.0, +; CM-NEXT: CNDE T0.Y, PV.Y, PV.X, 0.0, ; CM-NEXT: SETGT T0.Z, KC0[3].Z, literal.y, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; CM-NEXT: 2139095040(INF), 1118925336(8.872284e+01) ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T0.X, PV.W, literal.x, -; CM-NEXT: CNDE T1.Y, PV.Z, PV.Y, literal.y, -; CM-NEXT: CNDE T0.Z, T1.X, T0.Y, 0.0, +; CM-NEXT: LSHR T3.X, PV.W, literal.x, +; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, literal.y, +; CM-NEXT: CNDE T0.Z, T1.X, T0.X, 0.0, ; CM-NEXT: SETGT * T0.W, KC0[3].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 2139095040(INF) ; CM-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00) -; CM-NEXT: CNDE * T1.X, PV.W, PV.Z, literal.x, +; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x, ; CM-NEXT: 2139095040(INF), 0(0.000000e+00) -; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call <3 x float> @llvm.exp.v3f32(<3 x float> %in) store <3 x float> %result, ptr addrspace(1) %out @@ -2050,227 +2041,224 @@ define 
amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; R600-LABEL: s_exp_v4f32: ; R600: ; %bb.0: ; R600-NEXT: ALU 98, @6, KC0[CB0:0-32], KC1[] -; R600-NEXT: ALU 98, @105, KC0[CB0:0-32], KC1[] -; R600-NEXT: ALU 24, @204, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 95, @105, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 24, @201, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 6: ; R600-NEXT: AND_INT * T0.W, KC0[3].Z, literal.x, ; R600-NEXT: -4096(nan), 0(0.000000e+00) -; R600-NEXT: ADD T1.W, KC0[3].Z, -PV.W, -; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x, +; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.W, +; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x, +; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y, +; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) +; R600-NEXT: RNDNE T4.W, PS, +; R600-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.W, BS:VEC_021/SCL_122 ; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; R600-NEXT: RNDNE T3.W, PS, -; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x, +; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PS, +; R600-NEXT: ADD * T1.W, T3.W, -PV.W, ; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS, -; R600-NEXT: TRUNC * T4.W, PV.W, -; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; R600-NEXT: FLT_TO_INT T0.Z, PS, -; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W, -; R600-NEXT: ADD * T1.W, T2.W, -T3.W, -; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; R600-NEXT: ADD T1.Z, PS, PV.W, -; R600-NEXT: MAX_INT T0.W, PV.Z, literal.x, -; R600-NEXT: MIN_INT * T1.W, PV.Z, literal.y, -; R600-NEXT: -330(nan), 381(5.338947e-43) -; R600-NEXT: ADD_INT T0.X, PS, literal.x, -; R600-NEXT: ADD_INT T0.Y, PV.W, literal.y, -; R600-NEXT: ADD_INT T2.Z, T0.Z, literal.z, -; R600-NEXT: SETGT_UINT T0.W, T0.Z, literal.w, -; R600-NEXT: EXP_IEEE * T1.X, PV.Z, -; R600-NEXT: -254(nan), 
204(2.858649e-43) -; R600-NEXT: 102(1.429324e-43), -229(nan) -; R600-NEXT: ADD_INT T2.X, T0.Z, literal.x, -; R600-NEXT: SETGT_UINT T1.Y, T0.Z, literal.y, -; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, -; R600-NEXT: SETGT_INT T1.W, T0.Z, literal.x, -; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z, -; R600-NEXT: -127(nan), 254(3.559298e-43) -; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x, -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y, -; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z, -; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X, -; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z, -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) -; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; R600-NEXT: AND_INT T2.Y, KC0[4].X, literal.x, -; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W, -; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W, -; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.y, -; R600-NEXT: -4096(nan), 2130706432(1.701412e+38) -; R600-NEXT: CNDE_INT T0.X, T1.Y, T3.X, PS, -; R600-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.X, -; R600-NEXT: LSHL T0.Z, PV.Z, literal.x, -; R600-NEXT: ADD T0.W, KC0[4].X, -PV.Y, -; R600-NEXT: MUL_IEEE * T1.W, PV.Y, literal.y, -; R600-NEXT: 23(3.222986e-44), 1069064192(1.442383e+00) -; R600-NEXT: RNDNE T1.Y, PS, -; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; R600-NEXT: ADD_INT T2.W, PV.Z, literal.y, -; R600-NEXT: CNDE_INT * T3.W, T4.W, PV.Y, PV.X, -; R600-NEXT: 967029397(3.122284e-04), 1065353216(1.000000e+00) -; R600-NEXT: MUL_IEEE T0.Y, PS, PV.W, -; R600-NEXT: AND_INT T0.Z, KC0[3].W, literal.x, -; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.Z, -; R600-NEXT: TRUNC * T2.W, PV.Y, -; R600-NEXT: -4096(nan), 1069064192(1.442383e+00) -; R600-NEXT: SETGT T0.X, literal.x, KC0[3].Z, -; R600-NEXT: FLT_TO_INT T3.Y, PS, -; R600-NEXT: MULADD_IEEE T1.Z, T2.Y, literal.y, PV.W, -; R600-NEXT: ADD T0.W, T1.W, -T1.Y, -; R600-NEXT: MUL_IEEE * T1.W, PV.Z, literal.z, -; R600-NEXT: -1026650416(-1.032789e+02), 
967029397(3.122284e-04) -; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) -; R600-NEXT: RNDNE T1.X, PS, -; R600-NEXT: AND_INT T1.Y, KC0[3].Y, literal.x, -; R600-NEXT: ADD T1.Z, PV.W, PV.Z, -; R600-NEXT: MAX_INT T0.W, PV.Y, literal.y, -; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.z, -; R600-NEXT: -4096(nan), -330(nan) +; R600-NEXT: ADD T0.W, PS, PV.W, +; R600-NEXT: TRUNC * T1.W, T4.W, +; R600-NEXT: FLT_TO_INT T1.W, PS, +; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x, +; R600-NEXT: MAX_INT T0.W, PV.W, literal.y, +; R600-NEXT: MIN_INT * T2.W, PV.W, literal.z, +; R600-NEXT: 209715200(1.972152e-31), -330(nan) ; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; R600-NEXT: ADD_INT T2.X, PS, literal.x, -; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y, -; R600-NEXT: ADD_INT T2.Z, T3.Y, literal.z, -; R600-NEXT: SETGT_UINT T0.W, T3.Y, literal.w, -; R600-NEXT: EXP_IEEE * T1.Z, PV.Z, -; R600-NEXT: -254(nan), 204(2.858649e-43) -; R600-NEXT: 102(1.429324e-43), -229(nan) -; R600-NEXT: ADD_INT T3.X, T3.Y, literal.x, -; R600-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y, -; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z, -; R600-NEXT: SETGT_INT T2.W, T3.Y, literal.x, -; R600-NEXT: MUL_IEEE * T3.W, PS, literal.z, +; R600-NEXT: ADD_INT T1.X, PS, literal.x, +; R600-NEXT: AND_INT T0.Y, KC0[4].X, literal.y, +; R600-NEXT: ADD_INT T1.Z, PV.W, literal.z, +; R600-NEXT: ADD_INT * T0.W, T1.W, literal.w, +; R600-NEXT: -254(nan), -4096(nan) +; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.x, +; R600-NEXT: -229(nan), 0(0.000000e+00) +; R600-NEXT: ADD_INT T2.X, T1.W, literal.x, +; R600-NEXT: SETGT_UINT T1.Y, T1.W, literal.y, +; R600-NEXT: CNDE_INT T1.Z, PV.W, T1.Z, T0.W, +; R600-NEXT: SETGT_INT T0.W, T1.W, literal.x, +; R600-NEXT: ADD * T3.W, KC0[4].X, -T0.Y, ; R600-NEXT: -127(nan), 254(3.559298e-43) -; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T4.X, T1.Z, literal.x, -; R600-NEXT: MUL_IEEE T2.Y, 
PS, literal.y, -; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Z, T3.Y, -; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T2.X, -; R600-NEXT: SETGT_INT * T5.W, T3.Y, literal.z, -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; R600-NEXT: MUL_IEEE T3.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T2.Y, T0.Y, literal.y, +; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T1.W, +; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T1.X, +; R600-NEXT: SETGT_INT * T1.W, T1.W, literal.z, +; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) ; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; R600-NEXT: ADD T2.X, KC0[3].W, -T0.Z, -; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W, -; R600-NEXT: CNDE_INT * T2.Z, T0.W, PV.Y, T3.W, -; R600-NEXT: ALU clause starting at 105: -; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.x, -; R600-NEXT: ADD * T3.W, KC0[3].Y, -T1.Y, +; R600-NEXT: CNDE_INT T1.X, PS, PV.Z, PV.W, +; R600-NEXT: RNDNE T3.Y, PV.Y, +; R600-NEXT: MULADD_IEEE T1.Z, T3.W, literal.x, PV.X, +; R600-NEXT: MUL_IEEE T3.W, T0.Z, literal.y, +; R600-NEXT: MUL_IEEE * T4.W, T0.X, literal.z, +; R600-NEXT: 1069064192(1.442383e+00), 209715200(1.972152e-31) ; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.X, PS, literal.x, +; R600-NEXT: CNDE_INT T4.Y, T2.W, PV.W, T0.Z, +; R600-NEXT: MULADD_IEEE T0.Z, T0.Y, literal.y, PV.Z, +; R600-NEXT: ADD T2.W, T2.Y, -PV.Y, BS:VEC_120/SCL_212 +; R600-NEXT: AND_INT * T3.W, KC0[3].Y, literal.z, +; R600-NEXT: 2130706432(1.701412e+38), 967029397(3.122284e-04) +; R600-NEXT: -4096(nan), 0(0.000000e+00) ; R600-NEXT: MUL_IEEE T3.X, PS, literal.x, -; R600-NEXT: MUL_IEEE T2.Y, T1.Y, literal.y, -; R600-NEXT: CNDE_INT T3.Z, T4.Y, T4.X, PV.W, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE_INT T0.W, T2.W, T2.Z, T1.Z, -; R600-NEXT: LSHL * T2.W, T3.Y, literal.z, -; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) -; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; R600-NEXT: ADD_INT T4.X, PS, literal.x, -; R600-NEXT: CNDE_INT T3.Y, T5.W, PV.W, PV.Z, -; R600-NEXT: 
RNDNE T1.Z, PV.Y, -; R600-NEXT: MULADD_IEEE T0.W, T3.W, literal.y, PV.X, BS:VEC_120/SCL_212 -; R600-NEXT: MUL_IEEE * T2.W, T2.X, literal.z, +; R600-NEXT: ADD T0.Y, PV.W, PV.Z, +; R600-NEXT: CNDE_INT T0.Z, T0.W, PV.Y, T0.X, BS:VEC_021/SCL_122 +; R600-NEXT: CNDE_INT T0.W, T1.Y, T4.W, PV.X, +; R600-NEXT: LSHL * T2.W, T1.X, literal.y, +; R600-NEXT: 1069064192(1.442383e+00), 23(3.222986e-44) +; R600-NEXT: AND_INT T0.X, KC0[3].W, literal.x, +; R600-NEXT: TRUNC T1.Y, T3.Y, +; R600-NEXT: ADD_INT T1.Z, PS, literal.y, +; R600-NEXT: CNDE_INT T0.W, T1.W, PV.Z, PV.W, +; R600-NEXT: EXP_IEEE * T0.Y, PV.Y, +; R600-NEXT: -4096(nan), 1065353216(1.000000e+00) +; R600-NEXT: MUL_IEEE T1.X, PV.W, PV.Z, +; R600-NEXT: FLT_TO_INT T1.Y, PV.Y, +; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x, +; R600-NEXT: ADD T0.W, KC0[3].W, -PV.X, +; R600-NEXT: RNDNE * T1.W, T3.X, +; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; R600-NEXT: SETGT T2.X, literal.x, KC0[3].Z, +; R600-NEXT: TRUNC T2.Y, PS, +; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.y, +; R600-NEXT: MUL_IEEE T2.W, PV.Z, literal.z, +; R600-NEXT: MAX_INT * T4.W, PV.Y, literal.w, +; R600-NEXT: -1026650416(-1.032789e+02), 967029397(3.122284e-04) +; R600-NEXT: 209715200(1.972152e-31), -330(nan) +; R600-NEXT: ADD T4.X, KC0[3].Y, -T3.W, +; R600-NEXT: ADD_INT T3.Y, PS, literal.x, +; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y, +; R600-NEXT: SETGT_UINT T4.W, T1.Y, literal.z, +; R600-NEXT: MIN_INT * T5.W, T1.Y, literal.w, +; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; R600-NEXT: -229(nan), 381(5.338947e-43) +; R600-NEXT: ADD_INT T5.X, PS, literal.x, +; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y, +; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z, +; R600-NEXT: CNDE_INT T5.W, PV.W, PV.Y, PV.Z, +; R600-NEXT: SETGT_INT * T6.W, T1.Y, literal.y, +; R600-NEXT: -254(nan), -127(nan) +; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T6.X, T0.Y, literal.x, +; R600-NEXT: CNDE_INT T3.Y, PS, PV.W, T1.Y, +; R600-NEXT: CNDE_INT * T2.Z, PV.Z, PV.Y, 
PV.X, +; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) +; R600-NEXT: ALU clause starting at 105: +; R600-NEXT: SETGT_INT T5.W, T1.Y, literal.x, +; R600-NEXT: MUL_IEEE * T7.W, T4.X, literal.y, +; R600-NEXT: 127(1.779649e-43), 967029397(3.122284e-04) +; R600-NEXT: MUL_IEEE T5.X, T0.X, literal.x, +; R600-NEXT: MULADD_IEEE T1.Y, T4.X, literal.x, PS, BS:VEC_120/SCL_212 +; R600-NEXT: CNDE_INT T2.Z, PV.W, T3.Y, T2.Z, +; R600-NEXT: MUL_IEEE T7.W, T6.X, literal.y, BS:VEC_201 +; R600-NEXT: CNDE_INT * T2.W, T4.W, T2.W, T0.Z, +; R600-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38) +; R600-NEXT: CNDE_INT T4.X, T6.W, PS, T0.Y, +; R600-NEXT: CNDE_INT T0.Y, T3.Z, T6.X, PV.W, +; R600-NEXT: LSHL T0.Z, PV.Z, literal.x, +; R600-NEXT: MULADD_IEEE T2.W, T3.W, literal.y, PV.Y, BS:VEC_201 +; R600-NEXT: ADD * T1.W, T3.X, -T1.W, +; R600-NEXT: 23(3.222986e-44), 967029397(3.122284e-04) +; R600-NEXT: ADD T3.X, PS, PV.W, +; R600-NEXT: ADD_INT T1.Y, PV.Z, literal.x, +; R600-NEXT: CNDE_INT T0.Z, T5.W, PV.X, PV.Y, +; R600-NEXT: RNDNE T1.W, T5.X, +; R600-NEXT: MULADD_IEEE * T0.W, T0.W, literal.y, T1.Z, BS:VEC_021/SCL_122 ; R600-NEXT: 1065353216(1.000000e+00), 1069064192(1.442383e+00) -; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; R600-NEXT: MULADD_IEEE T2.X, T2.X, literal.x, PS, -; R600-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.y, PV.W, -; R600-NEXT: ADD T2.Z, T2.Y, -PV.Z, BS:VEC_120/SCL_212 -; R600-NEXT: MUL_IEEE T0.W, PV.Y, PV.X, -; R600-NEXT: SETGT * T2.W, literal.z, KC0[4].X, -; R600-NEXT: 1069064192(1.442383e+00), 967029397(3.122284e-04) -; R600-NEXT: -1026650416(-1.032789e+02), 0(0.000000e+00) -; R600-NEXT: CNDE T3.X, PS, PV.W, 0.0, -; R600-NEXT: ADD T1.Y, PV.Z, PV.Y, -; R600-NEXT: TRUNC T1.Z, T1.Z, -; R600-NEXT: MULADD_IEEE T0.W, T0.Z, literal.x, PV.X, BS:VEC_120/SCL_212 -; R600-NEXT: ADD * T1.W, T1.W, -T1.X, -; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00) -; R600-NEXT: SETGT T2.X, KC0[4].X, literal.x, -; R600-NEXT: ADD T2.Y, PS, PV.W, -; R600-NEXT: FLT_TO_INT 
T0.Z, PV.Z, -; R600-NEXT: TRUNC T0.W, T1.X, -; R600-NEXT: EXP_IEEE * T1.X, PV.Y, -; R600-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T4.X, PS, literal.x, -; R600-NEXT: FLT_TO_INT T1.Y, PV.W, -; R600-NEXT: MAX_INT T1.Z, PV.Z, literal.y, -; R600-NEXT: MUL_IEEE T0.W, PS, literal.z, -; R600-NEXT: EXP_IEEE * T1.W, PV.Y, -; R600-NEXT: 2130706432(1.701412e+38), -330(nan) +; R600-NEXT: MULADD_IEEE T0.X, T0.X, literal.x, PS, +; R600-NEXT: ADD T0.Y, T5.X, -PV.W, BS:VEC_120/SCL_212 +; R600-NEXT: MUL_IEEE T0.Z, PV.Z, PV.Y, +; R600-NEXT: SETGT T0.W, literal.y, KC0[4].X, +; R600-NEXT: EXP_IEEE * T1.Y, PV.X, +; R600-NEXT: 967029397(3.122284e-04), -1026650416(-1.032789e+02) +; R600-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0, +; R600-NEXT: ADD T0.Y, PV.Y, PV.X, +; R600-NEXT: FLT_TO_INT T0.Z, T2.Y, +; R600-NEXT: TRUNC T0.W, T1.W, +; R600-NEXT: MUL_IEEE * T1.W, PS, literal.x, ; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T5.X, PV.W, literal.x, -; R600-NEXT: MUL_IEEE T2.Y, PS, literal.x, -; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y, -; R600-NEXT: ADD_INT T2.W, T0.Z, literal.z, -; R600-NEXT: MAX_INT * T3.W, PV.Y, literal.w, -; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43) -; R600-NEXT: 102(1.429324e-43), -330(nan) -; R600-NEXT: SETGT_UINT T6.X, T0.Z, literal.x, -; R600-NEXT: ADD_INT T3.Y, PS, literal.y, -; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.z, -; R600-NEXT: SETGT_UINT T3.W, T1.Y, literal.x, -; R600-NEXT: MIN_INT * T4.W, T1.Y, literal.w, +; R600-NEXT: SETGT T0.X, KC0[4].X, literal.x, +; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y, +; R600-NEXT: FLT_TO_INT T1.Z, PV.W, +; R600-NEXT: MAX_INT T0.W, PV.Z, literal.z, +; R600-NEXT: EXP_IEEE * T0.Y, PV.Y, +; R600-NEXT: 1118925336(8.872284e+01), 209715200(1.972152e-31) +; R600-NEXT: -330(nan), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T4.X, T1.Y, literal.x, +; R600-NEXT: MUL_IEEE T3.Y, PS, literal.y, +; R600-NEXT: ADD_INT T2.Z, PV.W, literal.z, +; R600-NEXT: ADD_INT * T0.W, T0.Z, literal.w, +; 
R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; R600-NEXT: MAX_INT * T2.W, T1.Z, literal.x, +; R600-NEXT: -330(nan), 0(0.000000e+00) +; R600-NEXT: SETGT_UINT T5.X, T0.Z, literal.x, +; R600-NEXT: ADD_INT T4.Y, PV.W, literal.y, +; R600-NEXT: ADD_INT T3.Z, T1.Z, literal.z, BS:VEC_120/SCL_212 +; R600-NEXT: SETGT_UINT T2.W, T1.Z, literal.x, BS:VEC_120/SCL_212 +; R600-NEXT: MIN_INT * T3.W, T1.Z, literal.w, ; R600-NEXT: -229(nan), 204(2.858649e-43) ; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43) -; R600-NEXT: ADD_INT T7.X, PS, literal.x, -; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y, -; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z, -; R600-NEXT: CNDE_INT T4.W, PV.W, PV.Y, PV.Z, -; R600-NEXT: SETGT_INT * T5.W, T1.Y, literal.y, +; R600-NEXT: ADD_INT T6.X, PS, literal.x, +; R600-NEXT: ADD_INT T5.Y, T1.Z, literal.y, +; R600-NEXT: SETGT_UINT T4.Z, T1.Z, literal.z, +; R600-NEXT: CNDE_INT T3.W, PV.W, PV.Y, PV.Z, +; R600-NEXT: SETGT_INT * T4.W, T1.Z, literal.y, ; R600-NEXT: -254(nan), -127(nan) ; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T8.X, PS, PV.W, T1.Y, -; R600-NEXT: CNDE_INT T3.Y, PV.Z, PV.Y, PV.X, -; R600-NEXT: SETGT_INT T2.Z, T1.Y, literal.x, -; R600-NEXT: CNDE_INT T2.W, T6.X, T1.Z, T2.W, -; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.y, +; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T1.Z, BS:VEC_021/SCL_122 +; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.Y, PV.X, +; R600-NEXT: SETGT_INT T1.Z, T1.Z, literal.x, BS:VEC_120/SCL_212 +; R600-NEXT: CNDE_INT T0.W, T5.X, T2.Z, T0.W, BS:VEC_102/SCL_221 +; R600-NEXT: SETGT_INT * T3.W, T0.Z, literal.y, ; R600-NEXT: 127(1.779649e-43), -127(nan) -; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T0.Z, -; R600-NEXT: CNDE_INT T1.Y, PV.Z, PV.X, PV.Y, -; R600-NEXT: MIN_INT T1.Z, T0.Z, literal.x, -; R600-NEXT: MUL_IEEE T2.W, T1.W, literal.y, -; R600-NEXT: MUL_IEEE * T6.W, T2.Y, literal.z, -; R600-NEXT: 381(5.338947e-43), 2130706432(1.701412e+38) -; R600-NEXT: 
209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T8.X, T3.W, PS, T2.Y, -; R600-NEXT: MUL_IEEE T2.Y, PV.W, literal.x, -; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y, -; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z, -; R600-NEXT: SETGT_UINT * T6.W, T0.Z, literal.w, +; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T0.Z, +; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.X, PV.Y, +; R600-NEXT: MIN_INT T2.Z, T0.Z, literal.x, +; R600-NEXT: MUL_IEEE T0.W, T3.Y, literal.y, +; R600-NEXT: MUL_IEEE * T5.W, T0.Y, literal.z, +; R600-NEXT: 381(5.338947e-43), 209715200(1.972152e-31) +; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T7.X, PS, literal.x, +; R600-NEXT: CNDE_INT T3.Y, T2.W, PV.W, T3.Y, +; R600-NEXT: ADD_INT T2.Z, PV.Z, literal.y, +; R600-NEXT: ADD_INT T0.W, T0.Z, literal.z, +; R600-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w, ; R600-NEXT: 2130706432(1.701412e+38), -254(nan) ; R600-NEXT: -127(nan), 254(3.559298e-43) -; R600-NEXT: CNDE_INT T9.X, PS, PV.W, PV.Z, -; R600-NEXT: SETGT_INT T3.Y, T0.Z, literal.x, -; R600-NEXT: CNDE_INT T0.Z, T3.Z, T2.W, PV.Y, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE_INT T1.W, T5.W, PV.X, T1.W, BS:VEC_021/SCL_122 -; R600-NEXT: LSHL * T2.W, T1.Y, literal.y, +; R600-NEXT: CNDE_INT T8.X, PS, PV.W, PV.Z, +; R600-NEXT: SETGT_INT T5.Y, T0.Z, literal.x, +; R600-NEXT: CNDE_INT T0.Z, T4.W, PV.Y, T0.Y, BS:VEC_021/SCL_122 +; R600-NEXT: CNDE_INT T0.W, T4.Z, T5.W, PV.X, BS:VEC_120/SCL_212 +; R600-NEXT: LSHL * T4.W, T4.Y, literal.y, ; R600-NEXT: 127(1.779649e-43), 23(3.222986e-44) -; R600-NEXT: ADD_INT T8.X, PS, literal.x, -; R600-NEXT: CNDE_INT T1.Y, T2.Z, PV.W, PV.Z, -; R600-NEXT: CNDE_INT T0.Z, PV.Y, T7.X, PV.X, -; R600-NEXT: CNDE_INT * T0.W, T6.X, T5.X, T0.W, BS:VEC_021/SCL_122 -; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE * T1.W, T4.X, literal.x, -; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T4.X, T6.W, T4.X, PV.W, -; R600-NEXT: CNDE_INT * T2.Y, T4.W, T0.W, T1.X, 
BS:VEC_120/SCL_212 -; R600-NEXT: ALU clause starting at 204: +; R600-NEXT: ADD_INT T7.X, PS, literal.x, +; R600-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, PV.W, +; R600-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X, +; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y, +; R600-NEXT: CNDE_INT * T1.W, T5.X, T2.Y, T1.W, +; R600-NEXT: 1065353216(1.000000e+00), 2130706432(1.701412e+38) +; R600-NEXT: CNDE_INT T5.X, T3.W, PS, T1.Y, +; R600-NEXT: CNDE_INT * T1.Y, T2.W, T4.X, PV.W, BS:VEC_120/SCL_212 +; R600-NEXT: ALU clause starting at 201: ; R600-NEXT: LSHL T0.Z, T0.Z, literal.x, -; R600-NEXT: MUL_IEEE T0.W, T1.Y, T8.X, +; R600-NEXT: MUL_IEEE T0.W, T0.Y, T7.X, ; R600-NEXT: SETGT * T1.W, literal.y, KC0[3].W, ; R600-NEXT: 23(3.222986e-44), -1026650416(-1.032789e+02) -; R600-NEXT: CNDE T1.X, PS, PV.W, 0.0, -; R600-NEXT: SETGT T1.Y, KC0[3].W, literal.x, +; R600-NEXT: CNDE T4.X, PS, PV.W, 0.0, +; R600-NEXT: SETGT T0.Y, KC0[3].W, literal.x, ; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y, -; R600-NEXT: CNDE_INT T0.W, T3.Y, T2.Y, T4.X, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE * T1.W, T2.X, T3.X, literal.z, +; R600-NEXT: CNDE_INT T0.W, T5.Y, T5.X, T1.Y, BS:VEC_102/SCL_221 +; R600-NEXT: CNDE * T1.W, T0.X, T3.X, literal.z, ; R600-NEXT: 1118925336(8.872284e+01), 1065353216(1.000000e+00) ; R600-NEXT: 2139095040(INF), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T2.X, PV.W, PV.Z, +; R600-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, ; R600-NEXT: SETGT T2.Y, literal.x, KC0[3].Y, ; R600-NEXT: CNDE T1.Z, PV.Y, PV.X, literal.y, -; R600-NEXT: CNDE T0.W, T0.X, T0.Y, 0.0, +; R600-NEXT: CNDE T0.W, T2.X, T1.X, 0.0, ; R600-NEXT: SETGT * T2.W, KC0[3].Z, literal.z, ; R600-NEXT: -1026650416(-1.032789e+02), 2139095040(INF) ; R600-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00) @@ -2285,8 +2273,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; CM-LABEL: s_exp_v4f32: ; CM: ; %bb.0: ; CM-NEXT: ALU 97, @6, KC0[CB0:0-32], KC1[] -; CM-NEXT: ALU 100, @104, KC0[CB0:0-32], KC1[] -; CM-NEXT: ALU 36, @205, KC0[CB0:0-32], 
KC1[] +; CM-NEXT: ALU 97, @104, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 35, @202, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD @@ -2305,224 +2293,220 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) ; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W, ; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z, -; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x, -; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: MUL_IEEE T0.Z, T2.W, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x, ; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) ; CM-NEXT: TRUNC T1.X, T1.Z, -; CM-NEXT: RNDNE T2.Y, PV.W, -; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z, -; CM-NEXT: ADD * T1.W, PV.Y, PV.X, +; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.W, +; CM-NEXT: RNDNE T1.Z, PV.Z, +; CM-NEXT: ADD * T0.W, PV.Y, PV.X, ; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: TRUNC T2.X, T1.Z, +; CM-NEXT: MULADD_IEEE T0.Y, T2.W, literal.x, T1.Y, +; CM-NEXT: FLT_TO_INT T2.Z, T1.X, +; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y, +; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31) +; CM-NEXT: ADD T1.X, T0.Z, -T1.Z, +; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.x, +; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y, +; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -330(nan) +; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) +; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T0.Z, T2.Z, literal.z, +; CM-NEXT: SETGT_UINT * T1.W, T2.Z, literal.w, +; CM-NEXT: -254(nan), 204(2.858649e-43) +; CM-NEXT: 102(1.429324e-43), -229(nan) +; CM-NEXT: ADD_INT T4.X, T2.Z, literal.x, +; CM-NEXT: SETGT_UINT 
T3.Y, T2.Z, literal.y, +; CM-NEXT: CNDE_INT T0.Z, PV.W, PV.Y, PV.Z, +; CM-NEXT: SETGT_INT * T2.W, T2.Z, literal.x, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: MUL_IEEE T5.X, T0.X, literal.x, +; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T2.Z, +; CM-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T3.X, +; CM-NEXT: SETGT_INT * T3.W, T2.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43) +; CM-NEXT: AND_INT T3.X, KC0[3].Z, literal.x, +; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Y, PV.Z, +; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y, +; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.Y, T0.W, +; CM-NEXT: -4096(nan), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X, +; CM-NEXT: CNDE_INT T1.Y, T3.Y, T5.X, PV.Z, +; CM-NEXT: LSHL T0.Z, PV.Y, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y, +; CM-NEXT: 23(3.222986e-44), 1069064192(1.442383e+00) +; CM-NEXT: RNDNE T4.X, PV.W, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Z, T3.W, PV.X, PV.Y, +; CM-NEXT: ADD * T1.W, T1.X, T0.Y, +; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) ; CM-NEXT: EXP_IEEE T0.X, T1.W, ; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, ; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, ; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z, -; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212 -; CM-NEXT: FLT_TO_INT T0.Z, T1.X, -; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y, -; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31) -; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x, +; CM-NEXT: MUL_IEEE T1.X, T0.Z, T2.Y, +; CM-NEXT: TRUNC T0.Y, T4.X, +; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212 +; CM-NEXT: MUL_IEEE * T1.W, PV.X, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x, ; CM-NEXT: MUL_IEEE T1.Y, T0.X, literal.y, ; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.z, -; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.w, +; CM-NEXT: MIN_INT * T2.W, PV.Z, literal.w, ; CM-NEXT: 209715200(1.972152e-31), 
2130706432(1.701412e+38) ; CM-NEXT: -330(nan), 381(5.338947e-43) -; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, -; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T5.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y, ; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w, +; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w, ; CM-NEXT: -254(nan), 204(2.858649e-43) ; CM-NEXT: 102(1.429324e-43), -229(nan) -; CM-NEXT: ADD_INT T4.X, T0.Z, literal.x, -; CM-NEXT: SETGT_UINT T4.Y, T0.Z, literal.y, +; CM-NEXT: ADD_INT T6.X, T0.Z, literal.x, +; CM-NEXT: SETGT_UINT T3.Y, T0.Z, literal.y, ; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x, +; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.x, ; CM-NEXT: -127(nan), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z, -; CM-NEXT: CNDE_INT T3.Y, PV.Y, PV.X, T3.X, -; CM-NEXT: SETGT_INT T0.Z, T0.Z, literal.x, -; CM-NEXT: MUL_IEEE * T3.W, T1.Y, literal.y, -; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38) -; CM-NEXT: CNDE_INT T3.X, T4.Y, T1.Y, PV.W, -; CM-NEXT: AND_INT T1.Y, KC0[3].Z, literal.x, -; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y, -; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.X, T0.W, -; CM-NEXT: -4096(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X, -; CM-NEXT: LSHL T3.Y, PV.Z, literal.x, -; CM-NEXT: TRUNC T1.Z, T2.Y, -; CM-NEXT: ADD * T0.W, KC0[3].Z, -PV.Y, -; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x, -; CM-NEXT: FLT_TO_INT T2.Y, PV.Z, -; CM-NEXT: ADD_INT T1.Z, PV.Y, literal.y, -; CM-NEXT: CNDE_INT * T1.W, T0.Z, PV.X, T3.X, -; CM-NEXT: 967029397(3.122284e-04), 1065353216(1.000000e+00) -; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, -; CM-NEXT: MIN_INT T3.Y, PV.Y, literal.x, -; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.X, -; CM-NEXT: ADD * T0.W, T0.Y, T2.X, -; CM-NEXT: 381(5.338947e-43), 1069064192(1.442383e+00) -; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, -; CM-NEXT: EXP_IEEE 
T0.Y, T0.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, -; CM-NEXT: MULADD_IEEE T1.X, T1.Y, literal.x, T0.Z, -; CM-NEXT: MUL_IEEE T4.Y, PV.Y, literal.y, -; CM-NEXT: ADD_INT T0.Z, T3.Y, literal.z, BS:VEC_120/SCL_212 -; CM-NEXT: MAX_INT * T0.W, T2.Y, literal.w, BS:VEC_201 -; CM-NEXT: 967029397(3.122284e-04), 2130706432(1.701412e+38) -; CM-NEXT: -254(nan), -330(nan) -; CM-NEXT: ADD_INT T2.X, T2.Y, literal.x, -; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y, -; CM-NEXT: ADD_INT T1.Z, T2.Y, literal.z, -; CM-NEXT: SETGT_UINT * T0.W, T2.Y, literal.w, -; CM-NEXT: -127(nan), 204(2.858649e-43) -; CM-NEXT: 102(1.429324e-43), -229(nan) -; CM-NEXT: SETGT_UINT T3.X, T2.Y, literal.x, -; CM-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT T1.Z, T2.Y, literal.y, -; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.z, BS:VEC_120/SCL_212 -; CM-NEXT: 254(3.559298e-43), -127(nan) -; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T4.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT * T3.Y, PV.Z, PV.Y, T2.Y, -; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; CM-NEXT: ALU clause starting at 104: -; CM-NEXT: CNDE_INT T0.Z, T3.X, T2.X, T0.Z, -; CM-NEXT: SETGT_INT * T2.W, T2.Y, literal.x, +; CM-NEXT: CNDE_INT T7.X, PV.W, PV.Z, T0.Z, +; CM-NEXT: CNDE_INT T2.Y, PV.Y, PV.X, T5.X, +; CM-NEXT: SETGT_INT * T0.Z, T0.Z, literal.x, ; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T2.X, T1.Y, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.W, T3.Y, PV.Z, -; CM-NEXT: CNDE_INT T0.Z, T0.W, T4.X, T1.W, -; CM-NEXT: MUL_IEEE * T0.W, T4.Y, literal.y, BS:VEC_201 -; CM-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38) -; CM-NEXT: AND_INT T4.X, KC0[4].X, literal.x, -; CM-NEXT: CNDE_INT T2.Y, T3.X, T4.Y, PV.W, -; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.Y, -; CM-NEXT: LSHL * T0.W, PV.Y, literal.y, -; CM-NEXT: -4096(nan), 23(3.222986e-44) -; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.Z, PV.Y, -; CM-NEXT: 
MUL_IEEE T0.Z, PV.X, literal.y, -; CM-NEXT: RNDNE * T0.W, T2.X, -; CM-NEXT: 1065353216(1.000000e+00), 1069064192(1.442383e+00) -; CM-NEXT: ADD T2.X, T2.X, -PV.W, -; CM-NEXT: RNDNE T1.Y, PV.Z, -; CM-NEXT: MUL_IEEE T1.Z, PV.Y, PV.X, -; CM-NEXT: SETGT * T1.W, literal.x, KC0[3].W, -; CM-NEXT: -1026650416(-1.032789e+02), 0(0.000000e+00) -; CM-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0, -; CM-NEXT: TRUNC T0.Y, T0.W, -; CM-NEXT: TRUNC T1.Z, PV.Y, -; CM-NEXT: ADD * T0.W, PV.X, T1.X, +; CM-NEXT: ALU clause starting at 104: +; CM-NEXT: ADD * T4.W, KC0[3].Z, -T3.X, +; CM-NEXT: MUL_IEEE T5.X, PV.W, literal.x, +; CM-NEXT: CNDE_INT T2.Y, T0.Z, T7.X, T2.Y, +; CM-NEXT: MUL_IEEE T1.Z, T1.Y, literal.y, +; CM-NEXT: CNDE_INT * T1.W, T2.W, T2.X, T1.W, BS:VEC_021/SCL_122 +; CM-NEXT: 967029397(3.122284e-04), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T0.X, T3.W, PV.W, T0.X, +; CM-NEXT: CNDE_INT T1.Y, T3.Y, T1.Y, PV.Z, +; CM-NEXT: LSHL T1.Z, PV.Y, literal.x, +; CM-NEXT: MULADD_IEEE * T1.W, T4.W, literal.y, PV.X, BS:VEC_120/SCL_212 +; CM-NEXT: 23(3.222986e-44), 1069064192(1.442383e+00) +; CM-NEXT: MULADD_IEEE T2.X, T3.X, literal.x, PV.W, +; CM-NEXT: ADD T2.Y, T0.W, -T4.X, +; CM-NEXT: ADD_INT T1.Z, PV.Z, literal.y, +; CM-NEXT: CNDE_INT * T0.W, T0.Z, PV.X, PV.Y, +; CM-NEXT: 967029397(3.122284e-04), 1065353216(1.000000e+00) +; CM-NEXT: AND_INT T0.X, KC0[4].X, literal.x, +; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z, +; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].W, +; CM-NEXT: ADD * T0.W, PV.Y, PV.X, +; CM-NEXT: -4096(nan), -1026650416(-1.032789e+02) ; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, ; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, ; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, ; CM-NEXT: EXP_IEEE * T0.W, T0.W, -; CM-NEXT: FLT_TO_INT T1.X, T1.Z, -; CM-NEXT: FLT_TO_INT T0.Y, T0.Y, -; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; CM-NEXT: ADD * T1.W, KC0[4].X, -T4.X, -; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x, -; CM-NEXT: MUL_IEEE T2.Y, T0.W, literal.y, -; CM-NEXT: 
MUL_IEEE T2.Z, PV.Z, literal.z, -; CM-NEXT: SETGT_UINT * T2.W, PV.Y, literal.w, -; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31) -; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T5.X, PV.W, T1.Z, PV.Z, -; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x, -; CM-NEXT: MULADD_IEEE T1.Z, T1.W, literal.y, PV.X, -; CM-NEXT: MAX_INT * T1.W, T1.X, literal.z, -; CM-NEXT: 209715200(1.972152e-31), 1069064192(1.442383e+00) -; CM-NEXT: -330(nan), 0(0.000000e+00) -; CM-NEXT: ADD_INT T2.X, PV.W, literal.x, -; CM-NEXT: ADD_INT T4.Y, T1.X, literal.y, -; CM-NEXT: MULADD_IEEE T1.Z, T4.X, literal.z, PV.Z, BS:VEC_120/SCL_212 -; CM-NEXT: MAX_INT * T1.W, T0.Y, literal.w, -; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; CM-NEXT: CNDE T2.X, T0.Z, T1.Y, 0.0, +; CM-NEXT: ADD T1.Y, KC0[4].X, -T0.X, +; CM-NEXT: FLT_TO_INT T0.Z, T0.Y, +; CM-NEXT: MUL_IEEE * T1.W, PV.W, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x, +; CM-NEXT: SETGT_UINT T0.Y, PV.Z, literal.y, +; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.z, +; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.w, +; CM-NEXT: 209715200(1.972152e-31), -229(nan) +; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00) +; CM-NEXT: RNDNE T4.X, PV.W, +; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.Z, +; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.W, +; CM-NEXT: SETGT_INT * T1.W, T0.Z, literal.y, +; CM-NEXT: 1069064192(1.442383e+00), -127(nan) +; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.W, +; CM-NEXT: MULADD_IEEE T1.Y, T0.X, literal.x, PV.Y, +; CM-NEXT: ADD T1.Z, T2.W, -PV.X, +; CM-NEXT: MAX_INT * T2.W, T0.Z, literal.y, ; CM-NEXT: 967029397(3.122284e-04), -330(nan) -; CM-NEXT: ADD T4.X, T0.Z, -T1.Y, -; CM-NEXT: ADD_INT T1.Y, PV.W, literal.x, -; CM-NEXT: ADD_INT T0.Z, T0.Y, literal.y, -; CM-NEXT: SETGT_UINT * T1.W, T0.Y, literal.z, +; CM-NEXT: ADD_INT T0.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T2.Y, T0.Z, literal.y, +; CM-NEXT: TRUNC T2.Z, T4.X, +; CM-NEXT: ADD * T2.W, 
PV.Z, PV.Y, ; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) -; CM-NEXT: -229(nan), 0(0.000000e+00) -; CM-NEXT: SETGT_UINT T6.X, T1.X, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT T0.Z, T0.Y, literal.y, -; CM-NEXT: ADD * T3.W, PV.X, T1.Z, -; CM-NEXT: -229(nan), -127(nan) -; CM-NEXT: EXP_IEEE T1.X (MASKED), T3.W, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), T3.W, -; CM-NEXT: EXP_IEEE T1.Z, T3.W, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), T3.W, -; CM-NEXT: CNDE_INT T4.X, T0.Z, T1.Y, T0.Y, -; CM-NEXT: CNDE_INT T1.Y, T6.X, T2.X, T4.Y, BS:VEC_120/SCL_212 -; CM-NEXT: SETGT_INT T2.Z, T1.X, literal.x, -; CM-NEXT: MUL_IEEE * T3.W, PV.Z, literal.y, -; CM-NEXT: -127(nan), 209715200(1.972152e-31) -; CM-NEXT: MUL_IEEE T2.X, T1.Z, literal.x, -; CM-NEXT: MUL_IEEE T4.Y, PV.W, literal.y, -; CM-NEXT: CNDE_INT T3.Z, PV.Z, PV.Y, T1.X, -; CM-NEXT: MIN_INT * T4.W, T1.X, literal.z, +; CM-NEXT: EXP_IEEE T1.X (MASKED), T2.W, +; CM-NEXT: EXP_IEEE T1.Y, T2.W, +; CM-NEXT: EXP_IEEE T1.Z (MASKED), T2.W, +; CM-NEXT: EXP_IEEE * T1.W (MASKED), T2.W, +; CM-NEXT: MUL_IEEE T4.X, T0.W, literal.x, +; CM-NEXT: FLT_TO_INT T3.Y, T2.Z, +; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.y, +; CM-NEXT: CNDE_INT * T0.W, T0.Y, T0.X, T2.Y, ; CM-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; CM-NEXT: CNDE_INT T0.X, T1.W, PV.W, T0.Z, +; CM-NEXT: MUL_IEEE T0.Y, PV.Z, literal.x, +; CM-NEXT: MAX_INT T2.Z, PV.Y, literal.y, +; CM-NEXT: MIN_INT * T0.W, PV.Y, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -330(nan) ; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; CM-NEXT: MIN_INT T7.X, T0.Y, literal.x, -; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y, -; CM-NEXT: ADD_INT T4.Z, T1.X, literal.z, -; CM-NEXT: SETGT_UINT * T4.W, T1.X, literal.w, -; CM-NEXT: 381(5.338947e-43), -254(nan) +; CM-NEXT: ADD_INT T5.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T2.Z, T3.Y, literal.z, +; CM-NEXT: SETGT_UINT * T0.W, T3.Y, literal.w, +; CM-NEXT: -254(nan), 204(2.858649e-43) +; CM-NEXT: 
102(1.429324e-43), -229(nan) +; CM-NEXT: ADD_INT T6.X, T3.Y, literal.x, +; CM-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y, +; CM-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z, +; CM-NEXT: SETGT_INT * T1.W, T3.Y, literal.x, ; CM-NEXT: -127(nan), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T8.X, PV.W, PV.Z, PV.Y, -; CM-NEXT: SETGT_INT T1.Y, T1.X, literal.x, -; CM-NEXT: ADD_INT T4.Z, PV.X, literal.y, -; CM-NEXT: ADD_INT * T5.W, T0.Y, literal.z, +; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x, +; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T3.Y, +; CM-NEXT: CNDE_INT T2.Z, PV.Y, PV.X, T5.X, +; CM-NEXT: MIN_INT * T2.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43) +; CM-NEXT: SETGT_INT T5.X, T3.Y, literal.x, +; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y, +; CM-NEXT: ADD_INT T3.Z, T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w, ; CM-NEXT: 127(1.779649e-43), -254(nan) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T1.X, T2.W, PV.W, PV.Z, -; CM-NEXT: CNDE_INT T5.Y, PV.Y, T3.Z, PV.X, -; CM-NEXT: CNDE_INT T3.Z, T6.X, T4.Y, T3.W, -; CM-NEXT: MUL_IEEE * T2.W, T2.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y, +; CM-NEXT: CNDE_INT T2.Y, PV.X, T2.Y, T2.Z, +; CM-NEXT: MUL_IEEE T2.Z, T7.X, literal.x, +; CM-NEXT: CNDE_INT * T0.W, T0.W, T0.Y, T1.Z, BS:VEC_021/SCL_122 ; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_INT T6.X, T0.Y, literal.x, -; CM-NEXT: CNDE_INT T0.Y, T4.W, T2.X, PV.W, -; CM-NEXT: CNDE_INT * T1.Z, T2.Z, PV.Z, T1.Z, -; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; CM-NEXT: ALU clause starting at 205: -; CM-NEXT: LSHL * T2.W, T5.Y, literal.x, -; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; CM-NEXT: ADD_INT T2.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT T0.Y, T1.Y, T1.Z, T0.Y, -; CM-NEXT: CNDE_INT * T1.Z, T6.X, T4.X, T1.X, +; CM-NEXT: SETGT_INT T8.X, T0.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.Y, +; CM-NEXT: CNDE_INT T0.Z, T4.Y, T7.X, PV.Z, +; 
CM-NEXT: LSHL * T0.W, PV.Y, literal.y, +; CM-NEXT: 127(1.779649e-43), 23(3.222986e-44) +; CM-NEXT: ALU clause starting at 202: +; CM-NEXT: ADD_INT T7.X, T0.W, literal.x, +; CM-NEXT: CNDE_INT * T0.Y, T5.X, T0.Y, T0.Z, ; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) -; CM-NEXT: CNDE_INT * T1.W, T1.W, T3.Y, T2.Y, -; CM-NEXT: CNDE_INT T1.X, T0.Z, PV.W, T0.W, -; CM-NEXT: LSHL T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T2.X, +; CM-NEXT: CNDE_INT * T0.Z, T8.X, T0.X, T6.X, +; CM-NEXT: MUL_IEEE * T0.W, T4.X, literal.x, +; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) +; CM-NEXT: CNDE_INT T0.X, T2.W, T4.X, PV.W, +; CM-NEXT: LSHL T1.Y, T0.Z, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T7.X, BS:VEC_021/SCL_122 ; CM-NEXT: SETGT * T0.W, literal.y, KC0[4].X, ; CM-NEXT: 23(3.222986e-44), -1026650416(-1.032789e+02) -; CM-NEXT: CNDE T2.X, PV.W, PV.Z, 0.0, +; CM-NEXT: CNDE T4.X, PV.W, PV.Z, 0.0, ; CM-NEXT: SETGT T0.Y, KC0[4].X, literal.x, ; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T6.X, PV.X, T5.X, +; CM-NEXT: CNDE_INT * T0.W, T8.X, T3.X, PV.X, ; CM-NEXT: 1118925336(8.872284e+01), 1065353216(1.000000e+00) -; CM-NEXT: SETGT T1.X, KC0[3].W, literal.x, +; CM-NEXT: SETGT T0.X, KC0[3].W, literal.x, ; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z, ; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z, ; CM-NEXT: CNDE * T0.W, PV.Y, PV.X, literal.z, ; CM-NEXT: 1118925336(8.872284e+01), -1026650416(-1.032789e+02) ; CM-NEXT: 2139095040(INF), 0(0.000000e+00) -; CM-NEXT: SETGT T2.X, literal.x, KC0[3].Y, +; CM-NEXT: SETGT T3.X, literal.x, KC0[3].Y, ; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, 0.0, -; CM-NEXT: CNDE T0.Z, PV.X, T3.X, literal.y, +; CM-NEXT: CNDE T0.Z, PV.X, T2.X, literal.y, ; CM-NEXT: SETGT * T1.W, KC0[3].Z, literal.z, ; CM-NEXT: -1026650416(-1.032789e+02), 2139095040(INF) ; CM-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00) ; CM-NEXT: CNDE T0.Y, PV.W, PV.Y, literal.x, -; CM-NEXT: CNDE T1.Z, PV.X, T0.X, 0.0, +; CM-NEXT: CNDE T1.Z, PV.X, T1.X, 
0.0, ; CM-NEXT: SETGT * T1.W, KC0[3].Y, literal.y, ; CM-NEXT: 2139095040(INF), 1118925336(8.872284e+01) ; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 544c1de6c7bb..a16294958748 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -230,23 +230,23 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z, ; R600-NEXT: -127(nan), 254(3.559298e-43) ; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x, -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y, +; R600-NEXT: MUL_IEEE T3.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T0.Y, T1.X, literal.y, ; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z, ; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X, ; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z, -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38) ; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00) ; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W, -; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W, -; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.x, +; R600-NEXT: MUL_IEEE T3.W, PV.Y, literal.x, +; R600-NEXT: CNDE_INT * T0.W, T0.W, PV.X, T2.W, ; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T1.Z, T1.Y, T3.X, PS, -; R600-NEXT: CNDE_INT T0.W, T1.W, PV.W, T1.X, +; R600-NEXT: CNDE_INT T1.Z, T1.W, PS, T1.X, +; R600-NEXT: CNDE_INT T0.W, T1.Y, T0.Y, PV.W, ; R600-NEXT: LSHL * T1.W, PV.Z, literal.x, ; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00) ; R600-NEXT: ADD_INT T1.W, PS, literal.x, -; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.W, PV.Z, +; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.Z, PV.W, ; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) ; R600-NEXT: MUL_IEEE T0.W, PS, PV.W, ; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].Z, @@ -260,65 +260,63 @@ define amdgpu_kernel void 
@s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; CM-LABEL: s_exp10_f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 64, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 62, @4, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 4: ; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, ; CM-NEXT: -4096(nan), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, ; CM-NEXT: ADD * T1.W, KC0[2].Z, -PV.W, -; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; CM-NEXT: RNDNE * T2.W, PV.Z, -; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; CM-NEXT: TRUNC T2.Z, PV.W, +; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, +; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y, +; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) +; CM-NEXT: RNDNE T1.Z, PV.W, ; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z, ; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W, -; CM-NEXT: ADD T0.Z, T0.Z, -T2.W, -; CM-NEXT: FLT_TO_INT * T0.W, PV.Z, +; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W, +; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212 ; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x, -; CM-NEXT: ADD * T1.W, PV.Z, PV.Y, +; CM-NEXT: TRUNC T1.Z, T1.Z, +; CM-NEXT: ADD * T0.W, PV.W, PV.Z, +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: FLT_TO_INT T0.Z, T1.Z, +; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y, +; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -330(nan) ; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; CM-NEXT: EXP_IEEE T0.X, T1.W, -; CM-NEXT: EXP_IEEE T0.Y (MASKED), 
T1.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x, -; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y, -; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z, -; CM-NEXT: 2130706432(1.701412e+38), -254(nan) -; CM-NEXT: -330(nan), 0(0.000000e+00) -; CM-NEXT: ADD_INT T1.X, T0.W, literal.x, -; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y, -; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w, -; CM-NEXT: -127(nan), 204(2.858649e-43) +; CM-NEXT: ADD_INT T1.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w, +; CM-NEXT: -254(nan), 204(2.858649e-43) ; CM-NEXT: 102(1.429324e-43), -229(nan) -; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y, -; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z, -; CM-NEXT: 254(3.559298e-43), -127(nan) -; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W, -; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z, -; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y, -; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43) +; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x, +; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y, +; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, +; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x, +; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z, +; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X, +; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43) ; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W, -; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x, +; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W, ; CM-NEXT: 
2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T0.Y, T2.X, T0.Y, PV.W, -; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.X, -; CM-NEXT: LSHL * T1.W, PV.Y, literal.x, +; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.W, T0.X, +; CM-NEXT: CNDE_INT T0.Z, T2.Y, T3.X, PV.Z, +; CM-NEXT: LSHL * T0.W, PV.Y, literal.x, ; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) ; CM-NEXT: ADD_INT T1.Z, PV.W, literal.x, -; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Z, PV.Y, +; CM-NEXT: CNDE_INT * T0.W, T3.W, PV.Y, PV.Z, ; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) ; CM-NEXT: MUL_IEEE T0.Z, PV.W, PV.Z, ; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, @@ -612,105 +610,105 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; R600-NEXT: AND_INT * T0.W, KC0[3].X, literal.x, ; R600-NEXT: -4096(nan), 0(0.000000e+00) ; R600-NEXT: ADD * T1.W, KC0[3].X, -PV.W, -; R600-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.y, -; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.z, -; R600-NEXT: -4096(nan), 975668412(6.390323e-04) -; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; R600-NEXT: RNDNE T1.Z, PS, +; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x, +; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y, +; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) +; R600-NEXT: RNDNE T0.Z, PS, ; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W, -; R600-NEXT: ADD * T2.W, KC0[2].W, -PV.Z, -; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: MUL_IEEE T2.Z, T0.Z, literal.y, +; R600-NEXT: AND_INT * T2.W, KC0[2].W, literal.y, +; R600-NEXT: 1079283712(3.321289e+00), -4096(nan) +; R600-NEXT: ADD T1.Z, KC0[2].W, -PS, ; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W, ; R600-NEXT: ADD * T1.W, T3.W, -PV.Z, +; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) +; R600-NEXT: ADD T2.Z, PS, PV.W, +; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x, +; R600-NEXT: MUL_IEEE * T1.W, T2.W, literal.y, ; 
R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) -; R600-NEXT: ADD T3.Z, PS, PV.W, -; R600-NEXT: RNDNE T0.W, PV.Z, -; R600-NEXT: MULADD_IEEE * T1.W, T2.W, literal.x, PV.Y, BS:VEC_021/SCL_122 -; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; R600-NEXT: TRUNC T0.Y, T1.Z, -; R600-NEXT: MULADD_IEEE T0.Z, T0.Z, literal.x, PS, BS:VEC_120/SCL_212 -; R600-NEXT: ADD T1.W, T2.Z, -PV.W, BS:VEC_201 +; R600-NEXT: RNDNE T0.Y, PS, +; R600-NEXT: MULADD_IEEE T1.Z, T1.Z, literal.x, PV.W, +; R600-NEXT: TRUNC T0.W, T0.Z, BS:VEC_120/SCL_212 ; R600-NEXT: EXP_IEEE * T0.X, PV.Z, -; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; R600-NEXT: ADD T0.Z, PV.W, PV.Z, -; R600-NEXT: FLT_TO_INT T1.W, PV.Y, -; R600-NEXT: MUL_IEEE * T2.W, PS, literal.x, -; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T1.Z, PS, literal.x, -; R600-NEXT: SETGT_UINT T3.W, PV.W, literal.y, -; R600-NEXT: EXP_IEEE * T0.Y, PV.Z, -; R600-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43) -; R600-NEXT: CNDE_INT T1.X, PV.W, T2.W, PV.Z, -; R600-NEXT: MUL_IEEE T1.Y, PS, literal.x, -; R600-NEXT: MAX_INT T0.Z, T1.W, literal.y, -; R600-NEXT: MIN_INT T2.W, T1.W, literal.z, -; R600-NEXT: TRUNC * T0.W, T0.W, +; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) +; R600-NEXT: FLT_TO_INT T1.Y, PV.W, +; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x, +; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.y, PV.Z, +; R600-NEXT: ADD * T1.W, T1.W, -PV.Y, +; R600-NEXT: 209715200(1.972152e-31), 975668412(6.390323e-04) +; R600-NEXT: ADD T1.Z, PS, PV.W, +; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x, +; R600-NEXT: SETGT_UINT * T1.W, PV.Y, literal.y, +; R600-NEXT: 209715200(1.972152e-31), -229(nan) +; R600-NEXT: CNDE_INT T0.Z, PS, PV.W, T0.Z, +; R600-NEXT: SETGT_INT T0.W, T1.Y, literal.x, +; R600-NEXT: EXP_IEEE * T1.X, PV.Z, +; R600-NEXT: -127(nan), 0(0.000000e+00) +; R600-NEXT: CNDE_INT T0.Z, PV.W, PV.Z, T0.X, +; R600-NEXT: MAX_INT T2.W, T1.Y, literal.x, +; R600-NEXT: MUL_IEEE * T3.W, PS, literal.y, +; 
R600-NEXT: -330(nan), 209715200(1.972152e-31) +; R600-NEXT: MUL_IEEE T2.X, PS, literal.x, +; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y, +; R600-NEXT: ADD_INT T1.Z, T1.Y, literal.z, +; R600-NEXT: MIN_INT T2.W, T1.Y, literal.w, +; R600-NEXT: TRUNC * T4.W, T0.Y, +; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43) +; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43) +; R600-NEXT: FLT_TO_INT T3.X, PS, +; R600-NEXT: ADD_INT T0.Y, PV.W, literal.x, +; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y, +; R600-NEXT: SETGT_UINT T2.W, T1.Y, literal.z, +; R600-NEXT: CNDE_INT * T1.W, T1.W, PV.Y, PV.Z, +; R600-NEXT: -254(nan), -127(nan) +; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T4.X, T1.X, literal.x, +; R600-NEXT: MUL_IEEE T2.Y, T0.X, literal.x, BS:VEC_120/SCL_212 +; R600-NEXT: CNDE_INT T1.Z, T0.W, PS, T1.Y, +; R600-NEXT: CNDE_INT T0.W, PV.W, PV.Z, PV.Y, +; R600-NEXT: MAX_INT * T1.W, PV.X, literal.y, ; R600-NEXT: 2130706432(1.701412e+38), -330(nan) -; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; R600-NEXT: FLT_TO_INT T2.X, PS, -; R600-NEXT: ADD_INT T2.Y, PV.W, literal.x, -; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y, -; R600-NEXT: ADD_INT T0.W, T1.W, literal.z, -; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.w, -; R600-NEXT: -254(nan), 204(2.858649e-43) -; R600-NEXT: 102(1.429324e-43), -229(nan) -; R600-NEXT: ADD_INT T3.X, T1.W, literal.x, -; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W, -; R600-NEXT: SETGT_INT T0.Z, T1.W, literal.x, -; R600-NEXT: MUL_IEEE T0.W, T0.X, literal.y, -; R600-NEXT: MUL_IEEE * T4.W, T0.Y, literal.y, -; R600-NEXT: -127(nan), 209715200(1.972152e-31) -; R600-NEXT: MUL_IEEE T4.X, PS, literal.x, -; R600-NEXT: MUL_IEEE T4.Y, PV.W, literal.x, -; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, T1.W, -; R600-NEXT: CNDE_INT T3.W, T3.W, PV.X, T2.Y, -; R600-NEXT: MAX_INT * T5.W, T2.X, literal.y, -; R600-NEXT: 209715200(1.972152e-31), -330(nan) -; R600-NEXT: SETGT_INT T3.X, T1.W, literal.x, -; R600-NEXT: ADD_INT T2.Y, PS, literal.y, -; R600-NEXT: ADD_INT 
T2.Z, T2.X, literal.z, -; R600-NEXT: SETGT_UINT * T1.W, T2.X, literal.w, +; R600-NEXT: SETGT_INT T0.X, T1.Y, literal.x, +; R600-NEXT: ADD_INT T0.Y, PS, literal.y, +; R600-NEXT: ADD_INT T2.Z, T3.X, literal.z, +; R600-NEXT: SETGT_UINT * T1.W, T3.X, literal.w, ; R600-NEXT: 127(1.779649e-43), 204(2.858649e-43) ; R600-NEXT: 102(1.429324e-43), -229(nan) -; R600-NEXT: MIN_INT * T5.W, T2.X, literal.x, +; R600-NEXT: MIN_INT * T4.W, T3.X, literal.x, ; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00) ; R600-NEXT: ADD_INT T5.X, PV.W, literal.x, -; R600-NEXT: ADD_INT T3.Y, T2.X, literal.y, -; R600-NEXT: SETGT_UINT T3.Z, T2.X, literal.z, -; R600-NEXT: CNDE_INT T5.W, T1.W, T2.Y, T2.Z, -; R600-NEXT: SETGT_INT * T6.W, T2.X, literal.y, +; R600-NEXT: ADD_INT T1.Y, T3.X, literal.y, +; R600-NEXT: SETGT_UINT T3.Z, T3.X, literal.z, +; R600-NEXT: CNDE_INT T4.W, T1.W, T0.Y, T2.Z, +; R600-NEXT: SETGT_INT * T5.W, T3.X, literal.y, ; R600-NEXT: -254(nan), -127(nan) ; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T2.X, -; R600-NEXT: CNDE_INT T2.Y, PV.Z, PV.Y, PV.X, -; R600-NEXT: SETGT_INT T2.Z, T2.X, literal.x, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE_INT T3.W, T3.X, T1.Z, T3.W, BS:VEC_021/SCL_122 -; R600-NEXT: CNDE_INT * T0.W, T2.W, T4.Y, T0.W, -; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T0.X, T0.Z, PS, T0.X, -; R600-NEXT: LSHL T3.Y, PV.W, literal.x, -; R600-NEXT: CNDE_INT T0.Z, PV.Z, PV.X, PV.Y, -; R600-NEXT: CNDE_INT T0.W, T1.W, T4.X, T4.W, -; R600-NEXT: MUL_IEEE * T1.W, T1.Y, literal.y, +; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T3.X, +; R600-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, PV.X, +; R600-NEXT: SETGT_INT T2.Z, T3.X, literal.x, +; R600-NEXT: CNDE_INT T0.W, T0.X, T1.Z, T0.W, BS:VEC_120/SCL_212 +; R600-NEXT: MUL_IEEE * T4.W, T2.Y, literal.y, +; R600-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38) +; R600-NEXT: CNDE_INT T3.X, T2.W, T2.Y, PS, BS:VEC_120/SCL_212 +; R600-NEXT: LSHL T1.Y, PV.W, literal.x, +; R600-NEXT: CNDE_INT T1.Z, PV.Z, 
PV.X, PV.Y, +; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y, +; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W, ; R600-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38) -; R600-NEXT: CNDE_INT T2.X, T3.Z, T1.Y, PS, -; R600-NEXT: CNDE_INT T0.Y, T6.W, PV.W, T0.Y, -; R600-NEXT: LSHL T0.Z, PV.Z, literal.x, +; R600-NEXT: CNDE_INT T1.X, T5.W, PS, T1.X, BS:VEC_021/SCL_122 +; R600-NEXT: CNDE_INT T0.Y, T3.Z, T4.X, PV.W, BS:VEC_201 +; R600-NEXT: LSHL T1.Z, PV.Z, literal.x, ; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y, -; R600-NEXT: CNDE_INT * T1.W, T3.X, PV.X, T1.X, +; R600-NEXT: CNDE_INT * T1.W, T0.X, T0.Z, PV.X, ; R600-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00) ; R600-NEXT: MUL_IEEE T1.Y, PS, PV.W, -; R600-NEXT: SETGT T1.Z, literal.x, KC0[3].X, +; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].X, ; R600-NEXT: ADD_INT * T0.W, PV.Z, literal.y, ; R600-NEXT: -1036817932(-4.485347e+01), 1065353216(1.000000e+00) ; R600-NEXT: ALU clause starting at 101: -; R600-NEXT: CNDE_INT * T1.W, T2.Z, T0.Y, T2.X, +; R600-NEXT: CNDE_INT * T1.W, T2.Z, T1.X, T0.Y, ; R600-NEXT: MUL_IEEE T0.Y, PV.W, T0.W, -; R600-NEXT: SETGT T0.Z, literal.x, KC0[2].W, -; R600-NEXT: CNDE T0.W, T1.Z, T1.Y, 0.0, +; R600-NEXT: SETGT T1.Z, literal.x, KC0[2].W, +; R600-NEXT: CNDE T0.W, T0.Z, T1.Y, 0.0, ; R600-NEXT: SETGT * T1.W, KC0[3].X, literal.y, ; R600-NEXT: -1036817932(-4.485347e+01), 1109008539(3.853184e+01) ; R600-NEXT: CNDE T1.Y, PS, PV.W, literal.x, @@ -723,118 +721,116 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; CM-LABEL: s_exp10_v2f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 100, @4, KC0[CB0:0-32], KC1[] -; CM-NEXT: ALU 18, @105, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 98, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 18, @103, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: ALU clause starting at 4: ; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x, ; CM-NEXT: -4096(nan), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, ; CM-NEXT: 
ADD * T1.W, KC0[2].W, -PV.W, +; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y, +; CM-NEXT: AND_INT * T2.W, KC0[3].X, literal.z, +; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) +; CM-NEXT: -4096(nan), 0(0.000000e+00) +; CM-NEXT: ADD T1.Y, KC0[3].X, -PV.W, +; CM-NEXT: RNDNE T1.Z, PV.Z, +; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Y, ; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; CM-NEXT: RNDNE * T2.W, PV.Z, -; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; CM-NEXT: TRUNC T0.Y, PV.W, -; CM-NEXT: AND_INT T2.Z, KC0[3].X, literal.x, -; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.y, PV.Z, -; CM-NEXT: -4096(nan), 1079283712(3.321289e+00) ; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W, -; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y, -; CM-NEXT: FLT_TO_INT T1.Z, PV.Y, -; CM-NEXT: ADD * T0.W, KC0[3].X, -PV.Z, +; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z, +; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212 ; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) -; CM-NEXT: ADD T1.X, T0.Z, -T2.W, -; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, -; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y, -; CM-NEXT: RNDNE * T1.W, PV.Y, -; CM-NEXT: 975668412(6.390323e-04), -330(nan) -; CM-NEXT: TRUNC T2.X, PV.W, -; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x, -; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.Y, -; CM-NEXT: ADD * T0.W, PV.X, T0.X, -; CM-NEXT: 204(2.858649e-43), 1079283712(3.321289e+00) -; CM-NEXT: EXP_IEEE T0.X, T0.W, -; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, -; CM-NEXT: ADD_INT T1.X, T1.Z, literal.x, -; CM-NEXT: MULADD_IEEE T0.Y, T2.Z, literal.y, T0.Z, BS:VEC_102/SCL_221 -; CM-NEXT: ADD T0.Z, T1.Y, -T1.W, -; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z, -; CM-NEXT: 102(1.429324e-43), 975668412(6.390323e-04) -; CM-NEXT: 
2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_UINT T3.X, T1.Z, literal.x, -; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y, -; CM-NEXT: SETGT_UINT T2.Z, T1.Z, literal.z, -; CM-NEXT: ADD * T1.W, PV.Z, PV.Y, -; CM-NEXT: -229(nan), 2130706432(1.701412e+38) -; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; CM-NEXT: TRUNC T1.X, T1.Z, +; CM-NEXT: RNDNE T2.Y, PV.W, +; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z, +; CM-NEXT: ADD * T1.W, PV.Y, PV.X, +; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T1.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z, +; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212 +; CM-NEXT: FLT_TO_INT T0.Z, T1.X, +; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y, +; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31) +; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x, +; CM-NEXT: SETGT_UINT T1.Y, PV.Z, literal.y, +; CM-NEXT: TRUNC T1.Z, T2.Y, +; CM-NEXT: ADD * T1.W, PV.Y, PV.X, +; CM-NEXT: 209715200(1.972152e-31), -229(nan) ; CM-NEXT: EXP_IEEE T0.X (MASKED), T1.W, ; CM-NEXT: EXP_IEEE T0.Y, T1.W, ; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, ; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: CNDE_INT T4.X, T2.Z, T0.W, T1.Y, -; CM-NEXT: CNDE_INT T1.Y, T3.X, T2.Y, T1.X, -; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212 -; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x, -; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_INT T1.X, T1.Z, literal.x, -; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y, -; CM-NEXT: MUL_IEEE T3.Z, PV.W, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w, -; CM-NEXT: -127(nan), 209715200(1.972152e-31) -; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T2.X, PV.W, T0.W, PV.Z, +; CM-NEXT: FLT_TO_INT T2.X, T1.Z, +; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x, +; CM-NEXT: CNDE_INT T1.Z, T1.Y, T1.X, T0.W, +; CM-NEXT: 
SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: 209715200(1.972152e-31), -127(nan) +; CM-NEXT: CNDE_INT T1.X, PV.W, PV.Z, T0.X, ; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x, -; CM-NEXT: CNDE_INT T3.Z, PV.X, T1.Y, T1.Z, -; CM-NEXT: MAX_INT * T0.W, T0.Z, literal.y, -; CM-NEXT: 209715200(1.972152e-31), -330(nan) -; CM-NEXT: ADD_INT T5.X, PV.W, literal.x, -; CM-NEXT: ADD_INT T1.Y, T0.Z, literal.y, -; CM-NEXT: SETGT_UINT T4.Z, T0.Z, literal.z, -; CM-NEXT: MUL_IEEE * T0.W, T0.Y, literal.w, +; CM-NEXT: SETGT_UINT T1.Z, PV.X, literal.y, +; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -229(nan) +; CM-NEXT: -330(nan), 0(0.000000e+00) +; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T4.Y, T0.Z, literal.y, +; CM-NEXT: CNDE_INT T2.Z, PV.Z, PV.Y, T2.Y, +; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z, ; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) -; CM-NEXT: -229(nan), 209715200(1.972152e-31) -; CM-NEXT: MUL_IEEE T6.X, PV.W, literal.x, -; CM-NEXT: MIN_INT T4.Y, T0.Z, literal.y, -; CM-NEXT: CNDE_INT T5.Z, PV.Z, PV.X, PV.Y, -; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.z, -; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z, -; CM-NEXT: MIN_INT T1.Y, T1.Z, literal.x, -; CM-NEXT: ADD_INT T5.Z, PV.Y, literal.y, -; CM-NEXT: ADD_INT * T3.W, T0.Z, literal.z, BS:VEC_120/SCL_212 -; CM-NEXT: 381(5.338947e-43), -254(nan) ; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T7.X, T1.W, PV.W, PV.Z, -; CM-NEXT: SETGT_INT T4.Y, T0.Z, literal.x, -; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y, -; CM-NEXT: ADD_INT * T1.W, T1.Z, literal.z, BS:VEC_120/SCL_212 +; CM-NEXT: CNDE_INT T4.X, PV.W, PV.Z, T0.Y, +; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.x, +; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, PV.Y, +; CM-NEXT: 2130706432(1.701412e+38), -330(nan) +; CM-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.Z, +; 
CM-NEXT: ADD_INT T1.Y, PV.Z, literal.x, +; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y, +; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z, +; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) +; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y, +; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z, +; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z, +; CM-NEXT: -254(nan), -127(nan) +; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T5.X, T0.Y, literal.x, +; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T2.X, +; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X, +; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43) +; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x, +; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y, +; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w, ; CM-NEXT: 127(1.779649e-43), -254(nan) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T8.X, T2.Z, PV.W, PV.Z, -; CM-NEXT: SETGT_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: CNDE_INT T0.Z, PV.Y, T5.X, PV.X, -; CM-NEXT: CNDE_INT * T0.W, T4.Z, T6.X, T0.W, BS:VEC_201 -; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T5.X, T2.W, PV.W, T0.Y, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, PV.Y, +; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Z, PV.X, T0.Y, T1.Z, +; CM-NEXT: MUL_IEEE * T1.W, T5.X, literal.y, +; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T5.X, T3.Z, T5.X, PV.W, ; CM-NEXT: LSHL T0.Y, PV.Z, literal.x, -; CM-NEXT: CNDE_INT T0.Z, PV.Y, T3.Z, PV.X, -; CM-NEXT: CNDE_INT * T0.W, T3.X, T3.Y, T2.Y, BS:VEC_201 -; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T0.X, T1.X, PV.W, T0.X, +; CM-NEXT: CNDE_INT T0.Z, PV.Y, T0.X, PV.X, BS:VEC_021/SCL_122 +; CM-NEXT: MUL_IEEE * T1.W, T2.Y, literal.y, +; CM-NEXT: 23(3.222986e-44), 
2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T0.X, T0.W, T2.Y, PV.W, ; CM-NEXT: LSHL T2.Y, PV.Z, literal.x, ; CM-NEXT: ADD_INT * T0.Z, PV.Y, literal.y, ; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00) -; CM-NEXT: ALU clause starting at 105: -; CM-NEXT: CNDE_INT * T0.W, T4.Y, T5.X, T2.X, -; CM-NEXT: MUL_IEEE T1.X, PV.W, T0.Z, +; CM-NEXT: ALU clause starting at 103: +; CM-NEXT: CNDE_INT * T0.W, T2.X, T4.X, T5.X, +; CM-NEXT: MUL_IEEE T2.X, PV.W, T0.Z, ; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].X, ; CM-NEXT: ADD_INT T0.Z, T2.Y, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T1.Y, T0.X, T4.X, BS:VEC_120/SCL_212 +; CM-NEXT: CNDE_INT * T0.W, T1.Y, T1.X, T0.X, BS:VEC_120/SCL_212 ; CM-NEXT: -1036817932(-4.485347e+01), 1065353216(1.000000e+00) ; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, ; CM-NEXT: SETGT T1.Y, literal.x, KC0[2].W, @@ -1217,8 +1213,8 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; R600-LABEL: s_exp10_v3f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 100, @6, KC0[CB0:0-32], KC1[] -; R600-NEXT: ALU 69, @107, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 69, @106, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END @@ -1226,69 +1222,68 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; R600-NEXT: ALU clause starting at 6: ; R600-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x, ; R600-NEXT: -4096(nan), 0(0.000000e+00) -; R600-NEXT: ADD T1.W, KC0[3].Y, -PV.W, -; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x, +; R600-NEXT: MUL_IEEE T1.W, PV.W, literal.x, +; R600-NEXT: ADD * T2.W, KC0[3].Y, -PV.W, ; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; R600-NEXT: RNDNE T3.W, PS, -; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x, +; R600-NEXT: RNDNE * T3.W, PV.W, +; R600-NEXT: TRUNC T4.W, PV.W, +; R600-NEXT: MUL_IEEE * T5.W, T2.W, literal.x, ; R600-NEXT: 975668412(6.390323e-04), 
0(0.000000e+00) -; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS, -; R600-NEXT: TRUNC * T4.W, PV.W, +; R600-NEXT: MULADD_IEEE T2.W, T2.W, literal.x, PS, +; R600-NEXT: FLT_TO_INT * T4.W, PV.W, ; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; R600-NEXT: FLT_TO_INT T0.Z, PS, -; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W, -; R600-NEXT: ADD * T1.W, T2.W, -T3.W, -; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; R600-NEXT: ADD T0.W, PS, PV.W, -; R600-NEXT: MAX_INT * T1.W, PV.Z, literal.x, -; R600-NEXT: -330(nan), 0(0.000000e+00) -; R600-NEXT: ADD_INT T0.Y, PS, literal.x, -; R600-NEXT: ADD_INT T1.Z, T0.Z, literal.y, -; R600-NEXT: SETGT_UINT T1.W, T0.Z, literal.z, -; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: MAX_INT T0.Z, PS, literal.x, +; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.W, +; R600-NEXT: ADD * T1.W, T1.W, -T3.W, +; R600-NEXT: -330(nan), 975668412(6.390323e-04) +; R600-NEXT: ADD T0.Y, PS, PV.W, +; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.x, +; R600-NEXT: ADD_INT T0.W, T4.W, literal.y, +; R600-NEXT: SETGT_UINT * T1.W, T4.W, literal.z, ; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43) ; R600-NEXT: -229(nan), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, -; R600-NEXT: SETGT_INT T0.W, T0.Z, literal.x, -; R600-NEXT: MUL_IEEE * T2.W, PS, literal.y, -; R600-NEXT: -127(nan), 209715200(1.972152e-31) -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x, -; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z, -; R600-NEXT: MIN_INT T3.W, T0.Z, literal.y, -; R600-NEXT: AND_INT * T4.W, KC0[3].W, literal.z, -; R600-NEXT: 209715200(1.972152e-31), 381(5.338947e-43) -; R600-NEXT: -4096(nan), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T1.X, T0.X, literal.x, -; R600-NEXT: ADD T1.Y, KC0[3].W, -PS, -; R600-NEXT: ADD_INT T2.Z, PV.W, literal.y, -; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z, -; R600-NEXT: SETGT_UINT * T5.W, T0.Z, literal.w, -; R600-NEXT: 2130706432(1.701412e+38), -254(nan) +; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W, +; R600-NEXT: SETGT_INT 
T0.W, T4.W, literal.x, +; R600-NEXT: EXP_IEEE * T0.X, PV.Y, +; R600-NEXT: -127(nan), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T1.X, PS, literal.x, +; R600-NEXT: CNDE_INT T0.Y, PV.W, PV.Z, T4.W, +; R600-NEXT: MIN_INT T0.Z, T4.W, literal.y, +; R600-NEXT: AND_INT T2.W, KC0[3].W, literal.z, +; R600-NEXT: MUL_IEEE * T3.W, PS, literal.w, +; R600-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43) +; R600-NEXT: -4096(nan), 209715200(1.972152e-31) +; R600-NEXT: MUL_IEEE T2.X, PS, literal.x, +; R600-NEXT: ADD T1.Y, KC0[3].W, -PV.W, +; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y, +; R600-NEXT: ADD_INT T5.W, T4.W, literal.z, +; R600-NEXT: SETGT_UINT * T6.W, T4.W, literal.w, +; R600-NEXT: 209715200(1.972152e-31), -254(nan) ; R600-NEXT: -127(nan), 254(3.559298e-43) -; R600-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Z, -; R600-NEXT: SETGT_INT T2.Y, T0.Z, literal.x, +; R600-NEXT: CNDE_INT T3.X, PS, PV.W, PV.Z, +; R600-NEXT: SETGT_INT T2.Y, T4.W, literal.x, ; R600-NEXT: MUL_IEEE T0.Z, PV.Y, literal.y, -; R600-NEXT: MUL_IEEE T3.W, T4.W, literal.z, -; R600-NEXT: MUL_IEEE * T6.W, PV.X, literal.w, +; R600-NEXT: MUL_IEEE * T4.W, T2.W, literal.z, BS:VEC_120/SCL_212 ; R600-NEXT: 127(1.779649e-43), 975668412(6.390323e-04) -; R600-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38) -; R600-NEXT: CNDE_INT T1.X, T5.W, T1.X, PS, BS:VEC_120/SCL_212 -; R600-NEXT: RNDNE T3.Y, PV.W, -; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z, -; R600-NEXT: CNDE_INT T5.W, PV.Y, T1.Z, PV.X, -; R600-NEXT: CNDE_INT * T1.W, T1.W, T0.Y, T2.W, ; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T0.X, T0.W, PS, T0.X, +; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W, +; R600-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.X, BS:VEC_021/SCL_122 +; R600-NEXT: RNDNE T3.Y, T4.W, BS:VEC_120/SCL_212 +; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, T0.Z, +; R600-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T3.X, BS:VEC_120/SCL_212 +; R600-NEXT: MUL_IEEE * T1.W, T1.X, literal.y, +; R600-NEXT: 1079283712(3.321289e+00), 
2130706432(1.701412e+38) +; R600-NEXT: CNDE_INT T1.X, T6.W, T1.X, PS, ; R600-NEXT: LSHL T0.Y, PV.W, literal.x, ; R600-NEXT: AND_INT T1.Z, KC0[3].Z, literal.y, -; R600-NEXT: MULADD_IEEE T0.W, T4.W, literal.z, PV.Z, BS:VEC_120/SCL_212 -; R600-NEXT: ADD * T1.W, T3.W, -PV.Y, +; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.z, PV.Z, BS:VEC_120/SCL_212 +; R600-NEXT: ADD * T1.W, T4.W, -PV.Y, ; R600-NEXT: 23(3.222986e-44), -4096(nan) ; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) ; R600-NEXT: ADD T1.Y, PS, PV.W, ; R600-NEXT: MUL_IEEE T0.Z, PV.Z, literal.x, ; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y, -; R600-NEXT: CNDE_INT * T1.W, T2.Y, PV.X, T1.X, +; R600-NEXT: CNDE_INT * T1.W, T2.Y, T0.X, PV.X, ; R600-NEXT: 1079283712(3.321289e+00), 1065353216(1.000000e+00) ; R600-NEXT: MUL_IEEE T0.X, PS, PV.W, ; R600-NEXT: ADD T0.Y, KC0[3].Z, -T1.Z, @@ -1302,12 +1297,12 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; R600-NEXT: MUL_IEEE * T1.W, PS, literal.z, ; R600-NEXT: -1036817932(-4.485347e+01), 975668412(6.390323e-04) ; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x, -; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y, +; R600-NEXT: MUL_IEEE T3.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T2.Y, T1.X, literal.y, ; R600-NEXT: MULADD_IEEE T4.Z, T0.Y, literal.z, PV.W, ; R600-NEXT: FLT_TO_INT T0.W, PV.Z, ; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.w, -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38) ; R600-NEXT: 1079283712(3.321289e+00), 381(5.338947e-43) ; R600-NEXT: ADD_INT T4.X, PS, literal.x, ; R600-NEXT: MAX_INT T0.Y, PV.W, literal.y, @@ -1325,7 +1320,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; R600-NEXT: 102(1.429324e-43), -229(nan) ; R600-NEXT: ADD_INT * T6.X, T0.W, literal.x, ; R600-NEXT: -127(nan), 0(0.000000e+00) -; R600-NEXT: ALU clause starting at 107: +; R600-NEXT: ALU clause 
starting at 106: ; R600-NEXT: SETGT_UINT T0.Y, T0.W, literal.x, ; R600-NEXT: CNDE_INT T0.Z, T3.W, T0.Z, T2.W, BS:VEC_102/SCL_221 ; R600-NEXT: SETGT_INT T2.W, T0.W, literal.y, @@ -1341,25 +1336,25 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; R600-NEXT: SETGT_UINT T5.X, T1.Y, literal.x, ; R600-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W, ; R600-NEXT: MAX_INT T0.Z, T1.Y, literal.y, -; R600-NEXT: MUL_IEEE T4.W, T1.Z, literal.z, -; R600-NEXT: MUL_IEEE * T5.W, PV.Y, literal.w, +; R600-NEXT: MUL_IEEE T4.W, PV.Y, literal.z, +; R600-NEXT: MUL_IEEE * T5.W, T1.Z, literal.w, ; R600-NEXT: 254(3.559298e-43), -330(nan) -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) -; R600-NEXT: CNDE_INT T6.X, T3.W, PS, T3.Y, BS:VEC_021/SCL_122 -; R600-NEXT: MUL_IEEE T3.Y, PV.W, literal.x, +; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38) +; R600-NEXT: MUL_IEEE T6.X, PS, literal.x, +; R600-NEXT: CNDE_INT T3.Y, T3.W, PV.W, T3.Y, BS:VEC_021/SCL_122 ; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y, ; R600-NEXT: ADD_INT T3.W, T1.Y, literal.z, -; R600-NEXT: SETGT_UINT * T5.W, T1.Y, literal.w, +; R600-NEXT: SETGT_UINT * T4.W, T1.Y, literal.w, ; R600-NEXT: 2130706432(1.701412e+38), 204(2.858649e-43) ; R600-NEXT: 102(1.429324e-43), -229(nan) ; R600-NEXT: CNDE_INT T8.X, PS, PV.Z, PV.W, ; R600-NEXT: SETGT_INT T5.Y, T1.Y, literal.x, -; R600-NEXT: CNDE_INT T0.Z, T0.Y, T4.W, PV.Y, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE_INT T2.W, T2.W, PV.X, T1.Z, +; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, T1.Z, +; R600-NEXT: CNDE_INT T2.W, T0.Y, T5.W, PV.X, BS:VEC_120/SCL_212 ; R600-NEXT: LSHL * T3.W, T4.Y, literal.y, ; R600-NEXT: -127(nan), 23(3.222986e-44) ; R600-NEXT: ADD_INT T6.X, PS, literal.x, -; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.W, PV.Z, +; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.Z, PV.W, ; R600-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T1.Y, ; R600-NEXT: CNDE_INT T0.W, T5.X, T7.X, T4.X, ; R600-NEXT: SETGT_INT * T2.W, T1.Y, literal.y, @@ -1367,18 +1362,18 @@ define 
amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; R600-NEXT: CNDE_INT T4.X, PS, PV.Z, PV.W, ; R600-NEXT: MUL_IEEE T0.Y, PV.Y, PV.X, ; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].Z, -; R600-NEXT: CNDE_INT T0.W, T5.W, T2.Y, T1.W, -; R600-NEXT: MUL_IEEE * T1.W, T3.X, literal.y, +; R600-NEXT: MUL_IEEE T0.W, T2.Y, literal.y, +; R600-NEXT: CNDE_INT * T1.W, T4.W, T3.X, T1.W, ; R600-NEXT: -1036817932(-4.485347e+01), 2130706432(1.701412e+38) -; R600-NEXT: CNDE_INT T3.X, T5.X, T3.X, PS, -; R600-NEXT: CNDE_INT T1.Y, T5.Y, PV.W, T1.X, +; R600-NEXT: CNDE_INT T1.X, T5.Y, PS, T1.X, +; R600-NEXT: CNDE_INT T1.Y, T5.X, T2.Y, PV.W, ; R600-NEXT: CNDE T0.Z, PV.Z, PV.Y, 0.0, ; R600-NEXT: SETGT T0.W, KC0[3].Z, literal.x, ; R600-NEXT: LSHL * T1.W, PV.X, literal.y, ; R600-NEXT: 1109008539(3.853184e+01), 23(3.222986e-44) -; R600-NEXT: ADD_INT T1.X, PS, literal.x, +; R600-NEXT: ADD_INT T3.X, PS, literal.x, ; R600-NEXT: CNDE T0.Y, PV.W, PV.Z, literal.y, -; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, PV.X, +; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.X, PV.Y, ; R600-NEXT: CNDE T0.W, T2.X, T0.X, 0.0, ; R600-NEXT: SETGT * T1.W, KC0[3].Y, literal.z, ; R600-NEXT: 1065353216(1.000000e+00), 2139095040(INF) @@ -1399,197 +1394,193 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; CM-LABEL: s_exp10_v3f32: ; CM: ; %bb.0: -; CM-NEXT: ALU 102, @6, KC0[CB0:0-32], KC1[] -; CM-NEXT: ALU 80, @109, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X +; CM-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 77, @108, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T3.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: ALU clause starting at 6: ; CM-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x, ; CM-NEXT: -4096(nan), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, ; CM-NEXT: ADD * T1.W, KC0[3].Y, -PV.W, -; CM-NEXT: 
1079283712(3.321289e+00), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; CM-NEXT: RNDNE * T2.W, PV.Z, -; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; CM-NEXT: TRUNC T2.Z, PV.W, +; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x, +; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y, +; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) +; CM-NEXT: RNDNE T1.Z, PV.W, ; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z, ; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W, -; CM-NEXT: ADD T0.Z, T0.Z, -T2.W, -; CM-NEXT: FLT_TO_INT * T0.W, PV.Z, +; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W, +; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212 ; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x, -; CM-NEXT: ADD * T1.W, PV.Z, PV.Y, +; CM-NEXT: TRUNC T1.Z, T1.Z, +; CM-NEXT: ADD * T0.W, PV.W, PV.Z, +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: FLT_TO_INT T0.Z, T1.Z, +; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x, +; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y, +; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -330(nan) ; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; CM-NEXT: EXP_IEEE T0.X, T1.W, -; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x, -; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y, -; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z, -; CM-NEXT: 2130706432(1.701412e+38), -254(nan) -; CM-NEXT: -330(nan), 0(0.000000e+00) -; CM-NEXT: ADD_INT T1.X, T0.W, literal.x, -; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y, -; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w, 
-; CM-NEXT: -127(nan), 204(2.858649e-43) +; CM-NEXT: ADD_INT T1.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w, +; CM-NEXT: -254(nan), 204(2.858649e-43) ; CM-NEXT: 102(1.429324e-43), -229(nan) -; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y, -; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z, -; CM-NEXT: 254(3.559298e-43), -127(nan) -; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W, -; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z, -; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y, -; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43) +; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x, +; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y, +; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, +; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x, +; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z, +; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X, +; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43) ; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W, -; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x, +; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W, ; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T1.X, T2.X, T0.Y, PV.W, -; CM-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, T0.X, +; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X, +; CM-NEXT: CNDE_INT T0.Y, T2.Y, T3.X, PV.Z, ; CM-NEXT: LSHL T0.Z, PV.Y, literal.x, -; CM-NEXT: AND_INT * T1.W, KC0[3].Z, literal.y, +; CM-NEXT: AND_INT * T0.W, KC0[3].Z, literal.y, ; CM-NEXT: 23(3.222986e-44), -4096(nan) -; CM-NEXT: MUL_IEEE T0.X, PV.W, literal.x, ; CM-NEXT: ADD T1.Y, KC0[3].Z, -PV.W, -; CM-NEXT: 
ADD_INT T0.Z, PV.Z, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Y, PV.X, -; CM-NEXT: 1079283712(3.321289e+00), 1065353216(1.000000e+00) -; CM-NEXT: MUL_IEEE T0.Y, PV.W, PV.Z, -; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x, -; CM-NEXT: RNDNE * T0.W, PV.X, -; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) +; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.x, +; CM-NEXT: CNDE_INT * T1.W, T3.W, PV.X, PV.Y, +; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, +; CM-NEXT: MUL_IEEE T0.Y, PV.Y, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y, +; CM-NEXT: AND_INT * T1.W, KC0[3].W, literal.z, +; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) +; CM-NEXT: -4096(nan), 0(0.000000e+00) ; CM-NEXT: SETGT T1.X, literal.x, KC0[3].Y, -; CM-NEXT: TRUNC T2.Y, PV.W, -; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.y, -; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.z, PV.Z, -; CM-NEXT: -1036817932(-4.485347e+01), -4096(nan) -; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, PV.W, -; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y, -; CM-NEXT: FLT_TO_INT T0.Z, PV.Y, -; CM-NEXT: ADD * T1.W, KC0[3].W, -PV.Z, +; CM-NEXT: ADD T2.Y, KC0[3].W, -PV.W, +; CM-NEXT: RNDNE T1.Z, PV.Z, +; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.y, PV.Y, +; CM-NEXT: -1036817932(-4.485347e+01), 1079283712(3.321289e+00) +; CM-NEXT: MULADD_IEEE T2.X, T0.W, literal.x, PV.W, +; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z, +; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, T1.W, literal.y, BS:VEC_120/SCL_212 ; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) -; CM-NEXT: ADD T0.X, T0.X, -T0.W, -; CM-NEXT: MUL_IEEE T2.Y, PV.W, literal.x, -; CM-NEXT: MAX_INT T2.Z, PV.Z, literal.y, -; CM-NEXT: RNDNE * T0.W, PV.Y, -; CM-NEXT: 975668412(6.390323e-04), -330(nan) -; CM-NEXT: TRUNC T3.X, PV.W, -; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.x, -; CM-NEXT: MULADD_IEEE T2.Z, T1.W, literal.y, PV.Y, -; CM-NEXT: ADD * T1.W, PV.X, T2.X, -; 
CM-NEXT: 204(2.858649e-43), 1079283712(3.321289e+00) -; CM-NEXT: EXP_IEEE T0.X, T1.W, -; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x, -; CM-NEXT: MULADD_IEEE T2.Y, T1.Z, literal.y, T2.Z, BS:VEC_102/SCL_221 -; CM-NEXT: ADD T1.Z, T1.Y, -T0.W, -; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z, -; CM-NEXT: 102(1.429324e-43), 975668412(6.390323e-04) -; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_UINT T4.X, T0.Z, literal.x, -; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y, -; CM-NEXT: SETGT_UINT T2.Z, T0.Z, literal.z, -; CM-NEXT: ADD * T1.W, PV.Z, PV.Y, -; CM-NEXT: -229(nan), 2130706432(1.701412e+38) -; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; CM-NEXT: TRUNC T3.X, T1.Z, +; CM-NEXT: RNDNE T1.Y, PV.W, +; CM-NEXT: MULADD_IEEE T0.Z, T2.Y, literal.x, PV.Z, +; CM-NEXT: ADD * T2.W, PV.Y, PV.X, +; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X (MASKED), T2.W, +; CM-NEXT: EXP_IEEE T0.Y, T2.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T2.W, +; CM-NEXT: EXP_IEEE * T0.W (MASKED), T2.W, +; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, T0.Z, +; CM-NEXT: ADD T2.Y, T0.W, -T1.Y, BS:VEC_120/SCL_212 +; CM-NEXT: FLT_TO_INT T0.Z, T3.X, +; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.y, +; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31) +; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x, +; CM-NEXT: SETGT_UINT T3.Y, PV.Z, literal.y, +; CM-NEXT: TRUNC T1.Z, T1.Y, +; CM-NEXT: ADD * T1.W, PV.Y, PV.X, +; CM-NEXT: 209715200(1.972152e-31), -229(nan) ; CM-NEXT: EXP_IEEE T1.X (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), T1.W, -; CM-NEXT: EXP_IEEE T1.Z, T1.W, +; CM-NEXT: EXP_IEEE T1.Y, T1.W, +; CM-NEXT: EXP_IEEE T1.Z (MASKED), T1.W, ; CM-NEXT: EXP_IEEE * T1.W (MASKED), T1.W, -; CM-NEXT: ALU clause starting at 109: -; CM-NEXT: CNDE_INT T5.X, T2.Z, T0.W, T1.Y, -; CM-NEXT: CNDE_INT T1.Y, T4.X, T3.Y, T2.X, -; CM-NEXT: FLT_TO_INT T3.Z, 
T3.X, BS:VEC_120/SCL_212 -; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_INT T2.X, T0.Z, literal.x, -; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y, -; CM-NEXT: MUL_IEEE T4.Z, PV.W, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w, -; CM-NEXT: -127(nan), 209715200(1.972152e-31) -; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T3.X, PV.W, T0.W, PV.Z, -; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x, -; CM-NEXT: CNDE_INT T4.Z, PV.X, T1.Y, T0.Z, -; CM-NEXT: MAX_INT * T0.W, T3.Z, literal.y, -; CM-NEXT: 209715200(1.972152e-31), -330(nan) -; CM-NEXT: ADD_INT T6.X, PV.W, literal.x, -; CM-NEXT: ADD_INT T1.Y, T3.Z, literal.y, -; CM-NEXT: SETGT_UINT T5.Z, T3.Z, literal.z, -; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.w, BS:VEC_120/SCL_212 +; CM-NEXT: FLT_TO_INT T2.X, T1.Z, +; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x, +; CM-NEXT: CNDE_INT T1.Z, T3.Y, T3.X, T0.W, +; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: 209715200(1.972152e-31), -127(nan) +; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.Y, +; CM-NEXT: MUL_IEEE * T4.Y, PV.Y, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: ALU clause starting at 108: +; CM-NEXT: SETGT_UINT T1.Z, T2.X, literal.x, +; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.y, +; CM-NEXT: -229(nan), -330(nan) +; CM-NEXT: ADD_INT T4.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T5.Y, T0.Z, literal.y, +; CM-NEXT: CNDE_INT T2.Z, PV.Z, T4.Y, T2.Y, +; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z, ; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) -; CM-NEXT: -229(nan), 209715200(1.972152e-31) -; CM-NEXT: MUL_IEEE T7.X, PV.W, literal.x, -; CM-NEXT: MIN_INT T4.Y, T3.Z, literal.y, -; CM-NEXT: CNDE_INT T6.Z, PV.Z, PV.X, PV.Y, -; CM-NEXT: SETGT_INT * T2.W, T3.Z, literal.z, -; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43) ; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, T3.Z, -; CM-NEXT: 
MIN_INT T1.Y, T0.Z, literal.x, -; CM-NEXT: ADD_INT T6.Z, PV.Y, literal.y, -; CM-NEXT: ADD_INT * T3.W, T3.Z, literal.z, BS:VEC_120/SCL_212 -; CM-NEXT: 381(5.338947e-43), -254(nan) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T8.X, T1.W, PV.W, PV.Z, -; CM-NEXT: SETGT_INT T4.Y, T3.Z, literal.x, -; CM-NEXT: ADD_INT T3.Z, PV.Y, literal.y, -; CM-NEXT: ADD_INT * T1.W, T0.Z, literal.z, BS:VEC_120/SCL_212 +; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T1.Y, +; CM-NEXT: MUL_IEEE T0.Y, T0.Y, literal.x, +; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y, +; CM-NEXT: CNDE_INT * T2.W, T3.Y, PV.X, PV.Y, BS:VEC_120/SCL_212 +; CM-NEXT: 2130706432(1.701412e+38), -330(nan) +; CM-NEXT: CNDE_INT T4.X, T0.W, PV.W, T0.Z, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x, +; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y, +; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z, +; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) +; CM-NEXT: ADD_INT T6.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y, +; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z, +; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z, +; CM-NEXT: -254(nan), -127(nan) +; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x, +; CM-NEXT: CNDE_INT T1.Y, T1.W, PV.W, T2.X, +; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X, +; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43) +; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x, +; CM-NEXT: ADD_INT T2.Y, PV.W, literal.y, +; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w, ; CM-NEXT: 127(1.779649e-43), -254(nan) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T9.X, T2.Z, PV.W, PV.Z, -; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X, -; CM-NEXT: CNDE_INT * T0.W, T5.Z, T7.X, T0.W, BS:VEC_201 -; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T6.X, T2.W, PV.W, T1.Z, -; 
CM-NEXT: LSHL T5.Y, PV.Z, literal.x, -; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.Z, PV.X, -; CM-NEXT: CNDE_INT * T0.W, T4.X, T3.Y, T2.Y, -; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T0.X, T2.X, PV.W, T0.X, -; CM-NEXT: LSHL T2.Y, PV.Z, literal.x, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y, +; CM-NEXT: SETGT_INT T2.Y, T0.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.Y, T1.Z, +; CM-NEXT: MUL_IEEE * T1.W, T7.X, literal.y, +; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T7.X, T3.Z, T7.X, PV.W, +; CM-NEXT: LSHL T1.Y, PV.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.X, PV.X, BS:VEC_021/SCL_122 +; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.y, +; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T4.X, T0.W, T0.Y, PV.W, +; CM-NEXT: LSHL T0.Y, PV.Z, literal.x, ; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T4.Y, PV.X, T3.X, BS:VEC_021/SCL_122 +; CM-NEXT: CNDE_INT * T0.W, T2.X, T5.X, PV.X, ; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00) ; CM-NEXT: MUL_IEEE T2.X, PV.W, PV.Z, -; CM-NEXT: SETGT T3.Y, literal.x, KC0[3].W, +; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].W, ; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T1.Y, PV.X, T5.X, +; CM-NEXT: CNDE_INT * T0.W, T2.Y, T3.X, PV.X, ; CM-NEXT: -1036817932(-4.485347e+01), 1065353216(1.000000e+00) -; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, -; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].Z, +; CM-NEXT: MUL_IEEE T3.X, PV.W, PV.Z, +; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].Z, ; CM-NEXT: CNDE T0.Z, PV.Y, PV.X, 0.0, ; CM-NEXT: SETGT * T0.W, KC0[3].W, literal.y, ; CM-NEXT: -1036817932(-4.485347e+01), 1109008539(3.853184e+01) ; CM-NEXT: CNDE T2.X, PV.W, PV.Z, literal.x, -; CM-NEXT: CNDE T1.Y, PV.Y, PV.X, 0.0, +; CM-NEXT: CNDE T0.Y, PV.Y, PV.X, 0.0, ; CM-NEXT: SETGT T0.Z, KC0[3].Z, literal.y, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; CM-NEXT: 2139095040(INF), 1109008539(3.853184e+01) ; 
CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T0.X, PV.W, literal.x, -; CM-NEXT: CNDE T1.Y, PV.Z, PV.Y, literal.y, -; CM-NEXT: CNDE T0.Z, T1.X, T0.Y, 0.0, +; CM-NEXT: LSHR T3.X, PV.W, literal.x, +; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, literal.y, +; CM-NEXT: CNDE T0.Z, T1.X, T0.X, 0.0, ; CM-NEXT: SETGT * T0.W, KC0[3].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 2139095040(INF) ; CM-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00) -; CM-NEXT: CNDE * T1.X, PV.W, PV.Z, literal.x, +; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x, ; CM-NEXT: 2139095040(INF), 0(0.000000e+00) -; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call <3 x float> @llvm.exp10.v3f32(<3 x float> %in) store <3 x float> %result, ptr addrspace(1) %out @@ -2052,227 +2043,224 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; R600-LABEL: s_exp10_v4f32: ; R600: ; %bb.0: ; R600-NEXT: ALU 98, @6, KC0[CB0:0-32], KC1[] -; R600-NEXT: ALU 98, @105, KC0[CB0:0-32], KC1[] -; R600-NEXT: ALU 24, @204, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 95, @105, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 24, @201, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 6: ; R600-NEXT: AND_INT * T0.W, KC0[3].Z, literal.x, ; R600-NEXT: -4096(nan), 0(0.000000e+00) -; R600-NEXT: ADD T1.W, KC0[3].Z, -PV.W, -; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x, +; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.W, +; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x, +; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y, +; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) +; R600-NEXT: RNDNE T4.W, PS, +; R600-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.W, BS:VEC_021/SCL_122 ; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; R600-NEXT: RNDNE T3.W, PS, -; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x, +; R600-NEXT: MULADD_IEEE 
T0.W, T0.W, literal.x, PS, +; R600-NEXT: ADD * T1.W, T3.W, -PV.W, ; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS, -; R600-NEXT: TRUNC * T4.W, PV.W, -; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; R600-NEXT: FLT_TO_INT T0.Z, PS, -; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W, -; R600-NEXT: ADD * T1.W, T2.W, -T3.W, -; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; R600-NEXT: ADD T1.Z, PS, PV.W, -; R600-NEXT: MAX_INT T0.W, PV.Z, literal.x, -; R600-NEXT: MIN_INT * T1.W, PV.Z, literal.y, -; R600-NEXT: -330(nan), 381(5.338947e-43) -; R600-NEXT: ADD_INT T0.X, PS, literal.x, -; R600-NEXT: ADD_INT T0.Y, PV.W, literal.y, -; R600-NEXT: ADD_INT T2.Z, T0.Z, literal.z, -; R600-NEXT: SETGT_UINT T0.W, T0.Z, literal.w, -; R600-NEXT: EXP_IEEE * T1.X, PV.Z, -; R600-NEXT: -254(nan), 204(2.858649e-43) -; R600-NEXT: 102(1.429324e-43), -229(nan) -; R600-NEXT: ADD_INT T2.X, T0.Z, literal.x, -; R600-NEXT: SETGT_UINT T1.Y, T0.Z, literal.y, -; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, -; R600-NEXT: SETGT_INT T1.W, T0.Z, literal.x, -; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z, -; R600-NEXT: -127(nan), 254(3.559298e-43) -; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x, -; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y, -; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z, -; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X, -; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z, -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) -; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; R600-NEXT: AND_INT T2.Y, KC0[4].X, literal.x, -; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W, -; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W, -; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.y, -; R600-NEXT: -4096(nan), 2130706432(1.701412e+38) -; R600-NEXT: CNDE_INT T0.X, T1.Y, T3.X, PS, -; R600-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.X, -; R600-NEXT: LSHL T0.Z, PV.Z, literal.x, -; R600-NEXT: ADD T0.W, 
KC0[4].X, -PV.Y, -; R600-NEXT: MUL_IEEE * T1.W, PV.Y, literal.y, -; R600-NEXT: 23(3.222986e-44), 1079283712(3.321289e+00) -; R600-NEXT: RNDNE T1.Y, PS, -; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; R600-NEXT: ADD_INT T2.W, PV.Z, literal.y, -; R600-NEXT: CNDE_INT * T3.W, T4.W, PV.Y, PV.X, -; R600-NEXT: 975668412(6.390323e-04), 1065353216(1.000000e+00) -; R600-NEXT: MUL_IEEE T0.Y, PS, PV.W, -; R600-NEXT: AND_INT T0.Z, KC0[3].W, literal.x, -; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.Z, -; R600-NEXT: TRUNC * T2.W, PV.Y, -; R600-NEXT: -4096(nan), 1079283712(3.321289e+00) -; R600-NEXT: SETGT T0.X, literal.x, KC0[3].Z, -; R600-NEXT: FLT_TO_INT T3.Y, PS, -; R600-NEXT: MULADD_IEEE T1.Z, T2.Y, literal.y, PV.W, -; R600-NEXT: ADD T0.W, T1.W, -T1.Y, -; R600-NEXT: MUL_IEEE * T1.W, PV.Z, literal.z, -; R600-NEXT: -1036817932(-4.485347e+01), 975668412(6.390323e-04) -; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) -; R600-NEXT: RNDNE T1.X, PS, -; R600-NEXT: AND_INT T1.Y, KC0[3].Y, literal.x, -; R600-NEXT: ADD T1.Z, PV.W, PV.Z, -; R600-NEXT: MAX_INT T0.W, PV.Y, literal.y, -; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.z, -; R600-NEXT: -4096(nan), -330(nan) +; R600-NEXT: ADD T0.W, PS, PV.W, +; R600-NEXT: TRUNC * T1.W, T4.W, +; R600-NEXT: FLT_TO_INT T1.W, PS, +; R600-NEXT: EXP_IEEE * T0.X, PV.W, +; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x, +; R600-NEXT: MAX_INT T0.W, PV.W, literal.y, +; R600-NEXT: MIN_INT * T2.W, PV.W, literal.z, +; R600-NEXT: 209715200(1.972152e-31), -330(nan) ; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; R600-NEXT: ADD_INT T2.X, PS, literal.x, -; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y, -; R600-NEXT: ADD_INT T2.Z, T3.Y, literal.z, -; R600-NEXT: SETGT_UINT T0.W, T3.Y, literal.w, -; R600-NEXT: EXP_IEEE * T1.Z, PV.Z, -; R600-NEXT: -254(nan), 204(2.858649e-43) -; R600-NEXT: 102(1.429324e-43), -229(nan) -; R600-NEXT: ADD_INT T3.X, T3.Y, literal.x, -; R600-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y, -; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z, -; 
R600-NEXT: SETGT_INT T2.W, T3.Y, literal.x, -; R600-NEXT: MUL_IEEE * T3.W, PS, literal.z, +; R600-NEXT: ADD_INT T1.X, PS, literal.x, +; R600-NEXT: AND_INT T0.Y, KC0[4].X, literal.y, +; R600-NEXT: ADD_INT T1.Z, PV.W, literal.z, +; R600-NEXT: ADD_INT * T0.W, T1.W, literal.w, +; R600-NEXT: -254(nan), -4096(nan) +; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.x, +; R600-NEXT: -229(nan), 0(0.000000e+00) +; R600-NEXT: ADD_INT T2.X, T1.W, literal.x, +; R600-NEXT: SETGT_UINT T1.Y, T1.W, literal.y, +; R600-NEXT: CNDE_INT T1.Z, PV.W, T1.Z, T0.W, +; R600-NEXT: SETGT_INT T0.W, T1.W, literal.x, +; R600-NEXT: ADD * T3.W, KC0[4].X, -T0.Y, ; R600-NEXT: -127(nan), 254(3.559298e-43) -; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T4.X, T1.Z, literal.x, -; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y, -; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Z, T3.Y, -; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T2.X, -; R600-NEXT: SETGT_INT * T5.W, T3.Y, literal.z, -; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; R600-NEXT: MUL_IEEE T3.X, PS, literal.x, +; R600-NEXT: MUL_IEEE T2.Y, T0.Y, literal.y, +; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T1.W, +; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T1.X, +; R600-NEXT: SETGT_INT * T1.W, T1.W, literal.z, +; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) ; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; R600-NEXT: ADD T2.X, KC0[3].W, -T0.Z, -; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W, -; R600-NEXT: CNDE_INT * T2.Z, T0.W, PV.Y, T3.W, -; R600-NEXT: ALU clause starting at 105: -; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.x, -; R600-NEXT: ADD * T3.W, KC0[3].Y, -T1.Y, +; R600-NEXT: CNDE_INT T1.X, PS, PV.Z, PV.W, +; R600-NEXT: RNDNE T3.Y, PV.Y, +; R600-NEXT: MULADD_IEEE T1.Z, T3.W, literal.x, PV.X, +; R600-NEXT: MUL_IEEE T3.W, T0.Z, literal.y, +; R600-NEXT: MUL_IEEE * T4.W, T0.X, literal.z, +; R600-NEXT: 1079283712(3.321289e+00), 209715200(1.972152e-31) ; R600-NEXT: 
2130706432(1.701412e+38), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T2.X, PS, literal.x, +; R600-NEXT: CNDE_INT T4.Y, T2.W, PV.W, T0.Z, +; R600-NEXT: MULADD_IEEE T0.Z, T0.Y, literal.y, PV.Z, +; R600-NEXT: ADD T2.W, T2.Y, -PV.Y, BS:VEC_120/SCL_212 +; R600-NEXT: AND_INT * T3.W, KC0[3].Y, literal.z, +; R600-NEXT: 2130706432(1.701412e+38), 975668412(6.390323e-04) +; R600-NEXT: -4096(nan), 0(0.000000e+00) ; R600-NEXT: MUL_IEEE T3.X, PS, literal.x, -; R600-NEXT: MUL_IEEE T2.Y, T1.Y, literal.y, -; R600-NEXT: CNDE_INT T3.Z, T4.Y, T4.X, PV.W, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE_INT T0.W, T2.W, T2.Z, T1.Z, -; R600-NEXT: LSHL * T2.W, T3.Y, literal.z, -; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) -; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; R600-NEXT: ADD_INT T4.X, PS, literal.x, -; R600-NEXT: CNDE_INT T3.Y, T5.W, PV.W, PV.Z, -; R600-NEXT: RNDNE T1.Z, PV.Y, -; R600-NEXT: MULADD_IEEE T0.W, T3.W, literal.y, PV.X, BS:VEC_120/SCL_212 -; R600-NEXT: MUL_IEEE * T2.W, T2.X, literal.z, +; R600-NEXT: ADD T0.Y, PV.W, PV.Z, +; R600-NEXT: CNDE_INT T0.Z, T0.W, PV.Y, T0.X, BS:VEC_021/SCL_122 +; R600-NEXT: CNDE_INT T0.W, T1.Y, T4.W, PV.X, +; R600-NEXT: LSHL * T2.W, T1.X, literal.y, +; R600-NEXT: 1079283712(3.321289e+00), 23(3.222986e-44) +; R600-NEXT: AND_INT T0.X, KC0[3].W, literal.x, +; R600-NEXT: TRUNC T1.Y, T3.Y, +; R600-NEXT: ADD_INT T1.Z, PS, literal.y, +; R600-NEXT: CNDE_INT T0.W, T1.W, PV.Z, PV.W, +; R600-NEXT: EXP_IEEE * T0.Y, PV.Y, +; R600-NEXT: -4096(nan), 1065353216(1.000000e+00) +; R600-NEXT: MUL_IEEE T1.X, PV.W, PV.Z, +; R600-NEXT: FLT_TO_INT T1.Y, PV.Y, +; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x, +; R600-NEXT: ADD T0.W, KC0[3].W, -PV.X, +; R600-NEXT: RNDNE * T1.W, T3.X, +; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; R600-NEXT: SETGT T2.X, literal.x, KC0[3].Z, +; R600-NEXT: TRUNC T2.Y, PS, +; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.y, +; R600-NEXT: MUL_IEEE T2.W, PV.Z, literal.z, +; R600-NEXT: MAX_INT * T4.W, PV.Y, literal.w, +; R600-NEXT: 
-1036817932(-4.485347e+01), 975668412(6.390323e-04) +; R600-NEXT: 209715200(1.972152e-31), -330(nan) +; R600-NEXT: ADD T4.X, KC0[3].Y, -T3.W, +; R600-NEXT: ADD_INT T3.Y, PS, literal.x, +; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y, +; R600-NEXT: SETGT_UINT T4.W, T1.Y, literal.z, +; R600-NEXT: MIN_INT * T5.W, T1.Y, literal.w, +; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; R600-NEXT: -229(nan), 381(5.338947e-43) +; R600-NEXT: ADD_INT T5.X, PS, literal.x, +; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y, +; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z, +; R600-NEXT: CNDE_INT T5.W, PV.W, PV.Y, PV.Z, +; R600-NEXT: SETGT_INT * T6.W, T1.Y, literal.y, +; R600-NEXT: -254(nan), -127(nan) +; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T6.X, T0.Y, literal.x, +; R600-NEXT: CNDE_INT T3.Y, PS, PV.W, T1.Y, +; R600-NEXT: CNDE_INT * T2.Z, PV.Z, PV.Y, PV.X, +; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) +; R600-NEXT: ALU clause starting at 105: +; R600-NEXT: SETGT_INT T5.W, T1.Y, literal.x, +; R600-NEXT: MUL_IEEE * T7.W, T4.X, literal.y, +; R600-NEXT: 127(1.779649e-43), 975668412(6.390323e-04) +; R600-NEXT: MUL_IEEE T5.X, T0.X, literal.x, +; R600-NEXT: MULADD_IEEE T1.Y, T4.X, literal.x, PS, BS:VEC_120/SCL_212 +; R600-NEXT: CNDE_INT T2.Z, PV.W, T3.Y, T2.Z, +; R600-NEXT: MUL_IEEE T7.W, T6.X, literal.y, BS:VEC_201 +; R600-NEXT: CNDE_INT * T2.W, T4.W, T2.W, T0.Z, +; R600-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38) +; R600-NEXT: CNDE_INT T4.X, T6.W, PS, T0.Y, +; R600-NEXT: CNDE_INT T0.Y, T3.Z, T6.X, PV.W, +; R600-NEXT: LSHL T0.Z, PV.Z, literal.x, +; R600-NEXT: MULADD_IEEE T2.W, T3.W, literal.y, PV.Y, BS:VEC_201 +; R600-NEXT: ADD * T1.W, T3.X, -T1.W, +; R600-NEXT: 23(3.222986e-44), 975668412(6.390323e-04) +; R600-NEXT: ADD T3.X, PS, PV.W, +; R600-NEXT: ADD_INT T1.Y, PV.Z, literal.x, +; R600-NEXT: CNDE_INT T0.Z, T5.W, PV.X, PV.Y, +; R600-NEXT: RNDNE T1.W, T5.X, +; R600-NEXT: MULADD_IEEE * T0.W, T0.W, literal.y, T1.Z, BS:VEC_021/SCL_122 ; 
R600-NEXT: 1065353216(1.000000e+00), 1079283712(3.321289e+00) -; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; R600-NEXT: MULADD_IEEE T2.X, T2.X, literal.x, PS, -; R600-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.y, PV.W, -; R600-NEXT: ADD T2.Z, T2.Y, -PV.Z, BS:VEC_120/SCL_212 -; R600-NEXT: MUL_IEEE T0.W, PV.Y, PV.X, -; R600-NEXT: SETGT * T2.W, literal.z, KC0[4].X, -; R600-NEXT: 1079283712(3.321289e+00), 975668412(6.390323e-04) -; R600-NEXT: -1036817932(-4.485347e+01), 0(0.000000e+00) -; R600-NEXT: CNDE T3.X, PS, PV.W, 0.0, -; R600-NEXT: ADD T1.Y, PV.Z, PV.Y, -; R600-NEXT: TRUNC T1.Z, T1.Z, -; R600-NEXT: MULADD_IEEE T0.W, T0.Z, literal.x, PV.X, BS:VEC_120/SCL_212 -; R600-NEXT: ADD * T1.W, T1.W, -T1.X, -; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00) -; R600-NEXT: SETGT T2.X, KC0[4].X, literal.x, -; R600-NEXT: ADD T2.Y, PS, PV.W, -; R600-NEXT: FLT_TO_INT T0.Z, PV.Z, -; R600-NEXT: TRUNC T0.W, T1.X, -; R600-NEXT: EXP_IEEE * T1.X, PV.Y, -; R600-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T4.X, PS, literal.x, -; R600-NEXT: FLT_TO_INT T1.Y, PV.W, -; R600-NEXT: MAX_INT T1.Z, PV.Z, literal.y, -; R600-NEXT: MUL_IEEE T0.W, PS, literal.z, -; R600-NEXT: EXP_IEEE * T1.W, PV.Y, -; R600-NEXT: 2130706432(1.701412e+38), -330(nan) +; R600-NEXT: MULADD_IEEE T0.X, T0.X, literal.x, PS, +; R600-NEXT: ADD T0.Y, T5.X, -PV.W, BS:VEC_120/SCL_212 +; R600-NEXT: MUL_IEEE T0.Z, PV.Z, PV.Y, +; R600-NEXT: SETGT T0.W, literal.y, KC0[4].X, +; R600-NEXT: EXP_IEEE * T1.Y, PV.X, +; R600-NEXT: 975668412(6.390323e-04), -1036817932(-4.485347e+01) +; R600-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0, +; R600-NEXT: ADD T0.Y, PV.Y, PV.X, +; R600-NEXT: FLT_TO_INT T0.Z, T2.Y, +; R600-NEXT: TRUNC T0.W, T1.W, +; R600-NEXT: MUL_IEEE * T1.W, PS, literal.x, ; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T5.X, PV.W, literal.x, -; R600-NEXT: MUL_IEEE T2.Y, PS, literal.x, -; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y, -; R600-NEXT: ADD_INT T2.W, T0.Z, literal.z, -; 
R600-NEXT: MAX_INT * T3.W, PV.Y, literal.w, -; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43) -; R600-NEXT: 102(1.429324e-43), -330(nan) -; R600-NEXT: SETGT_UINT T6.X, T0.Z, literal.x, -; R600-NEXT: ADD_INT T3.Y, PS, literal.y, -; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.z, -; R600-NEXT: SETGT_UINT T3.W, T1.Y, literal.x, -; R600-NEXT: MIN_INT * T4.W, T1.Y, literal.w, +; R600-NEXT: SETGT T0.X, KC0[4].X, literal.x, +; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y, +; R600-NEXT: FLT_TO_INT T1.Z, PV.W, +; R600-NEXT: MAX_INT T0.W, PV.Z, literal.z, +; R600-NEXT: EXP_IEEE * T0.Y, PV.Y, +; R600-NEXT: 1109008539(3.853184e+01), 209715200(1.972152e-31) +; R600-NEXT: -330(nan), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T4.X, T1.Y, literal.x, +; R600-NEXT: MUL_IEEE T3.Y, PS, literal.y, +; R600-NEXT: ADD_INT T2.Z, PV.W, literal.z, +; R600-NEXT: ADD_INT * T0.W, T0.Z, literal.w, +; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; R600-NEXT: MAX_INT * T2.W, T1.Z, literal.x, +; R600-NEXT: -330(nan), 0(0.000000e+00) +; R600-NEXT: SETGT_UINT T5.X, T0.Z, literal.x, +; R600-NEXT: ADD_INT T4.Y, PV.W, literal.y, +; R600-NEXT: ADD_INT T3.Z, T1.Z, literal.z, BS:VEC_120/SCL_212 +; R600-NEXT: SETGT_UINT T2.W, T1.Z, literal.x, BS:VEC_120/SCL_212 +; R600-NEXT: MIN_INT * T3.W, T1.Z, literal.w, ; R600-NEXT: -229(nan), 204(2.858649e-43) ; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43) -; R600-NEXT: ADD_INT T7.X, PS, literal.x, -; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y, -; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z, -; R600-NEXT: CNDE_INT T4.W, PV.W, PV.Y, PV.Z, -; R600-NEXT: SETGT_INT * T5.W, T1.Y, literal.y, +; R600-NEXT: ADD_INT T6.X, PS, literal.x, +; R600-NEXT: ADD_INT T5.Y, T1.Z, literal.y, +; R600-NEXT: SETGT_UINT T4.Z, T1.Z, literal.z, +; R600-NEXT: CNDE_INT T3.W, PV.W, PV.Y, PV.Z, +; R600-NEXT: SETGT_INT * T4.W, T1.Z, literal.y, ; R600-NEXT: -254(nan), -127(nan) ; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00) -; R600-NEXT: 
CNDE_INT T8.X, PS, PV.W, T1.Y, -; R600-NEXT: CNDE_INT T3.Y, PV.Z, PV.Y, PV.X, -; R600-NEXT: SETGT_INT T2.Z, T1.Y, literal.x, -; R600-NEXT: CNDE_INT T2.W, T6.X, T1.Z, T2.W, -; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.y, +; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T1.Z, BS:VEC_021/SCL_122 +; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.Y, PV.X, +; R600-NEXT: SETGT_INT T1.Z, T1.Z, literal.x, BS:VEC_120/SCL_212 +; R600-NEXT: CNDE_INT T0.W, T5.X, T2.Z, T0.W, BS:VEC_102/SCL_221 +; R600-NEXT: SETGT_INT * T3.W, T0.Z, literal.y, ; R600-NEXT: 127(1.779649e-43), -127(nan) -; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T0.Z, -; R600-NEXT: CNDE_INT T1.Y, PV.Z, PV.X, PV.Y, -; R600-NEXT: MIN_INT T1.Z, T0.Z, literal.x, -; R600-NEXT: MUL_IEEE T2.W, T1.W, literal.y, -; R600-NEXT: MUL_IEEE * T6.W, T2.Y, literal.z, -; R600-NEXT: 381(5.338947e-43), 2130706432(1.701412e+38) -; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T8.X, T3.W, PS, T2.Y, -; R600-NEXT: MUL_IEEE T2.Y, PV.W, literal.x, -; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y, -; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z, -; R600-NEXT: SETGT_UINT * T6.W, T0.Z, literal.w, +; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T0.Z, +; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.X, PV.Y, +; R600-NEXT: MIN_INT T2.Z, T0.Z, literal.x, +; R600-NEXT: MUL_IEEE T0.W, T3.Y, literal.y, +; R600-NEXT: MUL_IEEE * T5.W, T0.Y, literal.z, +; R600-NEXT: 381(5.338947e-43), 209715200(1.972152e-31) +; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE T7.X, PS, literal.x, +; R600-NEXT: CNDE_INT T3.Y, T2.W, PV.W, T3.Y, +; R600-NEXT: ADD_INT T2.Z, PV.Z, literal.y, +; R600-NEXT: ADD_INT T0.W, T0.Z, literal.z, +; R600-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w, ; R600-NEXT: 2130706432(1.701412e+38), -254(nan) ; R600-NEXT: -127(nan), 254(3.559298e-43) -; R600-NEXT: CNDE_INT T9.X, PS, PV.W, PV.Z, -; R600-NEXT: SETGT_INT T3.Y, T0.Z, literal.x, -; R600-NEXT: CNDE_INT T0.Z, T3.Z, T2.W, PV.Y, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE_INT T1.W, T5.W, PV.X, 
T1.W, BS:VEC_021/SCL_122 -; R600-NEXT: LSHL * T2.W, T1.Y, literal.y, +; R600-NEXT: CNDE_INT T8.X, PS, PV.W, PV.Z, +; R600-NEXT: SETGT_INT T5.Y, T0.Z, literal.x, +; R600-NEXT: CNDE_INT T0.Z, T4.W, PV.Y, T0.Y, BS:VEC_021/SCL_122 +; R600-NEXT: CNDE_INT T0.W, T4.Z, T5.W, PV.X, BS:VEC_120/SCL_212 +; R600-NEXT: LSHL * T4.W, T4.Y, literal.y, ; R600-NEXT: 127(1.779649e-43), 23(3.222986e-44) -; R600-NEXT: ADD_INT T8.X, PS, literal.x, -; R600-NEXT: CNDE_INT T1.Y, T2.Z, PV.W, PV.Z, -; R600-NEXT: CNDE_INT T0.Z, PV.Y, T7.X, PV.X, -; R600-NEXT: CNDE_INT * T0.W, T6.X, T5.X, T0.W, BS:VEC_021/SCL_122 -; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE * T1.W, T4.X, literal.x, -; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; R600-NEXT: CNDE_INT T4.X, T6.W, T4.X, PV.W, -; R600-NEXT: CNDE_INT * T2.Y, T4.W, T0.W, T1.X, BS:VEC_120/SCL_212 -; R600-NEXT: ALU clause starting at 204: +; R600-NEXT: ADD_INT T7.X, PS, literal.x, +; R600-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, PV.W, +; R600-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X, +; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y, +; R600-NEXT: CNDE_INT * T1.W, T5.X, T2.Y, T1.W, +; R600-NEXT: 1065353216(1.000000e+00), 2130706432(1.701412e+38) +; R600-NEXT: CNDE_INT T5.X, T3.W, PS, T1.Y, +; R600-NEXT: CNDE_INT * T1.Y, T2.W, T4.X, PV.W, BS:VEC_120/SCL_212 +; R600-NEXT: ALU clause starting at 201: ; R600-NEXT: LSHL T0.Z, T0.Z, literal.x, -; R600-NEXT: MUL_IEEE T0.W, T1.Y, T8.X, +; R600-NEXT: MUL_IEEE T0.W, T0.Y, T7.X, ; R600-NEXT: SETGT * T1.W, literal.y, KC0[3].W, ; R600-NEXT: 23(3.222986e-44), -1036817932(-4.485347e+01) -; R600-NEXT: CNDE T1.X, PS, PV.W, 0.0, -; R600-NEXT: SETGT T1.Y, KC0[3].W, literal.x, +; R600-NEXT: CNDE T4.X, PS, PV.W, 0.0, +; R600-NEXT: SETGT T0.Y, KC0[3].W, literal.x, ; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y, -; R600-NEXT: CNDE_INT T0.W, T3.Y, T2.Y, T4.X, BS:VEC_120/SCL_212 -; R600-NEXT: CNDE * T1.W, T2.X, T3.X, literal.z, +; R600-NEXT: CNDE_INT T0.W, T5.Y, T5.X, T1.Y, BS:VEC_102/SCL_221 +; 
R600-NEXT: CNDE * T1.W, T0.X, T3.X, literal.z, ; R600-NEXT: 1109008539(3.853184e+01), 1065353216(1.000000e+00) ; R600-NEXT: 2139095040(INF), 0(0.000000e+00) -; R600-NEXT: MUL_IEEE T2.X, PV.W, PV.Z, +; R600-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, ; R600-NEXT: SETGT T2.Y, literal.x, KC0[3].Y, ; R600-NEXT: CNDE T1.Z, PV.Y, PV.X, literal.y, -; R600-NEXT: CNDE T0.W, T0.X, T0.Y, 0.0, +; R600-NEXT: CNDE T0.W, T2.X, T1.X, 0.0, ; R600-NEXT: SETGT * T2.W, KC0[3].Z, literal.z, ; R600-NEXT: -1036817932(-4.485347e+01), 2139095040(INF) ; R600-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00) @@ -2287,8 +2275,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; CM-LABEL: s_exp10_v4f32: ; CM: ; %bb.0: ; CM-NEXT: ALU 97, @6, KC0[CB0:0-32], KC1[] -; CM-NEXT: ALU 100, @104, KC0[CB0:0-32], KC1[] -; CM-NEXT: ALU 36, @205, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 97, @104, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 35, @202, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X ; CM-NEXT: CF_END ; CM-NEXT: PAD @@ -2307,224 +2295,220 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) ; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W, ; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z, -; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x, -; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: MUL_IEEE T0.Z, T2.W, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x, ; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) ; CM-NEXT: TRUNC T1.X, T1.Z, -; CM-NEXT: RNDNE T2.Y, PV.W, -; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z, -; CM-NEXT: ADD * T1.W, PV.Y, PV.X, +; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.W, +; CM-NEXT: RNDNE T1.Z, PV.Z, +; CM-NEXT: ADD * T0.W, PV.Y, PV.X, ; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00) +; CM-NEXT: EXP_IEEE T0.X, T0.W, +; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, +; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, 
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, +; CM-NEXT: TRUNC T2.X, T1.Z, +; CM-NEXT: MULADD_IEEE T0.Y, T2.W, literal.x, T1.Y, +; CM-NEXT: FLT_TO_INT T2.Z, T1.X, +; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y, +; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31) +; CM-NEXT: ADD T1.X, T0.Z, -T1.Z, +; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.x, +; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y, +; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -330(nan) +; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) +; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T0.Z, T2.Z, literal.z, +; CM-NEXT: SETGT_UINT * T1.W, T2.Z, literal.w, +; CM-NEXT: -254(nan), 204(2.858649e-43) +; CM-NEXT: 102(1.429324e-43), -229(nan) +; CM-NEXT: ADD_INT T4.X, T2.Z, literal.x, +; CM-NEXT: SETGT_UINT T3.Y, T2.Z, literal.y, +; CM-NEXT: CNDE_INT T0.Z, PV.W, PV.Y, PV.Z, +; CM-NEXT: SETGT_INT * T2.W, T2.Z, literal.x, +; CM-NEXT: -127(nan), 254(3.559298e-43) +; CM-NEXT: MUL_IEEE T5.X, T0.X, literal.x, +; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T2.Z, +; CM-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T3.X, +; CM-NEXT: SETGT_INT * T3.W, T2.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43) +; CM-NEXT: AND_INT T3.X, KC0[3].Z, literal.x, +; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Y, PV.Z, +; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y, +; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.Y, T0.W, +; CM-NEXT: -4096(nan), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X, +; CM-NEXT: CNDE_INT T1.Y, T3.Y, T5.X, PV.Z, +; CM-NEXT: LSHL T0.Z, PV.Y, literal.x, +; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y, +; CM-NEXT: 23(3.222986e-44), 1079283712(3.321289e+00) +; CM-NEXT: RNDNE T4.X, PV.W, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Z, T3.W, PV.X, PV.Y, +; CM-NEXT: ADD * T1.W, T1.X, T0.Y, +; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) ; CM-NEXT: EXP_IEEE T0.X, T1.W, ; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W, ; 
CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W, ; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W, -; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z, -; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212 -; CM-NEXT: FLT_TO_INT T0.Z, T1.X, -; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y, -; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31) -; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x, +; CM-NEXT: MUL_IEEE T1.X, T0.Z, T2.Y, +; CM-NEXT: TRUNC T0.Y, T4.X, +; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212 +; CM-NEXT: MUL_IEEE * T1.W, PV.X, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x, ; CM-NEXT: MUL_IEEE T1.Y, T0.X, literal.y, ; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.z, -; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.w, +; CM-NEXT: MIN_INT * T2.W, PV.Z, literal.w, ; CM-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38) ; CM-NEXT: -330(nan), 381(5.338947e-43) -; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, -; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T5.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y, ; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z, -; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w, +; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w, ; CM-NEXT: -254(nan), 204(2.858649e-43) ; CM-NEXT: 102(1.429324e-43), -229(nan) -; CM-NEXT: ADD_INT T4.X, T0.Z, literal.x, -; CM-NEXT: SETGT_UINT T4.Y, T0.Z, literal.y, +; CM-NEXT: ADD_INT T6.X, T0.Z, literal.x, +; CM-NEXT: SETGT_UINT T3.Y, T0.Z, literal.y, ; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x, +; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.x, ; CM-NEXT: -127(nan), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z, -; CM-NEXT: CNDE_INT T3.Y, PV.Y, PV.X, T3.X, -; CM-NEXT: SETGT_INT T0.Z, T0.Z, literal.x, -; CM-NEXT: MUL_IEEE * T3.W, T1.Y, literal.y, -; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38) -; CM-NEXT: CNDE_INT T3.X, T4.Y, T1.Y, PV.W, -; CM-NEXT: AND_INT T1.Y, KC0[3].Z, literal.x, -; 
CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y, -; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.X, T0.W, -; CM-NEXT: -4096(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X, -; CM-NEXT: LSHL T3.Y, PV.Z, literal.x, -; CM-NEXT: TRUNC T1.Z, T2.Y, -; CM-NEXT: ADD * T0.W, KC0[3].Z, -PV.Y, -; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x, -; CM-NEXT: FLT_TO_INT T2.Y, PV.Z, -; CM-NEXT: ADD_INT T1.Z, PV.Y, literal.y, -; CM-NEXT: CNDE_INT * T1.W, T0.Z, PV.X, T3.X, -; CM-NEXT: 975668412(6.390323e-04), 1065353216(1.000000e+00) -; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z, -; CM-NEXT: MIN_INT T3.Y, PV.Y, literal.x, -; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.X, -; CM-NEXT: ADD * T0.W, T0.Y, T2.X, -; CM-NEXT: 381(5.338947e-43), 1079283712(3.321289e+00) -; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, -; CM-NEXT: EXP_IEEE T0.Y, T0.W, -; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, -; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W, -; CM-NEXT: MULADD_IEEE T1.X, T1.Y, literal.x, T0.Z, -; CM-NEXT: MUL_IEEE T4.Y, PV.Y, literal.y, -; CM-NEXT: ADD_INT T0.Z, T3.Y, literal.z, BS:VEC_120/SCL_212 -; CM-NEXT: MAX_INT * T0.W, T2.Y, literal.w, BS:VEC_201 -; CM-NEXT: 975668412(6.390323e-04), 2130706432(1.701412e+38) -; CM-NEXT: -254(nan), -330(nan) -; CM-NEXT: ADD_INT T2.X, T2.Y, literal.x, -; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y, -; CM-NEXT: ADD_INT T1.Z, T2.Y, literal.z, -; CM-NEXT: SETGT_UINT * T0.W, T2.Y, literal.w, -; CM-NEXT: -127(nan), 204(2.858649e-43) -; CM-NEXT: 102(1.429324e-43), -229(nan) -; CM-NEXT: SETGT_UINT T3.X, T2.Y, literal.x, -; CM-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT T1.Z, T2.Y, literal.y, -; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.z, BS:VEC_120/SCL_212 -; CM-NEXT: 254(3.559298e-43), -127(nan) -; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T4.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT * T3.Y, PV.Z, PV.Y, T2.Y, -; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) -; CM-NEXT: ALU clause starting at 104: 
-; CM-NEXT: CNDE_INT T0.Z, T3.X, T2.X, T0.Z, -; CM-NEXT: SETGT_INT * T2.W, T2.Y, literal.x, +; CM-NEXT: CNDE_INT T7.X, PV.W, PV.Z, T0.Z, +; CM-NEXT: CNDE_INT T2.Y, PV.Y, PV.X, T5.X, +; CM-NEXT: SETGT_INT * T0.Z, T0.Z, literal.x, ; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T2.X, T1.Y, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.W, T3.Y, PV.Z, -; CM-NEXT: CNDE_INT T0.Z, T0.W, T4.X, T1.W, -; CM-NEXT: MUL_IEEE * T0.W, T4.Y, literal.y, BS:VEC_201 -; CM-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38) -; CM-NEXT: AND_INT T4.X, KC0[4].X, literal.x, -; CM-NEXT: CNDE_INT T2.Y, T3.X, T4.Y, PV.W, -; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.Y, -; CM-NEXT: LSHL * T0.W, PV.Y, literal.y, -; CM-NEXT: -4096(nan), 23(3.222986e-44) -; CM-NEXT: ADD_INT T3.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.Z, PV.Y, -; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y, -; CM-NEXT: RNDNE * T0.W, T2.X, -; CM-NEXT: 1065353216(1.000000e+00), 1079283712(3.321289e+00) -; CM-NEXT: ADD T2.X, T2.X, -PV.W, -; CM-NEXT: RNDNE T1.Y, PV.Z, -; CM-NEXT: MUL_IEEE T1.Z, PV.Y, PV.X, -; CM-NEXT: SETGT * T1.W, literal.x, KC0[3].W, -; CM-NEXT: -1036817932(-4.485347e+01), 0(0.000000e+00) -; CM-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0, -; CM-NEXT: TRUNC T0.Y, T0.W, -; CM-NEXT: TRUNC T1.Z, PV.Y, -; CM-NEXT: ADD * T0.W, PV.X, T1.X, +; CM-NEXT: ALU clause starting at 104: +; CM-NEXT: ADD * T4.W, KC0[3].Z, -T3.X, +; CM-NEXT: MUL_IEEE T5.X, PV.W, literal.x, +; CM-NEXT: CNDE_INT T2.Y, T0.Z, T7.X, T2.Y, +; CM-NEXT: MUL_IEEE T1.Z, T1.Y, literal.y, +; CM-NEXT: CNDE_INT * T1.W, T2.W, T2.X, T1.W, BS:VEC_021/SCL_122 +; CM-NEXT: 975668412(6.390323e-04), 2130706432(1.701412e+38) +; CM-NEXT: CNDE_INT T0.X, T3.W, PV.W, T0.X, +; CM-NEXT: CNDE_INT T1.Y, T3.Y, T1.Y, PV.Z, +; CM-NEXT: LSHL T1.Z, PV.Y, literal.x, +; CM-NEXT: MULADD_IEEE * T1.W, T4.W, literal.y, PV.X, BS:VEC_120/SCL_212 +; CM-NEXT: 23(3.222986e-44), 1079283712(3.321289e+00) +; CM-NEXT: MULADD_IEEE T2.X, T3.X, literal.x, PV.W, +; CM-NEXT: ADD T2.Y, 
T0.W, -T4.X, +; CM-NEXT: ADD_INT T1.Z, PV.Z, literal.y, +; CM-NEXT: CNDE_INT * T0.W, T0.Z, PV.X, PV.Y, +; CM-NEXT: 975668412(6.390323e-04), 1065353216(1.000000e+00) +; CM-NEXT: AND_INT T0.X, KC0[4].X, literal.x, +; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z, +; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].W, +; CM-NEXT: ADD * T0.W, PV.Y, PV.X, +; CM-NEXT: -4096(nan), -1036817932(-4.485347e+01) ; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W, ; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W, ; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W, ; CM-NEXT: EXP_IEEE * T0.W, T0.W, -; CM-NEXT: FLT_TO_INT T1.X, T1.Z, -; CM-NEXT: FLT_TO_INT T0.Y, T0.Y, -; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x, -; CM-NEXT: ADD * T1.W, KC0[4].X, -T4.X, -; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x, -; CM-NEXT: MUL_IEEE T2.Y, T0.W, literal.y, -; CM-NEXT: MUL_IEEE T2.Z, PV.Z, literal.z, -; CM-NEXT: SETGT_UINT * T2.W, PV.Y, literal.w, -; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31) -; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T5.X, PV.W, T1.Z, PV.Z, -; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x, -; CM-NEXT: MULADD_IEEE T1.Z, T1.W, literal.y, PV.X, -; CM-NEXT: MAX_INT * T1.W, T1.X, literal.z, -; CM-NEXT: 209715200(1.972152e-31), 1079283712(3.321289e+00) -; CM-NEXT: -330(nan), 0(0.000000e+00) -; CM-NEXT: ADD_INT T2.X, PV.W, literal.x, -; CM-NEXT: ADD_INT T4.Y, T1.X, literal.y, -; CM-NEXT: MULADD_IEEE T1.Z, T4.X, literal.z, PV.Z, BS:VEC_120/SCL_212 -; CM-NEXT: MAX_INT * T1.W, T0.Y, literal.w, -; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) +; CM-NEXT: CNDE T2.X, T0.Z, T1.Y, 0.0, +; CM-NEXT: ADD T1.Y, KC0[4].X, -T0.X, +; CM-NEXT: FLT_TO_INT T0.Z, T0.Y, +; CM-NEXT: MUL_IEEE * T1.W, PV.W, literal.x, +; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x, +; CM-NEXT: SETGT_UINT T0.Y, PV.Z, literal.y, +; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.z, +; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.w, +; CM-NEXT: 
209715200(1.972152e-31), -229(nan) +; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00) +; CM-NEXT: RNDNE T4.X, PV.W, +; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.Z, +; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.W, +; CM-NEXT: SETGT_INT * T1.W, T0.Z, literal.y, +; CM-NEXT: 1079283712(3.321289e+00), -127(nan) +; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.W, +; CM-NEXT: MULADD_IEEE T1.Y, T0.X, literal.x, PV.Y, +; CM-NEXT: ADD T1.Z, T2.W, -PV.X, +; CM-NEXT: MAX_INT * T2.W, T0.Z, literal.y, ; CM-NEXT: 975668412(6.390323e-04), -330(nan) -; CM-NEXT: ADD T4.X, T0.Z, -T1.Y, -; CM-NEXT: ADD_INT T1.Y, PV.W, literal.x, -; CM-NEXT: ADD_INT T0.Z, T0.Y, literal.y, -; CM-NEXT: SETGT_UINT * T1.W, T0.Y, literal.z, +; CM-NEXT: ADD_INT T0.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T2.Y, T0.Z, literal.y, +; CM-NEXT: TRUNC T2.Z, T4.X, +; CM-NEXT: ADD * T2.W, PV.Z, PV.Y, ; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43) -; CM-NEXT: -229(nan), 0(0.000000e+00) -; CM-NEXT: SETGT_UINT T6.X, T1.X, literal.x, -; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z, -; CM-NEXT: SETGT_INT T0.Z, T0.Y, literal.y, -; CM-NEXT: ADD * T3.W, PV.X, T1.Z, -; CM-NEXT: -229(nan), -127(nan) -; CM-NEXT: EXP_IEEE T1.X (MASKED), T3.W, -; CM-NEXT: EXP_IEEE T1.Y (MASKED), T3.W, -; CM-NEXT: EXP_IEEE T1.Z, T3.W, -; CM-NEXT: EXP_IEEE * T1.W (MASKED), T3.W, -; CM-NEXT: CNDE_INT T4.X, T0.Z, T1.Y, T0.Y, -; CM-NEXT: CNDE_INT T1.Y, T6.X, T2.X, T4.Y, BS:VEC_120/SCL_212 -; CM-NEXT: SETGT_INT T2.Z, T1.X, literal.x, -; CM-NEXT: MUL_IEEE * T3.W, PV.Z, literal.y, -; CM-NEXT: -127(nan), 209715200(1.972152e-31) -; CM-NEXT: MUL_IEEE T2.X, T1.Z, literal.x, -; CM-NEXT: MUL_IEEE T4.Y, PV.W, literal.y, -; CM-NEXT: CNDE_INT T3.Z, PV.Z, PV.Y, T1.X, -; CM-NEXT: MIN_INT * T4.W, T1.X, literal.z, +; CM-NEXT: EXP_IEEE T1.X (MASKED), T2.W, +; CM-NEXT: EXP_IEEE T1.Y, T2.W, +; CM-NEXT: EXP_IEEE T1.Z (MASKED), T2.W, +; CM-NEXT: EXP_IEEE * T1.W (MASKED), T2.W, +; CM-NEXT: MUL_IEEE T4.X, T0.W, literal.x, +; CM-NEXT: FLT_TO_INT T3.Y, T2.Z, +; CM-NEXT: 
MUL_IEEE T1.Z, PV.Y, literal.y, +; CM-NEXT: CNDE_INT * T0.W, T0.Y, T0.X, T2.Y, ; CM-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31) +; CM-NEXT: CNDE_INT T0.X, T1.W, PV.W, T0.Z, +; CM-NEXT: MUL_IEEE T0.Y, PV.Z, literal.x, +; CM-NEXT: MAX_INT T2.Z, PV.Y, literal.y, +; CM-NEXT: MIN_INT * T0.W, PV.Y, literal.z, +; CM-NEXT: 209715200(1.972152e-31), -330(nan) ; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00) -; CM-NEXT: MIN_INT T7.X, T0.Y, literal.x, -; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y, -; CM-NEXT: ADD_INT T4.Z, T1.X, literal.z, -; CM-NEXT: SETGT_UINT * T4.W, T1.X, literal.w, -; CM-NEXT: 381(5.338947e-43), -254(nan) +; CM-NEXT: ADD_INT T5.X, PV.W, literal.x, +; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y, +; CM-NEXT: ADD_INT T2.Z, T3.Y, literal.z, +; CM-NEXT: SETGT_UINT * T0.W, T3.Y, literal.w, +; CM-NEXT: -254(nan), 204(2.858649e-43) +; CM-NEXT: 102(1.429324e-43), -229(nan) +; CM-NEXT: ADD_INT T6.X, T3.Y, literal.x, +; CM-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y, +; CM-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z, +; CM-NEXT: SETGT_INT * T1.W, T3.Y, literal.x, ; CM-NEXT: -127(nan), 254(3.559298e-43) -; CM-NEXT: CNDE_INT T8.X, PV.W, PV.Z, PV.Y, -; CM-NEXT: SETGT_INT T1.Y, T1.X, literal.x, -; CM-NEXT: ADD_INT T4.Z, PV.X, literal.y, -; CM-NEXT: ADD_INT * T5.W, T0.Y, literal.z, +; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x, +; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T3.Y, +; CM-NEXT: CNDE_INT T2.Z, PV.Y, PV.X, T5.X, +; CM-NEXT: MIN_INT * T2.W, T0.Z, literal.y, +; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43) +; CM-NEXT: SETGT_INT T5.X, T3.Y, literal.x, +; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y, +; CM-NEXT: ADD_INT T3.Z, T0.Z, literal.z, +; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w, ; CM-NEXT: 127(1.779649e-43), -254(nan) -; CM-NEXT: -127(nan), 0(0.000000e+00) -; CM-NEXT: CNDE_INT T1.X, T2.W, PV.W, PV.Z, -; CM-NEXT: CNDE_INT T5.Y, PV.Y, T3.Z, PV.X, -; CM-NEXT: CNDE_INT T3.Z, T6.X, T4.Y, T3.W, -; CM-NEXT: MUL_IEEE * T2.W, T2.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: 
-127(nan), 254(3.559298e-43) +; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y, +; CM-NEXT: CNDE_INT T2.Y, PV.X, T2.Y, T2.Z, +; CM-NEXT: MUL_IEEE T2.Z, T7.X, literal.x, +; CM-NEXT: CNDE_INT * T0.W, T0.W, T0.Y, T1.Z, BS:VEC_021/SCL_122 ; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) -; CM-NEXT: SETGT_INT T6.X, T0.Y, literal.x, -; CM-NEXT: CNDE_INT T0.Y, T4.W, T2.X, PV.W, -; CM-NEXT: CNDE_INT * T1.Z, T2.Z, PV.Z, T1.Z, -; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00) -; CM-NEXT: ALU clause starting at 205: -; CM-NEXT: LSHL * T2.W, T5.Y, literal.x, -; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; CM-NEXT: ADD_INT T2.X, PV.W, literal.x, -; CM-NEXT: CNDE_INT T0.Y, T1.Y, T1.Z, T0.Y, -; CM-NEXT: CNDE_INT * T1.Z, T6.X, T4.X, T1.X, +; CM-NEXT: SETGT_INT T8.X, T0.Z, literal.x, +; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.Y, +; CM-NEXT: CNDE_INT T0.Z, T4.Y, T7.X, PV.Z, +; CM-NEXT: LSHL * T0.W, PV.Y, literal.y, +; CM-NEXT: 127(1.779649e-43), 23(3.222986e-44) +; CM-NEXT: ALU clause starting at 202: +; CM-NEXT: ADD_INT T7.X, T0.W, literal.x, +; CM-NEXT: CNDE_INT * T0.Y, T5.X, T0.Y, T0.Z, ; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) -; CM-NEXT: CNDE_INT * T1.W, T1.W, T3.Y, T2.Y, -; CM-NEXT: CNDE_INT T1.X, T0.Z, PV.W, T0.W, -; CM-NEXT: LSHL T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T2.X, +; CM-NEXT: CNDE_INT * T0.Z, T8.X, T0.X, T6.X, +; CM-NEXT: MUL_IEEE * T0.W, T4.X, literal.x, +; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00) +; CM-NEXT: CNDE_INT T0.X, T2.W, T4.X, PV.W, +; CM-NEXT: LSHL T1.Y, T0.Z, literal.x, +; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T7.X, BS:VEC_021/SCL_122 ; CM-NEXT: SETGT * T0.W, literal.y, KC0[4].X, ; CM-NEXT: 23(3.222986e-44), -1036817932(-4.485347e+01) -; CM-NEXT: CNDE T2.X, PV.W, PV.Z, 0.0, +; CM-NEXT: CNDE T4.X, PV.W, PV.Z, 0.0, ; CM-NEXT: SETGT T0.Y, KC0[4].X, literal.x, ; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y, -; CM-NEXT: CNDE_INT * T0.W, T6.X, PV.X, T5.X, +; CM-NEXT: CNDE_INT * T0.W, T8.X, T3.X, PV.X, ; CM-NEXT: 
1109008539(3.853184e+01), 1065353216(1.000000e+00) -; CM-NEXT: SETGT T1.X, KC0[3].W, literal.x, +; CM-NEXT: SETGT T0.X, KC0[3].W, literal.x, ; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z, ; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z, ; CM-NEXT: CNDE * T0.W, PV.Y, PV.X, literal.z, ; CM-NEXT: 1109008539(3.853184e+01), -1036817932(-4.485347e+01) ; CM-NEXT: 2139095040(INF), 0(0.000000e+00) -; CM-NEXT: SETGT T2.X, literal.x, KC0[3].Y, +; CM-NEXT: SETGT T3.X, literal.x, KC0[3].Y, ; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, 0.0, -; CM-NEXT: CNDE T0.Z, PV.X, T3.X, literal.y, +; CM-NEXT: CNDE T0.Z, PV.X, T2.X, literal.y, ; CM-NEXT: SETGT * T1.W, KC0[3].Z, literal.z, ; CM-NEXT: -1036817932(-4.485347e+01), 2139095040(INF) ; CM-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00) ; CM-NEXT: CNDE T0.Y, PV.W, PV.Y, literal.x, -; CM-NEXT: CNDE T1.Z, PV.X, T0.X, 0.0, +; CM-NEXT: CNDE T1.Z, PV.X, T1.X, 0.0, ; CM-NEXT: SETGT * T1.W, KC0[3].Y, literal.y, ; CM-NEXT: 2139095040(INF), 1109008539(3.853184e+01) ; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x, diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index b1a82daa8e7d..b3f4790df4d4 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -795,17 +795,17 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: AND_INT T1.Y, T0.Z, literal.x, -; EG-NEXT: LSHR T1.Z, T0.Y, 1, +; EG-NEXT: LSHR T1.Y, T0.Y, 1, +; EG-NEXT: NOT_INT T1.Z, T0.Z, ; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1, -; EG-NEXT: NOT_INT * T1.W, T0.Z, +; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T1.Z, PV.Z, PV.W, PS, -; EG-NEXT: LSHL T0.W, T0.X, PV.Y, +; EG-NEXT: LSHL T2.Z, T0.X, PS, +; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z, ; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; 
EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, -; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, +; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, +; EG-NEXT: CNDE_INT T0.X, T1.W, T2.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1 @@ -858,8 +858,8 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 22, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 +; EG-NEXT: ALU 23, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: @@ -868,27 +868,28 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; EG-NEXT: ALU clause starting at 10: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x, +; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.X, T0.Z, PV.W, +; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: LSHR T2.Z, T0.W, 1, -; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, +; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, BS:VEC_102/SCL_221 ; EG-NEXT: NOT_INT * T1.W, T1.Z, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T3.X, PV.Z, PV.W, PS, +; EG-NEXT: LSHR T2.Y, T0.Y, 1, +; EG-NEXT: NOT_INT T0.Z, T1.X, +; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1, +; EG-NEXT: AND_INT * T1.W, T1.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Z, PV.W, PS, -; EG-NEXT: LSHL * T1.W, T0.Z, PV.Y, -; EG-NEXT: AND_INT T2.X, T1.Z, literal.x, -; EG-NEXT: AND_INT T1.Y, T1.X, literal.y, -; EG-NEXT: LSHR T0.Z, T0.Y, 1, -; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1, -; EG-NEXT: NOT_INT * T3.W, T1.X, -; EG-NEXT: 
32(4.484155e-44), 31(4.344025e-44) -; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS, -; EG-NEXT: LSHL T0.Z, T0.X, PV.Y, -; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: CNDE_INT * T3.W, PV.X, T0.W, T1.W, +; EG-NEXT: LSHL T0.Y, T0.X, PS, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T1.Z, T1.X, literal.x, BS:VEC_201 +; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, T2.X, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z, -; EG-NEXT: CNDE_INT * T3.Z, T2.X, T1.W, 0.0, -; EG-NEXT: CNDE_INT T3.X, T2.W, T0.Z, 0.0, +; EG-NEXT: CNDE_INT T2.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: CNDE_INT * T2.Z, T1.Y, T2.X, 0.0, +; EG-NEXT: CNDE_INT T2.X, T1.Z, T0.Y, 0.0, ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1 @@ -955,65 +956,66 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 3 @6 -; EG-NEXT: ALU 47, @15, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1 +; EG-NEXT: ALU 48, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1 -; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1 -; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 32, #1 -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1 +; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 48, #1 +; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: AND_INT T4.Z, T1.Z, literal.x, -; EG-NEXT: LSHR 
T1.W, T0.W, 1, -; EG-NEXT: NOT_INT * T3.W, T1.Z, +; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T4.X, T0.W, T0.Z, 1, -; EG-NEXT: AND_INT T1.Y, T3.Z, literal.x, BS:VEC_201 -; EG-NEXT: LSHR T5.Z, T2.W, 1, BS:VEC_120/SCL_212 -; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221 -; EG-NEXT: NOT_INT * T2.W, T3.Z, -; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T3.Y, PV.Z, PV.W, PS, -; EG-NEXT: LSHL T2.Z, T2.Z, PV.Y, -; EG-NEXT: BIT_ALIGN_INT T0.W, T1.W, PV.X, T3.W, -; EG-NEXT: LSHL * T1.W, T0.Z, T4.Z, +; EG-NEXT: LSHL * T1.W, T0.Z, PV.W, ; EG-NEXT: AND_INT T4.X, T1.Z, literal.x, -; EG-NEXT: AND_INT T1.Y, T1.X, literal.y, -; EG-NEXT: LSHR T0.Z, T0.Y, 1, -; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1, -; EG-NEXT: NOT_INT * T3.W, T1.X, +; EG-NEXT: LSHR T1.Y, T3.W, 1, +; EG-NEXT: NOT_INT T4.Z, T2.Z, BS:VEC_201 +; EG-NEXT: BIT_ALIGN_INT T2.W, T3.W, T3.Z, 1, +; EG-NEXT: AND_INT * T3.W, T2.Z, literal.y, ; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) -; EG-NEXT: AND_INT T5.X, T3.Z, literal.x, -; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS, -; EG-NEXT: LSHL T0.Z, T0.X, PV.Y, -; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: CNDE_INT * T4.W, PV.X, T0.W, T1.W, +; EG-NEXT: LSHL T5.X, T3.Z, PS, +; EG-NEXT: AND_INT T2.Y, T2.Z, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T2.Z, PV.Y, PV.W, PV.Z, +; EG-NEXT: LSHR T2.W, T3.Y, 1, +; EG-NEXT: NOT_INT * T3.W, T2.X, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T6.X, T3.Y, T3.X, 1, +; EG-NEXT: AND_INT T1.Y, T2.X, literal.x, +; EG-NEXT: LSHR T3.Z, T0.W, 1, +; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, +; EG-NEXT: NOT_INT * T4.W, T1.Z, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T7.X, PV.Z, PV.W, PS, +; EG-NEXT: LSHL T1.Y, T3.X, PV.Y, BS:VEC_120/SCL_212 +; EG-NEXT: AND_INT T0.Z, T2.X, literal.x, BS:VEC_201 +; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, PV.X, 
T3.W, +; EG-NEXT: CNDE_INT * T3.W, T2.Y, T2.Z, T5.X, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.X, T3.X, literal.x, -; EG-NEXT: CNDE_INT T4.Y, PV.W, PV.Y, PV.Z, -; EG-NEXT: LSHR T1.Z, T2.Y, 1, -; EG-NEXT: BIT_ALIGN_INT T0.W, T2.Y, T2.X, 1, -; EG-NEXT: NOT_INT * T3.W, T3.X, +; EG-NEXT: LSHR T2.X, T0.Y, 1, +; EG-NEXT: CNDE_INT T3.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: NOT_INT T1.Z, T1.X, +; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1, +; EG-NEXT: AND_INT * T2.W, T1.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BIT_ALIGN_INT T1.X, PV.Z, PV.W, PS, -; EG-NEXT: LSHL T0.Y, T2.X, PV.X, -; EG-NEXT: CNDE_INT T4.Z, T4.X, T1.W, 0.0, BS:VEC_120/SCL_212 -; EG-NEXT: AND_INT * T0.W, T3.X, literal.x, BS:VEC_201 +; EG-NEXT: LSHL T0.X, T0.X, PS, +; EG-NEXT: AND_INT T0.Y, T1.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT T3.Z, T2.Y, T5.X, 0.0, BS:VEC_021/SCL_122 +; EG-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.W, PV.Z, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT * T1.W, T5.X, T3.Y, T2.Z, -; EG-NEXT: CNDE_INT T4.X, T2.W, T0.Z, 0.0, -; EG-NEXT: CNDE_INT T1.Y, T0.W, T1.X, T0.Y, BS:VEC_120/SCL_212 -; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, +; EG-NEXT: CNDE_INT * T2.W, T4.X, T7.X, T1.W, +; EG-NEXT: CNDE_INT T3.X, T0.Z, T1.Y, 0.0, +; EG-NEXT: CNDE_INT T2.Y, T0.Y, T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T0.X, PV.W, literal.x, -; EG-NEXT: CNDE_INT T1.Z, T5.X, T2.Z, 0.0, -; EG-NEXT: CNDE_INT * T1.X, T0.W, T0.Y, 0.0, +; EG-NEXT: LSHR T1.X, PV.W, literal.x, +; EG-NEXT: CNDE_INT T2.Z, T4.X, T1.W, 0.0, +; EG-NEXT: CNDE_INT * T2.X, T0.Y, T0.X, 0.0, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1 %a = load <4 x i64>, ptr addrspace(1) %in @@ -1172,17 
+1174,17 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.W, literal.y, -; EG-NEXT: NOT_INT * T1.W, KC0[2].W, -; EG-NEXT: 31(4.344025e-44), -1(nan) -; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS, -; EG-NEXT: LSHL T0.W, literal.y, PV.Z, +; EG-NEXT: MOV T0.Z, literal.x, +; EG-NEXT: NOT_INT T0.W, KC0[2].W, +; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, +; EG-NEXT: -1(nan), 31(4.344025e-44) +; EG-NEXT: LSHL T1.Z, literal.x, PS, +; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.Z, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z, -; EG-NEXT: 32767(4.591635e-41), -1(nan) +; EG-NEXT: -1(nan), 32767(4.591635e-41) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, -; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, +; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, +; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %shl = shl i64 281474976710655, %a @@ -1423,15 +1425,15 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT T0.W, KC0[2].W, -; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, +; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x, +; EG-NEXT: NOT_INT * T1.W, KC0[2].W, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: LSHL T0.Z, literal.x, PS, -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W, -; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, -; EG-NEXT: 64(8.968310e-44), 32(4.484155e-44) -; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, -; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0, +; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, +; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.x, +; EG-NEXT: LSHL * T0.W, literal.y, PV.W, +; EG-NEXT: 32(4.484155e-44), 64(8.968310e-44) +; 
EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS, +; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %shl = shl i64 64, %a @@ -1903,16 +1905,16 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: NOT_INT T0.W, KC0[2].W, -; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, +; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x, +; EG-NEXT: NOT_INT * T1.W, KC0[2].W, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: LSHL T0.Z, literal.x, PS, -; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W, -; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z, -; EG-NEXT: 1082130432(4.000000e+00), 541065216(1.626303e-19) -; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, -; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0, +; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, +; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.y, +; EG-NEXT: LSHL * T0.W, literal.z, PV.W, +; EG-NEXT: 541065216(1.626303e-19), 32(4.484155e-44) +; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00) +; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS, +; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %shl = shl i64 1082130432, %a @@ -1959,17 +1961,17 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.W, literal.y, -; EG-NEXT: NOT_INT * T1.W, KC0[2].W, -; EG-NEXT: 31(4.344025e-44), -532676608(-5.534023e+19) -; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS, -; EG-NEXT: LSHL T0.W, literal.y, PV.Z, +; EG-NEXT: MOV T0.Z, literal.x, +; EG-NEXT: NOT_INT T0.W, KC0[2].W, +; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, +; EG-NEXT: -532676608(-5.534023e+19), 31(4.344025e-44) 
+; EG-NEXT: LSHL T1.Z, literal.x, PS, +; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.Z, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z, -; EG-NEXT: 2147483647(nan), -1065353216(-4.000000e+00) +; EG-NEXT: -1065353216(-4.000000e+00), 2147483647(nan) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, -; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, +; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, +; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %shl = shl i64 -1065353216, %a diff --git a/llvm/test/CodeGen/X86/misched-critical-path.ll b/llvm/test/CodeGen/X86/misched-critical-path.ll new file mode 100644 index 000000000000..2a95aaa46d4a --- /dev/null +++ b/llvm/test/CodeGen/X86/misched-critical-path.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin8 -misched-print-dags -o - 2>&1 > /dev/null | FileCheck %s +; REQUIRES: asserts + +@sc = common global i8 0 +@uc = common global i8 0 +@ui = common global i32 0 + +; Regression Test for PR92368. 
+; +; CHECK: SU(8): CMP8rr %4:gr8, %3:gr8, implicit-def $eflags +; CHECK: Predecessors: +; CHECK-NEXT: SU(6): Data Latency=0 Reg=%4 +; CHECK-NEXT: SU(7): Out Latency=0 +; CHECK-NEXT: SU(5): Out Latency=0 +; CHECK-NEXT: SU(3): Data Latency=4 Reg=%3 +define void @misched_bug() nounwind { +entry: + %v0 = load i8, ptr @sc, align 1 + %v1 = zext i8 %v0 to i32 + %v2 = load i8, ptr @uc, align 1 + %v3 = zext i8 %v2 to i32 + %v4 = trunc i32 %v3 to i8 + %v5 = trunc i32 %v1 to i8 + %pair74 = cmpxchg ptr @sc, i8 %v4, i8 %v5 monotonic monotonic + %v6 = extractvalue { i8, i1 } %pair74, 0 + %v7 = icmp eq i8 %v6, %v4 + %v8 = zext i1 %v7 to i8 + %v9 = zext i8 %v8 to i32 + store i32 %v9, ptr @ui, align 4 + br label %return + +return: ; preds = %ventry + ret void +} + From 44086133c53599c3aaa7f93426bffdc2ef6cb42a Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 23 May 2024 10:28:06 +0100 Subject: [PATCH 012/433] [flang][HLFIR][NFC] Reduce intrinsic lowering pass boilerplate (#93062) The pass constructor can be generated automatically. This pass is module-level and then runs on all relevant intrinsic operations inside of the module, no matter what top level operation they are inside of. 
--- flang/include/flang/Optimizer/HLFIR/Passes.h | 1 - flang/include/flang/Optimizer/HLFIR/Passes.td | 1 - flang/include/flang/Tools/CLOptions.inc | 2 +- .../HLFIR/Transforms/LowerHLFIRIntrinsics.cpp | 11 ----------- 4 files changed, 1 insertion(+), 14 deletions(-) diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.h b/flang/include/flang/Optimizer/HLFIR/Passes.h index e0b4111eed6b..4fa619cd53ca 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.h +++ b/flang/include/flang/Optimizer/HLFIR/Passes.h @@ -24,7 +24,6 @@ namespace hlfir { std::unique_ptr createConvertHLFIRtoFIRPass(); std::unique_ptr createBufferizeHLFIRPass(); -std::unique_ptr createLowerHLFIRIntrinsicsPass(); #define GEN_PASS_REGISTRATION #include "flang/Optimizer/HLFIR/Passes.h.inc" diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td index 9555467a57ad..fc3d2a0d4681 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.td +++ b/flang/include/flang/Optimizer/HLFIR/Passes.td @@ -29,7 +29,6 @@ def OptimizedBufferization : Pass<"opt-bufferization"> { def LowerHLFIRIntrinsics : Pass<"lower-hlfir-intrinsics", "::mlir::ModuleOp"> { let summary = "Lower HLFIR transformational intrinsic operations"; - let constructor = "hlfir::createLowerHLFIRIntrinsicsPass()"; } def LowerHLFIROrderedAssignments : Pass<"lower-hlfir-ordered-assignments", "::mlir::ModuleOp"> { diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 69934281fa99..56cc9da7de0d 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -328,7 +328,7 @@ inline void createHLFIRToFIRPassPipeline( pm, hlfir::createOptimizedBufferization); } pm.addPass(hlfir::createLowerHLFIROrderedAssignments()); - pm.addPass(hlfir::createLowerHLFIRIntrinsicsPass()); + pm.addPass(hlfir::createLowerHLFIRIntrinsics()); pm.addPass(hlfir::createBufferizeHLFIRPass()); pm.addPass(hlfir::createConvertHLFIRtoFIRPass()); } 
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp index e9dbb7095d0e..707c0feffbb3 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp @@ -468,13 +468,6 @@ class LowerHLFIRIntrinsics : public hlfir::impl::LowerHLFIRIntrinsicsBase { public: void runOnOperation() override { - // TODO: make this a pass operating on FuncOp. The issue is that - // FirOpBuilder helpers may generate new FuncOp because of runtime/llvm - // intrinsics calls creation. This may create race conflict if the pass is - // scheduled on FuncOp. A solution could be to provide an optional mutex - // when building a FirOpBuilder and locking around FuncOp and GlobalOp - // creation, but this needs a bit more thinking, so at this point the pass - // is scheduled on the moduleOp. mlir::ModuleOp module = this->getOperation(); mlir::MLIRContext *context = &getContext(); mlir::RewritePatternSet patterns(context); @@ -504,7 +497,3 @@ class LowerHLFIRIntrinsics } }; } // namespace - -std::unique_ptr hlfir::createLowerHLFIRIntrinsicsPass() { - return std::make_unique(); -} From f81da75693fff6c2ffefbb3883e08f11b21ee643 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 23 May 2024 10:42:11 +0100 Subject: [PATCH 013/433] [Clang][AArch64] Use __clang_arm_builtin_alias for overloaded svreinterpret's (#92427) The intrinsics are currently defined as: ``` __aio __attribute__((target("sve"))) svint8_t svreinterpret_s8(svuint8_t op) __arm_streaming_compatible { return __builtin_sve_reinterpret_s8_u8(op); } ``` which doesn't work when calling it from an __arm_streaming function when only +sme is available. By defining it in the same way as we've defined all the other intrinsics, we can leave it to the code in SemaChecking to verify that either +sve or +sme is available. 
This PR also fixes the target guards for the svreinterpret_c and svreinterpret_b intrinsics, that convert between svcount_t and svbool_t, as these are available both in SME2 and SVE2p1. --- clang/include/clang/Basic/arm_sve.td | 6 +- .../acle_sme2_reinterpret_svcount_svbool.c | 6 +- .../acle_sve_reinterpret-bfloat.c | 57 ++-- .../acle_sve_reinterpret.c | 253 +++++++++--------- ...acle_sve_reinterpret_from_streaming_mode.c | 35 --- clang/utils/TableGen/SveEmitter.cpp | 21 +- 6 files changed, 182 insertions(+), 196 deletions(-) delete mode 100644 clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index a9ea71cd0777..03570f94de66 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2186,9 +2186,6 @@ let TargetGuard = "sme2" in { def SVSQRSHRUN_X4 : SInst<"svqrshrun[_n]_{0}[_{d}_x4]", "b4i", "il", MergeNone, "aarch64_sve_sqrshrun_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; - def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [IsStreamingCompatible], []>; - def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [IsStreamingCompatible], []>; - // SQDMULH def SVSQDMULH_SINGLE_X2 : SInst<"svqdmulh[_single_{d}_x2]", "22d", "csil", MergeNone, "aarch64_sve_sqdmulh_single_vgx2", [IsStreaming], []>; def SVSQDMULH_SINGLE_X4 : SInst<"svqdmulh[_single_{d}_x4]", "44d", "csil", MergeNone, "aarch64_sve_sqdmulh_single_vgx4", [IsStreaming], []>; @@ -2197,6 +2194,9 @@ let TargetGuard = "sme2" in { } let TargetGuard = "sve2p1|sme2" in { + def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [IsStreamingCompatible], []>; + def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [IsStreamingCompatible], []>; + // SQRSHRN / UQRSHRN def SVQRSHRN_X2 : 
SInst<"svqrshrn[_n]_{0}[_{d}_x2]", "h2i", "i", MergeNone, "aarch64_sve_sqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>; def SVUQRSHRN_X2 : SInst<"svqrshrn[_n]_{0}[_{d}_x2]", "e2i", "Ui", MergeNone, "aarch64_sve_uqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>; diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c index c442d2c0c475..d894e98451b4 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c @@ -2,12 +2,14 @@ // REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -#include +#include #if defined __ARM_FEATURE_SME #define MODE_ATTR __arm_streaming 
@@ -16,7 +18,7 @@ #endif #ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin.§ +// A simple used,unused... macro, long enough to represent any SVE builtin. #define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 #else #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c index bf2cd23e4080..41208bfb1f43 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c @@ -4,6 +4,10 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2 // RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3 // RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4 +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2 +// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S 
-passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3 +// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2 // RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3 @@ -18,9 +22,16 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include +#if defined __ARM_FEATURE_SME +#define MODE_ATTR __arm_streaming +#else +#define MODE_ATTR +#endif + #ifdef TUPLE #define TYPE_1(base,tuple) base ## tuple ## _t #define TYPE_0(base,tuple) TYPE_1(base,tuple) @@ -81,7 +92,7 @@ // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) 
test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) { +TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8, _bf16)(op); } @@ -125,7 +136,7 @@ TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) { +TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16, _bf16)(op); } @@ -169,7 +180,7 @@ TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) { +TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32, _bf16)(op); } // CHECK-LABEL: @test_svreinterpret_s64_bf16( @@ -212,7 +223,7 @@ TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) { +TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64, _bf16)(op); } @@ -256,7 +267,7 @@ TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) { +TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8, _bf16)(op); } @@ -300,7 +311,7 @@ TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) { +TYPE(svuint16) 
test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16, _bf16)(op); } @@ -344,7 +355,7 @@ TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) { +TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32, _bf16)(op); } @@ -388,7 +399,7 @@ TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) { +TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64, _bf16)(op); } @@ -432,7 +443,7 @@ TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _s8)(op); } @@ -476,7 +487,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _s16)(op); } @@ -520,7 +531,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _s32)(op); } @@ -564,7 
+575,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _s64)(op); } @@ -608,7 +619,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _u8)(op); } @@ -652,7 +663,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _u16)(op); } @@ -696,7 +707,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _u32)(op); } @@ -740,7 +751,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _u64)(op); } @@ -776,7 +787,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) { // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: 
ret [[OP:%.*]] // -TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _bf16)(op); } @@ -820,7 +831,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _f16)(op); } @@ -864,7 +875,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _f32)(op); } @@ -908,7 +919,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) { +TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_bf16, _f64)(op); } @@ -952,7 +963,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) { +TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32, _bf16)(op); } @@ -996,7 +1007,7 @@ TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) { 
+TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16, _bf16)(op); } @@ -1040,6 +1051,6 @@ TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) { // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_bf16(TYPE(svbfloat16) op) { +TYPE(svfloat64) test_svreinterpret_f64_bf16(TYPE(svbfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64, _bf16)(op); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c index 3d9d5c3ce45a..e61bbf3e03d7 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c @@ -4,6 +4,10 @@ // RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2 // RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3 // RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4 +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2 +// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3 +// 
RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2 // RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3 @@ -17,9 +21,16 @@ // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include +#if defined __ARM_FEATURE_SME +#define MODE_ATTR __arm_streaming +#else +#define MODE_ATTR +#endif + #ifdef TUPLE #define TYPE_1(base,tuple) base ## tuple ## _t #define TYPE_0(base,tuple) TYPE_1(base,tuple) @@ -72,7 +83,7 @@ // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svint8) test_svreinterpret_s8_s8(TYPE(svint8) op) +TYPE(svint8) test_svreinterpret_s8_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_s8)(op); } @@ -117,7 +128,7 @@ TYPE(svint8) 
test_svreinterpret_s8_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op) +TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_s16)(op); } @@ -162,7 +173,7 @@ TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op) +TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_s32)(op); } @@ -207,7 +218,7 @@ TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op) +TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_s64)(op); } @@ -244,7 +255,7 @@ TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op) +TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_u8)(op); } @@ -289,7 +300,7 @@ TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op) +TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_u16)(op); } @@ -335,7 +346,7 @@ TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op) +TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op) MODE_ATTR { return 
SVE_ACLE_FUNC(svreinterpret_s8,_u32)(op); } @@ -381,7 +392,7 @@ TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op) +TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_u64)(op); } @@ -426,7 +437,7 @@ TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op) +TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_f16)(op); } @@ -471,7 +482,7 @@ TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op) +TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_f32)(op); } @@ -516,7 +527,7 @@ TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op) +TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s8,_f64)(op); } @@ -561,7 +572,7 @@ TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op) +TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_s8)(op); } @@ -598,7 +609,7 @@ TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svint16) 
test_svreinterpret_s16_s16(TYPE(svint16) op) +TYPE(svint16) test_svreinterpret_s16_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_s16)(op); } @@ -643,7 +654,7 @@ TYPE(svint16) test_svreinterpret_s16_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op) +TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_s32)(op); } @@ -688,7 +699,7 @@ TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op) +TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_s64)(op); } @@ -733,7 +744,7 @@ TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op) +TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_u8)(op); } @@ -770,7 +781,7 @@ TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op) +TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_u16)(op); } @@ -815,7 +826,7 @@ TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) op) +TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_u32)(op); } @@ -860,7 +871,7 @@ TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) 
op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op) +TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_u64)(op); } @@ -905,7 +916,7 @@ TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op) +TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_f16)(op); } @@ -950,7 +961,7 @@ TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op) +TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_f32)(op); } @@ -995,7 +1006,7 @@ TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op) +TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s16,_f64)(op); } @@ -1040,7 +1051,7 @@ TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op) +TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_s8)(op); } @@ -1085,7 +1096,7 @@ TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_s16(TYPE(svint16) op) +TYPE(svint32) 
test_svreinterpret_s32_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_s16)(op); } @@ -1122,7 +1133,7 @@ TYPE(svint32) test_svreinterpret_s32_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op) +TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_s32)(op); } @@ -1167,7 +1178,7 @@ TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op) +TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_s64)(op); } @@ -1212,7 +1223,7 @@ TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op) +TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_u8)(op); } @@ -1257,7 +1268,7 @@ TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op) +TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_u16)(op); } @@ -1294,7 +1305,7 @@ TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op) +TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_u32)(op); } @@ -1339,7 +1350,7 @@ TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // 
CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op) +TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_u64)(op); } @@ -1384,7 +1395,7 @@ TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op) +TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_f16)(op); } @@ -1429,7 +1440,7 @@ TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op) +TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_f32)(op); } @@ -1475,7 +1486,7 @@ TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op) +TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s32,_f64)(op); } @@ -1520,7 +1531,7 @@ TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op) +TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_s8)(op); } @@ -1565,7 +1576,7 @@ TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op) +TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op) MODE_ATTR { return 
SVE_ACLE_FUNC(svreinterpret_s64,_s16)(op); } @@ -1610,7 +1621,7 @@ TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op) +TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_s32)(op); } @@ -1647,7 +1658,7 @@ TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op) +TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_s64)(op); } @@ -1692,7 +1703,7 @@ TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op) +TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_u8)(op); } @@ -1737,7 +1748,7 @@ TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op) +TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_u16)(op); } @@ -1782,7 +1793,7 @@ TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op) +TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_u32)(op); } @@ -1819,7 +1830,7 @@ TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svint64) 
test_svreinterpret_s64_u64(TYPE(svuint64) op) +TYPE(svint64) test_svreinterpret_s64_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_u64)(op); } @@ -1864,7 +1875,7 @@ TYPE(svint64) test_svreinterpret_s64_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op) +TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_f16)(op); } @@ -1909,7 +1920,7 @@ TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op) +TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_f32)(op); } @@ -1954,7 +1965,7 @@ TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op) +TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_s64,_f64)(op); } @@ -1991,7 +2002,7 @@ TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op) +TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_s8)(op); } @@ -2036,7 +2047,7 @@ TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_s16(TYPE(svint16) op) +TYPE(svuint8) test_svreinterpret_u8_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_s16)(op); } @@ -2081,7 +2092,7 @@ TYPE(svuint8) 
test_svreinterpret_u8_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op) +TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_s32)(op); } @@ -2126,7 +2137,7 @@ TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op) +TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_s64)(op); } @@ -2163,7 +2174,7 @@ TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op) +TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_u8)(op); } @@ -2208,7 +2219,7 @@ TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op) +TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_u16)(op); } @@ -2253,7 +2264,7 @@ TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op) +TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_u32)(op); } @@ -2298,7 +2309,7 @@ TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) op) +TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) 
op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_u64)(op); } @@ -2343,7 +2354,7 @@ TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op) +TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_f16)(op); } @@ -2388,7 +2399,7 @@ TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op) +TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_f32)(op); } @@ -2433,7 +2444,7 @@ TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op) +TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u8,_f64)(op); } @@ -2478,7 +2489,7 @@ TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op) +TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_s8)(op); } @@ -2515,7 +2526,7 @@ TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op) +TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_s16)(op); } @@ -2560,7 +2571,7 @@ TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret 
[[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op) +TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_s32)(op); } @@ -2605,7 +2616,7 @@ TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op) +TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_s64)(op); } @@ -2650,7 +2661,7 @@ TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op) +TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_u8)(op); } @@ -2687,7 +2698,7 @@ TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op) +TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_u16)(op); } @@ -2732,7 +2743,7 @@ TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op) +TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_u32)(op); } @@ -2777,7 +2788,7 @@ TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op) +TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_u64)(op); } @@ -2822,7 +2833,7 
@@ TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op) +TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_f16)(op); } @@ -2867,7 +2878,7 @@ TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op) +TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_f32)(op); } @@ -2912,7 +2923,7 @@ TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op) +TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u16,_f64)(op); } @@ -2957,7 +2968,7 @@ TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op) +TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_s8)(op); } @@ -3002,7 +3013,7 @@ TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op) +TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_s16)(op); } @@ -3039,7 +3050,7 @@ TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svuint32) 
test_svreinterpret_u32_s32(TYPE(svint32) op) +TYPE(svuint32) test_svreinterpret_u32_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_s32)(op); } @@ -3084,7 +3095,7 @@ TYPE(svuint32) test_svreinterpret_u32_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op) +TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_s64)(op); } @@ -3129,7 +3140,7 @@ TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op) +TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_u8)(op); } @@ -3174,7 +3185,7 @@ TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op) +TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_u16)(op); } @@ -3211,7 +3222,7 @@ TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op) +TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_u32)(op); } @@ -3256,7 +3267,7 @@ TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_u64(TYPE(svuint64) op) +TYPE(svuint32) test_svreinterpret_u32_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_u64)(op); } @@ -3301,7 +3312,7 @@ TYPE(svuint32) 
test_svreinterpret_u32_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op) +TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_f16)(op); } @@ -3346,7 +3357,7 @@ TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op) +TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_f32)(op); } @@ -3391,7 +3402,7 @@ TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op) +TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u32,_f64)(op); } @@ -3436,7 +3447,7 @@ TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op) +TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_s8)(op); } @@ -3481,7 +3492,7 @@ TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op) +TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_s16)(op); } @@ -3526,7 +3537,7 @@ TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) 
test_svreinterpret_u64_s32(TYPE(svint32) op) +TYPE(svuint64) test_svreinterpret_u64_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_s32)(op); } @@ -3563,7 +3574,7 @@ TYPE(svuint64) test_svreinterpret_u64_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op) +TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_s64)(op); } @@ -3608,7 +3619,7 @@ TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op) +TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_u8)(op); } @@ -3653,7 +3664,7 @@ TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op) +TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_u16)(op); } @@ -3698,7 +3709,7 @@ TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op) +TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_u32)(op); } @@ -3735,7 +3746,7 @@ TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svuint64) test_svreinterpret_u64_u64(TYPE(svuint64) op) +TYPE(svuint64) test_svreinterpret_u64_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_u64)(op); } @@ -3780,7 +3791,7 @@ TYPE(svuint64) 
test_svreinterpret_u64_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op) +TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_f16)(op); } @@ -3825,7 +3836,7 @@ TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op) +TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_f32)(op); } @@ -3870,7 +3881,7 @@ TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op) +TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_u64,_f64)(op); } @@ -3915,7 +3926,7 @@ TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op) +TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_s8)(op); } @@ -3960,7 +3971,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op) +TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_s16)(op); } @@ -4005,7 +4016,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) 
test_svreinterpret_f16_s32(TYPE(svint32) op) +TYPE(svfloat16) test_svreinterpret_f16_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_s32)(op); } @@ -4050,7 +4061,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op) +TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_s64)(op); } @@ -4095,7 +4106,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op) +TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_u8)(op); } @@ -4140,7 +4151,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op) +TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_u16)(op); } @@ -4185,7 +4196,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op) +TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_u32)(op); } @@ -4230,7 +4241,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op) +TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_u64)(op); } @@ 
-4267,7 +4278,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op) +TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_f16)(op); } @@ -4312,7 +4323,7 @@ TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op) +TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_f32)(op); } @@ -4357,7 +4368,7 @@ TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op) +TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f16,_f64)(op); } @@ -4402,7 +4413,7 @@ TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op) +TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_s8)(op); } @@ -4447,7 +4458,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op) +TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_s16)(op); } @@ -4492,7 +4503,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) 
test_svreinterpret_f32_s32(TYPE(svint32) op) +TYPE(svfloat32) test_svreinterpret_f32_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_s32)(op); } @@ -4537,7 +4548,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op) +TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_s64)(op); } @@ -4582,7 +4593,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op) +TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_u8)(op); } @@ -4627,7 +4638,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op) +TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_u16)(op); } @@ -4672,7 +4683,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op) +TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_u32)(op); } @@ -4717,7 +4728,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op) +TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_u64)(op); } @@ 
-4762,7 +4773,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op) +TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_f16)(op); } @@ -4799,7 +4810,7 @@ TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op) +TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_f32)(op); } @@ -4844,7 +4855,7 @@ TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op) +TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f32,_f64)(op); } @@ -4889,7 +4900,7 @@ TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op) +TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_s8)(op); } @@ -4934,7 +4945,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op) +TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_s16)(op); } @@ -4979,7 +4990,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) 
test_svreinterpret_f64_s32(TYPE(svint32) op) +TYPE(svfloat64) test_svreinterpret_f64_s32(TYPE(svint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_s32)(op); } @@ -5024,7 +5035,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s32(TYPE(svint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op) +TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_s64)(op); } @@ -5069,7 +5080,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op) +TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_u8)(op); } @@ -5114,7 +5125,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op) +TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_u16)(op); } @@ -5159,7 +5170,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op) +TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_u32)(op); } @@ -5204,7 +5215,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op) +TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_u64)(op); } @@ 
-5249,7 +5260,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op) +TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_f16)(op); } @@ -5294,7 +5305,7 @@ TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op) // CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to // CPP-TUPLE4-NEXT: ret [[TMP0]] // -TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op) +TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_f32)(op); } @@ -5331,7 +5342,7 @@ TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op) // CPP-TUPLE4-NEXT: entry: // CPP-TUPLE4-NEXT: ret [[OP:%.*]] // -TYPE(svfloat64) test_svreinterpret_f64_f64(TYPE(svfloat64) op) +TYPE(svfloat64) test_svreinterpret_f64_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_f64)(op); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c deleted file mode 100644 index f27875836193..000000000000 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c +++ /dev/null @@ -1,35 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple 
aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -O1 -Werror -Wall -o /dev/null %s - -// Note: We need to run this test with '-O1' because oddly enough the svreinterpret is always inlined at -O0. - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// Test that svreinterpret is inlined (because it should be streaming-compatible) -__attribute__((target("sme"))) -// CHECK-LABEL: @test_svreinterpret_s16_s8_from_streaming_mode( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z45test_svreinterpret_s16_s8_from_streaming_modeu10__SVInt8_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svint16_t test_svreinterpret_s16_s8_from_streaming_mode(svint8_t op) __arm_streaming { - return SVE_ACLE_FUNC(svreinterpret_s16,_s8,,)(op); -} - diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 3ddfd3277b68..e77d80623e84 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1385,17 +1385,14 @@ void SVEEmitter::createHeader(raw_ostream &OS) { SVEType ToV(To.BaseType, N); for (const ReinterpretTypeInfo &From : Reinterprets) { SVEType FromV(From.BaseType, N); - if (ShortForm) { - OS << "__aio __attribute__((target(\"sve\"))) " << ToV.str() - << " svreinterpret_" << To.Suffix; - OS << "(" << FromV.str() << " op) __arm_streaming_compatible {\n"; - OS << " return __builtin_sve_reinterpret_" << To.Suffix << "_" - << From.Suffix << Suffix << "(op);\n"; - OS << "}\n\n"; - } else - OS << "#define svreinterpret_" << To.Suffix << "_" << From.Suffix - << 
Suffix << "(...) __builtin_sve_reinterpret_" << To.Suffix - << "_" << From.Suffix << Suffix << "(__VA_ARGS__)\n"; + OS << "__aio " + "__attribute__((__clang_arm_builtin_alias(__builtin_sve_" + "reinterpret_" + << To.Suffix << "_" << From.Suffix << Suffix << ")))\n" + << ToV.str() << " svreinterpret_" << To.Suffix; + if (!ShortForm) + OS << "_" << From.Suffix << Suffix; + OS << "(" << FromV.str() << " op);\n"; } } } @@ -1453,7 +1450,7 @@ void SVEEmitter::createBuiltins(raw_ostream &OS) { SVEType FromV(From.BaseType, N); OS << "TARGET_BUILTIN(__builtin_sve_reinterpret_" << To.Suffix << "_" << From.Suffix << Suffix << +", \"" << ToV.builtin_str() - << FromV.builtin_str() << "\", \"n\", \"sve\")\n"; + << FromV.builtin_str() << "\", \"n\", \"sme|sve\")\n"; } } } From 10dc3a8e916d73291269e5e2b82dd22681489aa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 23 May 2024 11:44:27 +0200 Subject: [PATCH 014/433] [clang][Interp] Fix empty InitListExprs for unions We still need to handle Inits.size() == 0, but we can do that earlier. 
--- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 3 +++ clang/test/AST/Interp/unions.cpp | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index e64d3a94b509..0c514236d4ca 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1053,6 +1053,9 @@ bool ByteCodeExprGen::visitInitList(ArrayRef Inits, if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) return this->visitInitializer(Inits[0]); + if (Inits.size() == 0) + return this->emitFinishInit(E); + auto initPrimitiveField = [=](const Record::Field *FieldToInit, const Expr *Init, PrimType T) -> bool { if (!this->visit(Init)) diff --git a/clang/test/AST/Interp/unions.cpp b/clang/test/AST/Interp/unions.cpp index 73e42d57a7b7..b0b1b1961740 100644 --- a/clang/test/AST/Interp/unions.cpp +++ b/clang/test/AST/Interp/unions.cpp @@ -42,4 +42,10 @@ namespace SimpleStore { return a.b; } static_assert(foo() == 10, ""); + + constexpr int empty() { + A a{}; /// Just test that this works. + return 10; + } + static_assert(empty() == 10, ""); } From 5e06050efdaa36bc63987b0e15c30b3cf358e70c Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 23 May 2024 11:53:19 +0200 Subject: [PATCH 015/433] [SCCP] Add tests for #93096 (NFC) --- llvm/test/Transforms/SCCP/range-with-undef.ll | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 llvm/test/Transforms/SCCP/range-with-undef.ll diff --git a/llvm/test/Transforms/SCCP/range-with-undef.ll b/llvm/test/Transforms/SCCP/range-with-undef.ll new file mode 100644 index 000000000000..444b47df5569 --- /dev/null +++ b/llvm/test/Transforms/SCCP/range-with-undef.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=ipsccp < %s | FileCheck %s + +; Make sure that constant ranges including undef are propagated correctly. 
+; FIXME: All of the following are currently miscompiled. + +define i8 @test_binop(i1 %cond, i8 %a) { +; CHECK-LABEL: define i8 @test_binop( +; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16 +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[PHI]], -1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i16 [[AND]] to i8 +; CHECK-NEXT: ret i8 [[TRUNC]] +; +entry: + br i1 %cond, label %if, label %join + +if: + %a.ext = zext i8 %a to i16 + br label %join + +join: + %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ] + %and = and i16 %phi, u0x0000ffff + %trunc = trunc i16 %and to i8 + ret i8 %trunc +} + +define i8 @test_cast(i1 %cond, i8 %a) { +; CHECK-LABEL: define i8 @test_cast( +; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16 +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[PHI]] to i32 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i32 [[ZEXT]] to i8 +; CHECK-NEXT: ret i8 [[TRUNC]] +; +entry: + br i1 %cond, label %if, label %join + +if: + %a.ext = zext i8 %a to i16 + br label %join + +join: + %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ] + %zext = zext i16 %phi to i32 + %trunc = trunc i32 %zext to i8 + ret i8 %trunc +} + +define i8 @test_intrin(i1 %cond, i8 %a) { +; CHECK-LABEL: define range(i8 42, 0) i8 @test_intrin( +; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[A_EXT:%.*]] 
= zext i8 [[A]] to i16 +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ] +; CHECK-NEXT: [[UMAX:%.*]] = call i16 @llvm.umax.i16(i16 [[PHI]], i16 42) +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i16 [[UMAX]] to i8 +; CHECK-NEXT: ret i8 [[TRUNC]] +; +entry: + br i1 %cond, label %if, label %join + +if: + %a.ext = zext i8 %a to i16 + br label %join + +join: + %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ] + %umax = call i16 @llvm.umax(i16 %phi, i16 42) + %trunc = trunc i16 %umax to i8 + ret i8 %trunc +} + +define i9 @test_with_overflow(i1 %cond, i8 %a) { +; CHECK-LABEL: define range(i9 1, -255) i9 @test_with_overflow( +; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16 +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ] +; CHECK-NEXT: [[WO:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[PHI]], i16 1) +; CHECK-NEXT: [[ADD:%.*]] = extractvalue { i16, i1 } [[WO]], 0 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i16 [[ADD]] to i9 +; CHECK-NEXT: ret i9 [[TRUNC]] +; +entry: + br i1 %cond, label %if, label %join + +if: + %a.ext = zext i8 %a to i16 + br label %join + +join: + %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ] + %wo = call {i16, i1} @llvm.uadd.with.overflow(i16 %phi, i16 1) + %add = extractvalue {i16, i1} %wo, 0 + %trunc = trunc i16 %add to i9 + ret i9 %trunc +} From 45a95c3c543c88a669cffd3f7ee2a1b7e02b44e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 23 May 2024 11:47:51 +0200 Subject: [PATCH 016/433] [clang][Interp] Fix DeclRefExprs of void-typed dummy pointers --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 2 ++ clang/test/AST/Interp/c.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git 
a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 0c514236d4ca..b885cbe2c4b0 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -3795,6 +3795,8 @@ bool ByteCodeExprGen::VisitDeclRefExpr(const DeclRefExpr *E) { if (std::optional I = P.getOrCreateDummy(D)) { if (!this->emitGetPtrGlobal(*I, E)) return false; + if (E->getType()->isVoidType()) + return true; // Convert the dummy pointer to another pointer type if we have to. if (PrimType PT = classifyPrim(E); PT != PT_Ptr) { if (!this->emitDecayPtr(PT_Ptr, PT, E)) diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index 2a75457a4693..d680dbc912ab 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -278,3 +278,9 @@ void addrlabelexpr(void) { a0: ; static void *ps[] = { &&a0 }; // pedantic-warning {{use of GNU address-of-label extension}} } + +extern void cv2; +void *foo5 (void) +{ + return &cv2; // pedantic-warning{{address of an expression of type 'void'}} +} From 4d9e7b14e45120557e57da2f00f6d23cf122dd95 Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Thu, 23 May 2024 13:59:02 +0400 Subject: [PATCH 017/433] [lldb][Windows] Fixed the TestBreakpointCommand test (#93122) The TestBreakpointCommand test is incorrectly disabled for Windows target. We can disable it for Windows host instead or just fix the issue. This patch fixes the path separator in BreakpointResolverFileLine::DeduceSourceMapping() and the Windows specific absolute path in the test in case of the Windows host. 
--- .../Breakpoint/BreakpointResolverFileLine.cpp | 10 +++++----- .../TestBreakpointCommand.py | 18 +++++++++++++----- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp b/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp index d7d8c714867e..16c4ee1b88d1 100644 --- a/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp +++ b/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp @@ -198,16 +198,16 @@ void BreakpointResolverFileLine::DeduceSourceMapping( return; Log *log = GetLog(LLDBLog::Breakpoints); - const llvm::StringRef path_separator = llvm::sys::path::get_separator( - m_location_spec.GetFileSpec().GetPathStyle()); // Check if "b" is a suffix of "a". // And return std::nullopt if not or the new path // of "a" after consuming "b" from the back. auto check_suffix = - [path_separator](llvm::StringRef a, llvm::StringRef b, - bool case_sensitive) -> std::optional { + [](llvm::StringRef a, llvm::StringRef b, + bool case_sensitive) -> std::optional { if (case_sensitive ? a.consume_back(b) : a.consume_back_insensitive(b)) { - if (a.empty() || a.ends_with(path_separator)) { + // Note sc_file_dir and request_file_dir below are normalized + // and always contain the path separator '/'. 
+ if (a.empty() || a.ends_with("/")) { return a; } } diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py index c219a4ee5bd9..605561c75737 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py @@ -6,7 +6,7 @@ import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil +from lldbsuite.test import lldbutil, lldbplatformutil import json import os import side_effect @@ -581,7 +581,6 @@ def verify_source_map_deduce_statistics(self, target, expected_count): self.assertNotEqual(target_stats, None) self.assertEqual(target_stats["sourceMapDeduceCount"], expected_count) - @skipIf(oslist=["windows"]) @no_debug_info_test def test_breakpoints_auto_source_map_relative(self): """ @@ -612,8 +611,13 @@ def test_breakpoints_auto_source_map_relative(self): self.verify_source_map_deduce_statistics(target, 0) # Verify auto deduced source map when file path in debug info - # is a suffix of request breakpoint file path - path = "/x/y/a/b/c/main.cpp" + # is a suffix of request breakpoint file path. + # Note the path must be absolute. 
+ path = ( + "/x/y/a/b/c/main.cpp" + if lldbplatformutil.getHostPlatform() != "windows" + else r"C:\x\y\a\b\c\main.cpp" + ) bp = target.BreakpointCreateByLocation(path, 2) self.assertGreater( bp.GetNumLocations(), @@ -625,7 +629,11 @@ def test_breakpoints_auto_source_map_relative(self): source_map_json = self.get_source_map_json() self.assertEqual(len(source_map_json), 1, "source map should not be empty") - self.verify_source_map_entry_pair(source_map_json[0], ".", "/x/y") + self.verify_source_map_entry_pair( + source_map_json[0], + ".", + "/x/y" if lldbplatformutil.getHostPlatform() != "windows" else r"C:\x\y", + ) self.verify_source_map_deduce_statistics(target, 1) # Reset source map. From e37da2cef7ea44f6aa52dd37978b287a0741c39f Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Thu, 23 May 2024 14:01:10 +0400 Subject: [PATCH 018/433] [lldb] Fixed the TestExitDuringExpression test in case of a remote target (#93119) Sometimes this test failed on the assert `The thread exited` in case of a remote target. Increase the timeout to 1 second to avoid a racing condition. 
--- .../API/functionalities/thread/exit_during_expression/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/functionalities/thread/exit_during_expression/main.c b/lldb/test/API/functionalities/thread/exit_during_expression/main.c index eb6d17520986..f633632e96cc 100644 --- a/lldb/test/API/functionalities/thread/exit_during_expression/main.c +++ b/lldb/test/API/functionalities/thread/exit_during_expression/main.c @@ -3,7 +3,7 @@ #include #include -static unsigned int g_timeout = 100000; +static unsigned int g_timeout = 1000000; extern int usleep(unsigned int); From aefd2572a504d675ef623d2f3d61364232b19f26 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 May 2024 11:07:35 +0100 Subject: [PATCH 019/433] [DAG][X86] expandABD - add branchless abds/abdu expansion for 0/-1 comparison result cases (#92780) If the comparison results are allbits masks, we can expand as `abd(lhs, rhs) -> sub(cmpgt(lhs, rhs), xor(sub(lhs, rhs), cmpgt(lhs, rhs)))`, replacing a sub+sub+select pattern with the simpler sub+xor+sub pattern. 
This allows us to remove a lot of X86 specific legalization code, and will be useful in future generic expansion for the legalization work in #92576 Alive2: https://alive2.llvm.org/ce/z/sj863C --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 14 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 32 +- llvm/test/CodeGen/X86/abds-vector-128.ll | 350 ++++----- llvm/test/CodeGen/X86/abds-vector-256.ll | 72 +- llvm/test/CodeGen/X86/abdu-vector-128.ll | 280 ++++---- llvm/test/CodeGen/X86/abdu-vector-256.ll | 72 +- llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 669 ++++++++---------- llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 154 ++-- 8 files changed, 741 insertions(+), 902 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 85bd45a88542..37c72339fe29 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9228,11 +9228,21 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const { DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS), DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS)); - // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) - // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); ISD::CondCode CC = IsSigned ? 
ISD::CondCode::SETGT : ISD::CondCode::SETUGT; SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC); + + // Branchless expansion iff cmp result is allbits: + // abds(lhs, rhs) -> sub(sgt(lhs, rhs), xor(sgt(lhs, rhs), sub(lhs, rhs))) + // abdu(lhs, rhs) -> sub(ugt(lhs, rhs), xor(ugt(lhs, rhs), sub(lhs, rhs))) + if (CCVT == VT && getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) { + SDValue Diff = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); + SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Diff, Cmp); + return DAG.getNode(ISD::SUB, dl, VT, Cmp, Xor); + } + + // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) + // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS), DAG.getNode(ISD::SUB, dl, VT, RHS, LHS)); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 37c591f90f0a..215cbc308e43 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1108,13 +1108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? 
Legal : Custom); } - setOperationAction(ISD::ABDU, MVT::v16i8, Custom); - setOperationAction(ISD::ABDS, MVT::v16i8, Custom); - setOperationAction(ISD::ABDU, MVT::v8i16, Custom); - setOperationAction(ISD::ABDS, MVT::v8i16, Custom); - setOperationAction(ISD::ABDU, MVT::v4i32, Custom); - setOperationAction(ISD::ABDS, MVT::v4i32, Custom); - setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); @@ -1132,9 +1125,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::ABS, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::ABS, VT, Custom); + setOperationAction(ISD::ABDS, VT, Custom); + setOperationAction(ISD::ABDU, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1336,11 +1331,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); - for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { - setOperationAction(ISD::ABDS, VT, Custom); - setOperationAction(ISD::ABDU, VT, Custom); - } - setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom); setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom); @@ -28421,18 +28411,6 @@ static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, } } - // TODO: Move to TargetLowering expandABD(). 
- if (!Subtarget.hasSSE41() && - ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) { - SDValue LHS = DAG.getFreeze(Op.getOperand(0)); - SDValue RHS = DAG.getFreeze(Op.getOperand(1)); - ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT; - SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC); - SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); - SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS); - return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG); - } - // Default to expand. return SDValue(); } diff --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll index 3143bf619065..bcb42002fb08 100644 --- a/llvm/test/CodeGen/X86/abds-vector-128.ll +++ b/llvm/test/CodeGen/X86/abds-vector-128.ll @@ -12,14 +12,12 @@ define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-LABEL: abd_ext_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psubb %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v16i8: @@ -47,14 +45,12 @@ define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-LABEL: abd_ext_v16i8_undef: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psubb %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm0, 
%xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v16i8_undef: @@ -128,14 +124,12 @@ define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind { define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-LABEL: abd_ext_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v4i32: @@ -163,14 +157,12 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-LABEL: abd_ext_v4i32_undef: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v4i32_undef: @@ -198,61 +190,48 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind { define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-LABEL: abd_ext_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: movq %rdi, %r8 -; SSE2-NEXT: sarq $63, %r8 -; SSE2-NEXT: movq %xmm1, %r9 -; SSE2-NEXT: movq %r9, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: subq %r9, %rdx -; SSE2-NEXT: sbbq %r10, %rsi -; SSE2-NEXT: subq %rdi, %rax -; SSE2-NEXT: sbbq %r8, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: subq %rcx, %rax -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: subq %rsi, %rdx -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v2i64: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm2 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psubq %xmm1, %xmm3 -; SSE42-NEXT: psubq %xmm0, %xmm1 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm2, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: movapd %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_ext_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; 
AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_ext_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_ext_v2i64: @@ -272,61 +251,48 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-LABEL: abd_ext_v2i64_undef: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: movq %rdi, %r8 -; SSE2-NEXT: sarq $63, %r8 -; SSE2-NEXT: movq %xmm1, %r9 -; SSE2-NEXT: movq %r9, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: subq %r9, %rdx -; SSE2-NEXT: sbbq %r10, %rsi -; SSE2-NEXT: subq %rdi, %rax -; SSE2-NEXT: sbbq %r8, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: subq %rcx, %rax -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: subq %rsi, %rdx -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; 
SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v2i64_undef: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm2 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psubq %xmm1, %xmm3 -; SSE42-NEXT: psubq %xmm0, %xmm1 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm2, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: movapd %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_ext_v2i64_undef: ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_ext_v2i64_undef: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_ext_v2i64_undef: @@ -350,14 +316,12 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { define <16 x i8> @abd_minmax_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-LABEL: abd_minmax_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psubb %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, 
%xmm0 +; SSE2-NEXT: psubb %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_minmax_v16i8: @@ -404,14 +368,12 @@ define <8 x i16> @abd_minmax_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-LABEL: abd_minmax_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_minmax_v4i32: @@ -445,47 +407,40 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_minmax_v2i64: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm2 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psubq %xmm1, %xmm3 -; SSE42-NEXT: psubq 
%xmm0, %xmm1 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm2, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: movapd %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_minmax_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_minmax_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_minmax_v2i64: @@ -507,14 +462,12 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { define <16 x i8> @abd_cmp_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-LABEL: abd_cmp_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psubb %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubb %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_cmp_v16i8: @@ -563,14 +516,12 @@ define <8 x i16> @abd_cmp_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { define <4 x i32> @abd_cmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-LABEL: abd_cmp_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: 
movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_cmp_v4i32: @@ -598,9 +549,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-LABEL: abd_cmp_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -609,12 +560,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: psubq %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -622,28 +570,26 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm2 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psubq %xmm1, %xmm3 -; SSE42-NEXT: psubq %xmm0, %xmm1 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm2, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: movapd %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_cmp_v2i64: ; AVX1: # %bb.0: ; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_cmp_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_cmp_v2i64: @@ -790,50 +736,52 @@ define <2 x i64> @abd_subnsw_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-LABEL: abd_cmp_v2i64_multiuse_cmp: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; 
SSE2-NEXT: psubq %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pcmpgtq %xmm1, %xmm2 +; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: psubq %xmm1, %xmm3 -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: psubq %xmm2, %xmm4 -; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; SSE42-NEXT: pcmpgtq %xmm2, %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm3 +; SSE42-NEXT: psubq %xmm3, %xmm2 +; SSE42-NEXT: pcmpgtq %xmm0, %xmm1 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm0 -; SSE42-NEXT: paddq %xmm4, %xmm0 +; SSE42-NEXT: paddq %xmm2, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp: ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm4 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -844,8 +792,8 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm4 -; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm3, %xmm3 +; AVX2-NEXT: vpsubq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, 
%xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/abds-vector-256.ll b/llvm/test/CodeGen/X86/abds-vector-256.ll index 78190d2cb7d8..cc63ad04c08a 100644 --- a/llvm/test/CodeGen/X86/abds-vector-256.ll +++ b/llvm/test/CodeGen/X86/abds-vector-256.ll @@ -223,22 +223,22 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5 -; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_ext_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_ext_v4i64: @@ -261,22 +261,22 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5 -; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; 
AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_ext_v4i64_undef: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_ext_v4i64_undef: @@ -402,22 +402,22 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5 -; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_minmax_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, 
%ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_minmax_v4i64: @@ -544,22 +544,22 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5 -; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_cmp_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_cmp_v4i64: diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll index 0c33e8973c2d..78b315a3773e 100644 --- a/llvm/test/CodeGen/X86/abdu-vector-128.ll +++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll @@ -125,12 +125,10 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubd %xmm0, %xmm3 ; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa 
%xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v4i32: @@ -163,12 +161,10 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubd %xmm0, %xmm3 ; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v4i32_undef: @@ -196,27 +192,22 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind { define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-LABEL: abd_ext_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: movq %xmm1, %rsi -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: subq %rsi, %rcx -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: sbbq %rsi, %rsi -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: sbbq %rdi, %rdi -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: xorq %rdi, %rax -; SSE2-NEXT: subq %rdi, %rax -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: xorq %rsi, %rcx -; SSE2-NEXT: subq %rsi, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; 
SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v2i64: @@ -226,12 +217,10 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE42-NEXT: pxor %xmm2, %xmm3 ; SSE42-NEXT: pxor %xmm0, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm3, %xmm2 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psubq %xmm1, %xmm3 -; SSE42-NEXT: psubq %xmm0, %xmm1 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm2, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: movapd %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_ext_v2i64: @@ -241,9 +230,9 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_ext_v2i64: @@ -252,9 +241,9 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_ext_v2i64: @@ -274,27 +263,22 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-LABEL: 
abd_ext_v2i64_undef: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: movq %xmm1, %rsi -; SSE2-NEXT: xorl %edi, %edi -; SSE2-NEXT: subq %rsi, %rcx -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: sbbq %rsi, %rsi -; SSE2-NEXT: subq %rdx, %rax -; SSE2-NEXT: sbbq %rdi, %rdi -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: xorq %rdi, %rax -; SSE2-NEXT: subq %rdi, %rax -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: xorq %rsi, %rcx -; SSE2-NEXT: subq %rsi, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_ext_v2i64_undef: @@ -304,12 +288,10 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE42-NEXT: pxor %xmm2, %xmm3 ; SSE42-NEXT: pxor %xmm0, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm3, %xmm2 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psubq %xmm1, %xmm3 -; SSE42-NEXT: psubq %xmm0, %xmm1 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm2, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: movapd %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_ext_v2i64_undef: @@ 
-319,9 +301,9 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_ext_v2i64_undef: @@ -330,9 +312,9 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_ext_v2i64_undef: @@ -411,12 +393,10 @@ define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubd %xmm0, %xmm3 ; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_minmax_v4i32: @@ -450,19 +430,14 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 
%xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_minmax_v2i64: @@ -472,12 +447,10 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE42-NEXT: pxor %xmm2, %xmm3 ; SSE42-NEXT: pxor %xmm0, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm3, %xmm2 -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psubq %xmm1, %xmm3 -; SSE42-NEXT: psubq %xmm0, %xmm1 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm2, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: movapd %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_minmax_v2i64: @@ -487,9 +460,9 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_minmax_v2i64: @@ -498,9 +471,9 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvpd %xmm2, 
%xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_minmax_v2i64: @@ -579,12 +552,10 @@ define <4 x i32> @abd_cmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubd %xmm0, %xmm3 ; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_cmp_v4i32: @@ -612,9 +583,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-LABEL: abd_cmp_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -623,12 +594,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: psubq %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -639,12 +607,10 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE42-NEXT: pxor %xmm2, %xmm3 ; SSE42-NEXT: pxor %xmm0, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm3, %xmm2 -; SSE42-NEXT: movdqa %xmm0, 
%xmm3 -; SSE42-NEXT: psubq %xmm1, %xmm3 -; SSE42-NEXT: psubq %xmm0, %xmm1 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm2, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 ; SSE42-NEXT: movdqa %xmm2, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: movapd %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_cmp_v2i64: @@ -654,9 +620,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_cmp_v2i64: @@ -665,9 +631,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_cmp_v2i64: @@ -692,63 +658,59 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: pxor %xmm3, %xmm1 +; 
SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: paddq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: psubq %xmm1, %xmm2 -; SSE42-NEXT: movdqa %xmm1, %xmm3 -; SSE42-NEXT: psubq %xmm0, %xmm3 -; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; SSE42-NEXT: pxor %xmm4, %xmm1 -; SSE42-NEXT: pxor %xmm4, %xmm0 +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; SSE42-NEXT: pxor %xmm3, %xmm1 +; SSE42-NEXT: pxor %xmm3, %xmm0 ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: paddq %xmm3, %xmm0 +; SSE42-NEXT: pxor %xmm0, %xmm2 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: psubq %xmm2, %xmm1 +; SSE42-NEXT: paddq %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; 
AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm1 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/abdu-vector-256.ll b/llvm/test/CodeGen/X86/abdu-vector-256.ll index 884515cfedd0..080fb779fecb 100644 --- a/llvm/test/CodeGen/X86/abdu-vector-256.ll +++ b/llvm/test/CodeGen/X86/abdu-vector-256.ll @@ -227,15 +227,15 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, 
%xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -245,9 +245,9 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_ext_v4i64: @@ -274,15 +274,15 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -292,9 +292,9 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; 
AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_ext_v4i64_undef: @@ -424,15 +424,15 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -442,9 +442,9 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_minmax_v4i64: @@ -575,15 +575,15 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, 
%xmm4, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -593,9 +593,9 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: abd_cmp_v4i64: diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index c6e8b7532505..3b5ff12fb4ec 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -31,10 +31,8 @@ define <4 x i32> @vec128_i32_signed_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwi ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psubd %xmm1, %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: psubd %xmm4, %xmm2 ; SSE2-NEXT: psrld $1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 @@ -179,25 +177,22 @@ define <4 x i32> @vec128_i32_unsigned_reg_reg(<4 x i32> %a1, <4 x i32> %a2) noun ; SSE2-LABEL: vec128_i32_unsigned_reg_reg: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, 
%xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm1, %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: psubd %xmm3, %xmm2 ; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -349,10 +344,8 @@ define <4 x i32> @vec128_i32_signed_mem_reg(ptr %a1_addr, <4 x i32> %a2) nounwin ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: psubd %xmm4, %xmm2 ; SSE2-NEXT: psrld $1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 @@ -511,10 +504,8 @@ define <4 x i32> @vec128_i32_signed_reg_mem(<4 x i32> %a1, ptr %a2_addr) nounwin ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psubd %xmm1, 
%xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: psubd %xmm4, %xmm2 ; SSE2-NEXT: psrld $1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 @@ -674,10 +665,8 @@ define <4 x i32> @vec128_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: psubd %xmm4, %xmm2 ; SSE2-NEXT: psrld $1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 @@ -844,74 +833,66 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi ; SSE2-LABEL: vec128_i64_signed_reg_reg: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; 
SSE2-NEXT: psrlq $1, %xmm1 -; SSE2-NEXT: psrlq $33, %xmm3 -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1] +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: psubq %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psrlq $1, %xmm3 +; SSE2-NEXT: psrlq $33, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrlq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: paddq %xmm3, %xmm4 +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: paddq %xmm2, %xmm4 ; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: paddq %xmm3, %xmm0 ; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_signed_reg_reg: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1] -; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psubq %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psubq %xmm1, %xmm4 -; SSE41-NEXT: psubq %xmm2, %xmm1 -; SSE41-NEXT: 
blendvpd %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $33, %xmm1 -; SSE41-NEXT: pmuludq %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] +; SSE41-NEXT: por %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: psubq %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlq $1, %xmm3 +; SSE41-NEXT: psrlq $33, %xmm2 +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: psrlq $32, %xmm4 -; SSE41-NEXT: pmuludq %xmm0, %xmm4 -; SSE41-NEXT: paddq %xmm1, %xmm4 +; SSE41-NEXT: pmuludq %xmm3, %xmm4 +; SSE41-NEXT: paddq %xmm2, %xmm4 ; SSE41-NEXT: psllq $32, %xmm4 -; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 +; SSE41-NEXT: pmuludq %xmm1, %xmm3 +; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -919,9 +900,9 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi ; AVX: # %bb.0: ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2 ; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 @@ -938,9 +919,9 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi ; XOP: # %bb.0: ; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 ; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, 
%xmm3 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 @@ -1027,74 +1008,66 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; SSE2-LABEL: vec128_i64_unsigned_reg_reg: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: psrlq $1, %xmm1 -; SSE2-NEXT: psrlq $33, %xmm3 -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1] +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm3 
+; SSE2-NEXT: psubq %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psrlq $1, %xmm3 +; SSE2-NEXT: psrlq $33, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrlq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: paddq %xmm3, %xmm4 +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: paddq %xmm2, %xmm4 ; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: paddq %xmm3, %xmm0 ; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_unsigned_reg_reg: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1] -; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psubq %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psubq %xmm1, %xmm4 -; SSE41-NEXT: psubq %xmm2, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $33, %xmm1 -; SSE41-NEXT: pmuludq %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm5, 
%xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] +; SSE41-NEXT: por %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: psubq %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlq $1, %xmm3 +; SSE41-NEXT: psrlq $33, %xmm2 +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: psrlq $32, %xmm4 -; SSE41-NEXT: pmuludq %xmm0, %xmm4 -; SSE41-NEXT: paddq %xmm1, %xmm4 +; SSE41-NEXT: pmuludq %xmm3, %xmm4 +; SSE41-NEXT: paddq %xmm2, %xmm4 ; SSE41-NEXT: psllq $32, %xmm4 -; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 +; SSE41-NEXT: pmuludq %xmm1, %xmm3 +; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1106,9 +1079,9 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm2 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 @@ -1128,9 +1101,9 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm2 ; AVX2-NEXT: vpsrlq $33, %xmm1, %xmm1 ; 
AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 @@ -1147,9 +1120,9 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; XOP: # %bb.0: ; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2 ; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 @@ -1239,76 +1212,67 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; 
SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1] +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: psubq %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrlq $1, %xmm0 -; SSE2-NEXT: psrlq $33, %xmm3 -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm0, %xmm4 -; SSE2-NEXT: paddq %xmm3, %xmm4 -; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: psrlq $33, %xmm2 +; SSE2-NEXT: pmuludq %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm3 +; SSE2-NEXT: pmuludq %xmm0, %xmm3 +; SSE2-NEXT: paddq %xmm2, %xmm3 +; SSE2-NEXT: psllq $32, %xmm3 +; SSE2-NEXT: pmuludq %xmm4, %xmm0 ; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: paddq %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_signed_mem_reg: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa (%rdi), %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: psubq %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm6 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm3 -; SSE41-NEXT: 
psubq %xmm2, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1] +; SSE41-NEXT: por %xmm2, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: psubq %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $33, %xmm1 -; SSE41-NEXT: pmuludq %xmm6, %xmm1 -; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: psrlq $33, %xmm2 +; SSE41-NEXT: pmuludq %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: psrlq $32, %xmm3 ; SSE41-NEXT: pmuludq %xmm0, %xmm3 -; SSE41-NEXT: paddq %xmm1, %xmm3 +; SSE41-NEXT: paddq %xmm2, %xmm3 ; SSE41-NEXT: psllq $32, %xmm3 -; SSE41-NEXT: pmuludq %xmm6, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 +; SSE41-NEXT: pmuludq %xmm4, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: retq ; @@ -1317,9 +1281,9 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin ; AVX-NEXT: vmovdqa (%rdi), %xmm1 ; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm4 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vpsrlq $1, %xmm0, %xmm2 ; AVX-NEXT: vpsrlq $33, %xmm0, %xmm0 ; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 @@ -1337,9 +1301,9 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin ; XOP-NEXT: vmovdqa (%rdi), %xmm1 ; XOP-NEXT: vpcomgtq %xmm0, %xmm1, %xmm2 ; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm4 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm0, %xmm0 +; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vpxor %xmm2, 
%xmm0, %xmm0 +; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm2 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 ; XOP-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 @@ -1442,15 +1406,10 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1] ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: psrlq $1, %xmm1 ; SSE2-NEXT: psrlq $33, %xmm3 @@ -1467,39 +1426,37 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; ; SSE41-LABEL: vec128_i64_signed_reg_mem: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa (%rdi), %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm6 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: 
movdqa %xmm1, %xmm3 -; SSE41-NEXT: psubq %xmm2, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $33, %xmm2 -; SSE41-NEXT: pmuludq %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: psrlq $32, %xmm3 -; SSE41-NEXT: pmuludq %xmm0, %xmm3 -; SSE41-NEXT: paddq %xmm2, %xmm3 -; SSE41-NEXT: psllq $32, %xmm3 -; SSE41-NEXT: pmuludq %xmm6, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm2, %xmm3 +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [1,1] +; SSE41-NEXT: por %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psubq %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: psubq %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: psrlq $1, %xmm1 +; SSE41-NEXT: psrlq $33, %xmm3 +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrlq $32, %xmm4 +; SSE41-NEXT: pmuludq %xmm1, %xmm4 +; SSE41-NEXT: paddq %xmm3, %xmm4 +; SSE41-NEXT: psllq $32, %xmm4 +; SSE41-NEXT: pmuludq %xmm2, %xmm1 ; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: paddq %xmm3, %xmm0 +; SSE41-NEXT: paddq %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: vec128_i64_signed_reg_mem: @@ -1507,9 +1464,9 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; AVX-NEXT: vmovdqa (%rdi), %xmm1 ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2 ; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 @@ 
-1527,9 +1484,9 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; XOP-NEXT: vmovdqa (%rdi), %xmm1 ; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 ; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 @@ -1620,75 +1577,67 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: 
movdqa {{.*#+}} xmm4 = [1,1] +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: psubq %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrlq $1, %xmm0 -; SSE2-NEXT: psrlq $33, %xmm3 -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm0, %xmm4 -; SSE2-NEXT: paddq %xmm3, %xmm4 -; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: psrlq $33, %xmm2 +; SSE2-NEXT: pmuludq %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm3 +; SSE2-NEXT: pmuludq %xmm0, %xmm3 +; SSE2-NEXT: paddq %xmm2, %xmm3 +; SSE2-NEXT: psllq $32, %xmm3 +; SSE2-NEXT: pmuludq %xmm4, %xmm0 ; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: paddq %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_signed_mem_mem: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: movdqa (%rsi), %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: movdqa (%rsi), %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psubq %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm3, %xmm6 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movdqa 
%xmm1, %xmm3 -; SSE41-NEXT: psubq %xmm2, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1] +; SSE41-NEXT: por %xmm2, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: psubq %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psrlq $1, %xmm0 ; SSE41-NEXT: psrlq $33, %xmm2 -; SSE41-NEXT: pmuludq %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pmuludq %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: psrlq $32, %xmm3 ; SSE41-NEXT: pmuludq %xmm0, %xmm3 ; SSE41-NEXT: paddq %xmm2, %xmm3 ; SSE41-NEXT: psllq $32, %xmm3 -; SSE41-NEXT: pmuludq %xmm6, %xmm0 +; SSE41-NEXT: pmuludq %xmm4, %xmm0 ; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: retq @@ -1699,9 +1648,9 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX-NEXT: vmovdqa (%rsi), %xmm1 ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2 ; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 @@ -1720,9 +1669,9 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vmovdqa (%rsi), %xmm1 ; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 ; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm2, 
%xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 @@ -2389,10 +2338,8 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psubb %xmm1, %xmm4 -; SSE2-NEXT: psubb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: psubb %xmm4, %xmm3 ; SSE2-NEXT: psrlw $1, %xmm3 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm1 @@ -2852,10 +2799,8 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psubb %xmm1, %xmm4 -; SSE2-NEXT: psubb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: psubb %xmm4, %xmm3 ; SSE2-NEXT: psrlw $1, %xmm3 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm1 @@ -3083,30 +3028,28 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind { ; SSE2-LABEL: vec128_i8_signed_reg_mem: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psubb %xmm3, %xmm4 -; SSE2-NEXT: psubb %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psubb %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: psubb %xmm4, %xmm3 +; SSE2-NEXT: psrlw $1, %xmm3 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: packuswb %xmm4, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -3321,30 +3264,28 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE2-LABEL: vec128_i8_signed_mem_mem: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psubb %xmm3, %xmm4 -; SSE2-NEXT: psubb %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: 
pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psubb %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: psubb %xmm4, %xmm3 +; SSE2-NEXT: psrlw $1, %xmm3 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pmullw %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: packuswb %xmm4, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index cc08396ae8c7..92060aec3074 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -390,12 +390,12 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: 
vblendvpd %xmm5, %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 @@ -427,9 +427,9 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 @@ -448,12 +448,12 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4 ; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm6 -; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm6 -; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 @@ -561,25 +561,25 @@ define <4 x 
i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm6 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm6 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 +; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9 ; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 ; AVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpsrlq $33, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm7 ; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -601,9 +601,9 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 ; AVX2-NEXT: vpmuludq 
%ymm3, %ymm1, %ymm1 @@ -622,12 +622,12 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vpcomgtuq %xmm2, %xmm3, %xmm4 ; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm5 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm6 -; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm6 -; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 @@ -732,12 +732,12 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm6 -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm6 -; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 @@ -770,9 +770,9 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm4 -; AVX2-NEXT: vpsubq 
%ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2 ; AVX2-NEXT: vpsrlq $33, %ymm0, %ymm0 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm0 @@ -792,12 +792,12 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 ; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4 ; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5 -; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm6 -; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm6 -; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 @@ -902,12 +902,12 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 @@ -940,9 
+940,9 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 @@ -962,12 +962,12 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4 ; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5 -; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm6 -; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm6 -; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1 +; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2 ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 @@ -1073,12 +1073,12 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm6 -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm6 -; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, 
%xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 @@ -1112,9 +1112,9 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 @@ -1135,12 +1135,12 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 ; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4 ; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5 -; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm6 -; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm6 -; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0 +; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 From 7c137f7e510c0fcc1bfa46f8c85063c2a2b190dd Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Thu, 23 May 2024 12:09:51 +0200 Subject: [PATCH 020/433] [mlir][nvvm] Remove unused check-ptx (#93147) The test used the check generated ptx with `CHECK-PTX`, but does not check that 
anymore. The PR removes these lines. --- .../GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir index 2e59b7234e53..391fda82e1e1 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir @@ -6,15 +6,6 @@ // RUN: --entry-point-result=void \ // RUN: | FileCheck %s -// Basic PTX check to make sure we are generating the right instructions. - -// CHECK-PTX: mbarrier.init.shared.b64 -// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64 -// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes -// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes -// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64 -// CHECK-PTX: mbarrier.try_wait.parity.shared.b64 - // RUN: mlir-opt %s --convert-nvgpu-to-nvvm \ // RUN: -gpu-kernel-outlining \ // RUN: -convert-nvvm-to-llvm \ From 31f1590e4fb324c43dc36199587c453e27b6f6fa Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 23 May 2024 18:09:33 +0800 Subject: [PATCH 021/433] [Coroutines] Always set the calling convention of generated resuming call from 'llvm.coro.await.suspend.handle' as fast See the post commit message in https://github.com/llvm/llvm-project/pull/89751 We met a regression due to a change of calling convention of this patch. Previously, the calling convention of indirect resume calls is always fast. And in this patch, although we tried to take care of it in the cloner, we forget the case that we have to update the resuming calls in the ramp functions. So this is the root cause of the downstream failure. This patch tries to mark the generated resuming calls as fast immediately after they got created to make sure the calling convention is correct. 
--- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 2 +- .../coro-await-suspend-handle-in-ramp.ll | 59 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 1d9cf185b75a..5a58a99d2879 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -227,6 +227,7 @@ static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB, FunctionType *ResumeTy = FunctionType::get( Type::getVoidTy(Ctx), PointerType::getUnqual(Ctx), false); auto *ResumeCall = Builder.CreateCall(ResumeTy, ResumeAddr, {NewCall}); + ResumeCall->setCallingConv(CallingConv::Fast); // We can't insert the 'ret' instruction and adjust the cc until the // function has been split, so remember this for later. @@ -1088,7 +1089,6 @@ void CoroCloner::create() { // Turn symmetric transfers into musttail calls. for (CallInst *ResumeCall : Shape.SymmetricTransfers) { ResumeCall = cast(VMap[ResumeCall]); - ResumeCall->setCallingConv(NewF->getCallingConv()); if (TTI.supportsTailCallFor(ResumeCall)) { // FIXME: Could we support symmetric transfer effectively without // musttail? 
diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll new file mode 100644 index 000000000000..85e8bb52fee3 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll @@ -0,0 +1,59 @@ +; Tests lowerings of different versions of coro.await.suspend +; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split),simplifycfg' -S | FileCheck %s + +%Awaiter = type {} + +define void @f() presplitcoroutine { +entry: + %awaiter = alloca %Awaiter + %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call ptr @malloc(i32 %size) + %hdl = call ptr @llvm.coro.begin(token %id, ptr %alloc) + call void @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) + %suspend.init = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %suspend.init, label %ret [ + i8 0, label %step + i8 1, label %cleanup + ] + +; Check the calling convention for resuming function is fastcc +; CHECK: define {{[^@]*}} @f() +; CHECK: entry: +; CHECK: %[[NEXT_HDL:.+]] = call ptr @await_suspend_wrapper_handle( +; CHECK-NEXT: %[[CONT:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[NEXT_HDL]], i8 0) +; CHECK-NEXT: musttail call fastcc void %[[CONT]](ptr %[[NEXT_HDL]]) +step: + br label %cleanup + +cleanup: + %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) + call void @free(ptr %mem) + br label %ret + +ret: + call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + ret void +} + +; check that we were haven't accidentally went out of @f body +; CHECK-LABEL: @f.resume( +; CHECK-LABEL: @f.destroy( +; CHECK-LABEL: @f.cleanup( + +declare ptr @await_suspend_wrapper_handle(ptr, ptr) + +declare ptr @llvm.coro.free(token, ptr) +declare i32 @llvm.coro.size.i32() +declare i8 @llvm.coro.suspend(token, i1) +declare void @llvm.coro.resume(ptr) +declare void @llvm.coro.destroy(ptr) + +declare 
token @llvm.coro.id(i32, ptr, ptr, ptr) +declare i1 @llvm.coro.alloc(token) +declare ptr @llvm.coro.begin(token, ptr) +declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr) +declare i1 @llvm.coro.end(ptr, i1, token) + +declare noalias ptr @malloc(i32) +declare void @free(ptr) From d7c37130008374341e79c355ad85cc48942136ff Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 23 May 2024 12:26:24 +0200 Subject: [PATCH 022/433] [AMDGPU] Add AMDGPU-specific module splitting (#89245) This enables the --lto-partitions option to work more consistently. This module splitting logic is fully aware of AMDGPU modules and their specificities and takes advantage of them to split modules in a way that avoids compilation issues (such as resource usage being incorrectly represented). This also includes a logging system that's more elaborate than just LLVM_DEBUG which allows printing logs to uniquely named files, and optionally with all value names hidden so they can be safely shared without leaking information about the source. Logs can also be enabled through an environment variable, which avoids the sometimes complicated process of passing a -mllvm option all the way from clang driver to the offload linker that handles full LTO codegen.
--- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 744 ++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h | 30 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 + llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 4 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + .../address-taken-externalize-with-call.ll | 46 ++ .../AMDGPU/address-taken-externalize.ll | 37 + .../llvm-split/AMDGPU/debug-name-hiding.ll | 20 + .../AMDGPU/kernels-alias-dependencies.ll | 45 ++ .../llvm-split/AMDGPU/kernels-cost-ranking.ll | 54 ++ .../llvm-split/AMDGPU/kernels-dependencies.ll | 50 ++ .../AMDGPU/kernels-dependency-duplication.ll | 41 + .../AMDGPU/kernels-dependency-external.ll | 64 ++ .../AMDGPU/kernels-dependency-indirect.ll | 76 ++ .../AMDGPU/kernels-dependency-overridable.ll | 40 + .../kernels-global-variables-noexternal.ll | 42 + .../AMDGPU/kernels-global-variables.ll | 44 ++ .../AMDGPU/kernels-load-balancing.ll | 75 ++ .../AMDGPU/kernels-no-dependencies.ll | 39 + .../AMDGPU/large-kernels-merging.ll | 98 +++ .../tools/llvm-split/AMDGPU/lit.local.cfg | 2 + 21 files changed, 1560 insertions(+) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h create mode 100644 llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll create mode 100644 
llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp new file mode 100644 index 000000000000..56e275ce707b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -0,0 +1,744 @@ +//===- AMDGPUSplitModule.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Implements a module splitting algorithm designed to support the +/// FullLTO --lto-partitions option for parallel codegen. This is completely +/// different from the common SplitModule pass, as this system is designed with +/// AMDGPU in mind. +/// +/// The basic idea of this module splitting implementation is the same as +/// SplitModule: load-balance the module's functions across a set of N +/// partitions to allow parallel codegen. However, it does it very +/// differently than the target-agnostic variant: +/// - Kernels are used as the module's "roots". +/// They're known entry points on AMDGPU, and everything else is often +/// internal only. 
+/// - Each kernel has a set of dependencies, and when a kernel and its +/// dependencies is considered "big", we try to put it in a partition where +/// most dependencies are already imported, to avoid duplicating large +/// amounts of code. +/// - There's special care for indirect calls in order to ensure +/// AMDGPUResourceUsageAnalysis can work correctly. +/// +/// This file also includes a more elaborate logging system to enable +/// users to easily generate logs that (if desired) do not include any value +/// names, in order to not leak information about the source file. +/// Such logs are very helpful to understand and fix potential issues with +/// module splitting. + +#include "AMDGPUSplitModule.h" +#include "AMDGPUTargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/SHA256.h" +#include "llvm/Support/Threading.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-split-module" + +namespace { + +static cl::opt LargeKernelFactor( + "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f), + cl::Hidden, + cl::desc( + "consider a kernel as large and needing special treatment when it " + "exceeds the average cost of a partition by this factor; e;g. 
2.0 " + "means if the kernel and its dependencies is 2 times bigger than " + "an average partition; 0 disables large kernels handling entirely")); + +static cl::opt LargeKernelOverlapForMerge( + "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f), + cl::Hidden, + cl::desc("defines how much overlap between two large kernel's dependencies " + "is needed to put them in the same partition")); + +static cl::opt NoExternalizeGlobals( + "amdgpu-module-splitting-no-externalize-globals", cl::Hidden, + cl::desc("disables externalization of global variable with local linkage; " + "may cause globals to be duplicated which increases binary size")); + +static cl::opt + LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden, + cl::desc("output directory for AMDGPU module splitting logs")); + +static cl::opt + LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden, + cl::desc("hash value names before printing them in the AMDGPU " + "module splitting logs")); + +using CostType = InstructionCost::CostType; +using PartitionID = unsigned; + +static bool isEntryPoint(const Function *F) { + return AMDGPU::isEntryFunctionCC(F->getCallingConv()); +} + +static std::string getName(const Value &V) { + static bool HideNames; + + static llvm::once_flag HideNameInitFlag; + llvm::call_once(HideNameInitFlag, [&]() { + if (LogPrivate.getNumOccurrences()) + HideNames = LogPrivate; + else { + const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE"); + HideNames = (EV.value_or("0") != "0"); + } + }); + + if (!HideNames) + return V.getName().str(); + return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())), + /*LowerCase=*/true); +} + +/// Main logging helper. +/// +/// Logging can be configured by the following environment variable. +/// AMD_SPLIT_MODULE_LOG_DIR= +/// If set, uses as the directory to write logfiles to +/// each time module splitting is used. +/// AMD_SPLIT_MODULE_LOG_PRIVATE +/// If set to anything other than zero, all names are hidden. 
+///
+/// Both environment variables have corresponding CL options which
+/// take priority over them.
+///
+/// Any output printed to the log files is also printed to dbgs() when -debug is
+/// used and LLVM_DEBUG is defined.
+///
+/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
+/// cannot be removed from the code (by building without debug). This probably
+/// has a small performance cost because if some computation/formatting is
+/// needed for logging purpose, it may be done every time only to be ignored
+/// by the logger.
+///
+/// As this pass only runs once and is not doing anything computationally
+/// expensive, this is likely a reasonable trade-off.
+///
+/// If some computation should really be avoided when unused, users of the class
+/// can check whether any logging will occur by using the bool operator.
+///
+/// \code
+///   if (SML) {
+///     // Executes only if logging to a file or if -debug is available and
+///     // used.
+///   }
+/// \endcode
+class SplitModuleLogger {
+public:
+  /// Opens a uniquely named log file if a log directory was requested through
+  /// the CL option or, failing that, the AMD_SPLIT_MODULE_LOG_DIR environment
+  /// variable. Aborts with a fatal error if the file cannot be created.
+  /// \p M is currently unused.
+  explicit SplitModuleLogger(const Module &M) {
+    std::string LogDir = LogDirOpt;
+    if (LogDir.empty())
+      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
+
+    // No log dir specified means we don't need to log to a file.
+    // We may still log to dbgs(), though.
+    if (LogDir.empty())
+      return;
+
+    // If a log directory is specified, create a new file with a unique name in
+    // that directory.
+    int Fd;
+    SmallString<0> PathTemplate;
+    SmallString<0> RealPath;
+    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
+    if (auto Err =
+            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
+      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
+                             "': " + Err.message(),
+                         /*CrashDiag=*/false);
+    }
+
+    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
+  }
+
+  /// \returns true if a log file was opened by the constructor.
+  bool hasLogFile() const { return FileOS != nullptr; }
+
+  /// \returns the log file stream; only valid when \ref hasLogFile is true.
+  raw_ostream &logfile() {
+    assert(FileOS && "no logfile!");
+    return *FileOS;
+  }
+
+  /// \returns true if this SML will log anything either to a file or dbgs().
+  /// Can be used to avoid expensive computations that are ignored when logging
+  /// is disabled.
+  ///
+  /// Explicit so the logger only converts in boolean contexts (`if (SML)`)
+  /// and never silently takes part in arithmetic or shift expressions.
+  explicit operator bool() const {
+    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
+  }
+
+private:
+  std::unique_ptr<raw_fd_ostream> FileOS;
+};
+
+/// Streams \p Val to dbgs() (under LLVM_DEBUG) and to the log file, if any.
+/// Printing a Value directly is rejected at compile time so that names always
+/// go through getName() and respect name hiding.
+template <typename Ty>
+static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
+  static_assert(
+      !std::is_same_v<Ty, Value>,
+      "do not print values to logs directly, use getName instead!");
+  LLVM_DEBUG(dbgs() << Val);
+  if (SML.hasLogFile())
+    SML.logfile() << Val;
+  return SML;
+}
+
+/// Calculate the cost of each function in \p M
+/// \param SML Log Helper
+/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
+/// \param M Module to analyze.
+/// \param CostMap[out] Resulting Function -> Cost map.
+/// \return The module's total cost.
+static CostType
+calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
+                       Module &M,
+                       DenseMap<const Function *, CostType> &CostMap) {
+  CostType ModuleCost = 0;
+  CostType KernelCost = 0;
+
+  for (auto &Fn : M) {
+    if (Fn.isDeclaration())
+      continue;
+
+    CostType FnCost = 0;
+    TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn);
+
+    for (const auto &BB : Fn) {
+      for (const auto &I : BB) {
+        auto Cost =
+            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+        assert(Cost != InstructionCost::getMax());
+        // Assume expensive if we can't tell the cost of an instruction.
+        CostType CostVal =
+            Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
+        assert((FnCost + CostVal) >= FnCost && "Overflow!");
+        FnCost += CostVal;
+      }
+    }
+
+    assert(FnCost != 0);
+
+    CostMap[&Fn] = FnCost;
+    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
+    ModuleCost += FnCost;
+
+    if (isEntryPoint(&Fn))
+      KernelCost += FnCost;
+  }
+
+  // A module without any function definition has a total cost of zero; use 1
+  // as the divisor in that case so the percentages below are 0 and not NaN.
+  const CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
+  const CostType NonKernelCost = ModuleCost - KernelCost;
+  SML << "=> Total Module Cost: " << ModuleCost << '\n'
+      << " => KernelCost: " << KernelCost << " ("
+      << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
+      << " => FnsCost: " << NonKernelCost << " ("
+      << format("%0.2f", (float(NonKernelCost) / ModuleCostOr1) * 100)
+      << "%)\n";
+
+  return ModuleCost;
+}
+
+/// \returns true if \p F is a non-entry-point definition that either does not
+/// have local linkage or has its address taken (ignoring assume-like calls,
+/// llvm.used and casted direct calls).
+static bool canBeIndirectlyCalled(const Function &F) {
+  if (F.isDeclaration() || isEntryPoint(&F))
+    return false;
+  return !F.hasLocalLinkage() ||
+         F.hasAddressTaken(/*PutOffender=*/nullptr,
+                           /*IgnoreCallbackUses=*/false,
+                           /*IgnoreAssumeLikeCalls=*/true,
+                           /*IgnoreLLVMUsed=*/true,
+                           /*IgnoreARCAttachedCall=*/false,
+                           /*IgnoreCastedDirectCall=*/true);
+}
+
+/// When a kernel or any of its callees performs an indirect call, this function
+/// takes over \ref addAllDependencies and adds all potentially callable
+/// functions to \p Fns so they can be counted as dependencies of the kernel.
+///
+/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
+/// presence of an indirect call, the function's resource usage is the same as
+/// the most expensive function in the module.
+/// \param M The module.
+/// \param Fns[out] Resulting list of functions.
+static void addAllIndirectCallDependencies(const Module &M,
+                                           DenseSet<const Function *> &Fns) {
+  for (const auto &Fn : M) {
+    if (canBeIndirectlyCalled(Fn))
+      Fns.insert(&Fn);
+  }
+}
+
+/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
+/// callee until all reachable functions have been gathered.
+///
+/// \param SML Log Helper
+/// \param CG Call graph for \p Fn's module.
+/// \param Fn Current function to look at.
+/// \param Fns[out] Resulting list of functions.
+/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
+/// point, either in \p Fn or in one of the function it calls. When that
+/// happens, all potentially indirectly callable functions inside \p Fn's
+/// module are added to \p Fns as well.
+static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
+                               const Function &Fn,
+                               DenseSet<const Function *> &Fns,
+                               bool &HadIndirectCall) {
+  assert(!Fn.isDeclaration());
+
+  const Module &M = *Fn.getParent();
+  SmallVector<const Function *> WorkList({&Fn});
+  // Traversal state is tracked separately from Fns: once the indirect-call
+  // fallback has bulk-inserted functions into Fns, membership in Fns no longer
+  // tells us whether a function's own callees have been walked.
+  DenseSet<const Function *> Visited;
+  Visited.insert(&Fn);
+  bool AddedIndirectCallDeps = false;
+
+  while (!WorkList.empty()) {
+    const auto &CurFn = *WorkList.pop_back_val();
+    assert(!CurFn.isDeclaration());
+
+    for (auto &CGEntry : *CG[&CurFn]) {
+      auto *CGNode = CGEntry.second;
+      auto *Callee = CGNode->getFunction();
+      if (!Callee) {
+        // Functions have an edge towards CallsExternalNode if they're external
+        // declarations, or if they do an indirect call. As we only process
+        // definitions here, we know this means the function has an indirect
+        // call. We then have to conservatively assume this can call all
+        // non-entrypoint functions in the module.
+        if (CGNode != CG.getCallsExternalNode())
+          continue; // this is another function-less node we don't care about.
+
+        HadIndirectCall = true;
+        if (!AddedIndirectCallDeps) {
+          AddedIndirectCallDeps = true;
+          SML << "Indirect call detected in " << getName(CurFn)
+              << " - treating all non-entrypoint functions as "
+                 "potential dependencies\n";
+
+          // TODO: Print an ORE as well ?
+          addAllIndirectCallDependencies(M, Fns);
+        }
+
+        // Keep going instead of returning: the remaining call edges and the
+        // functions still in the worklist may directly reach local functions
+        // that the indirect-call fallback does not cover.
+        continue;
+      }
+
+      if (Callee->isDeclaration())
+        continue;
+
+      Fns.insert(Callee);
+      if (Visited.insert(Callee).second)
+        WorkList.push_back(Callee);
+    }
+  }
+}
+
+/// Contains information about a kernel and its dependencies.
+struct KernelWithDependencies {
+  /// Gathers the dependencies of \p Fn through \ref addAllDependencies and
+  /// sums up their costs (looked up in \p FnCosts) into \ref TotalCost.
+  KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
+                         const DenseMap<const Function *, CostType> &FnCosts,
+                         const Function *Fn)
+      : Fn(Fn) {
+    addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
+    TotalCost = FnCosts.at(Fn);
+    for (const auto *Dep : Dependencies) {
+      TotalCost += FnCosts.at(Dep);
+
+      // We cannot duplicate functions with external linkage, or functions that
+      // may be overridden at runtime.
+      HasNonDuplicatableDependecy |=
+          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
+    }
+  }
+
+  const Function *Fn = nullptr;
+  DenseSet<const Function *> Dependencies;
+  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
+  bool HasIndirectCall = false;
+  /// Whether any of \p Fn's dependencies cannot be duplicated.
+  bool HasNonDuplicatableDependecy = false;
+
+  CostType TotalCost = 0;
+
+  /// \returns true if this kernel and its dependencies can be considered large
+  /// according to \p Threshold.
+  bool isLarge(CostType Threshold) const {
+    return TotalCost > Threshold && !Dependencies.empty();
+  }
+};
+
+/// Calculates how much overlap there is between \p A and \p B.
+/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
+/// and B have no shared elements. Kernels do not count in overlap calculation.
+static float calculateOverlap(const DenseSet<const Function *> &A,
+                              const DenseSet<const Function *> &B) {
+  // Start with the non-kernel functions of A; inserting B's functions then
+  // turns Total into the union, while failed insertions count the
+  // intersection.
+  DenseSet<const Function *> Total;
+  for (const auto *F : A) {
+    if (!isEntryPoint(F))
+      Total.insert(F);
+  }
+
+  if (Total.empty())
+    return 0.0f;
+
+  unsigned NumCommon = 0;
+  for (const auto *F : B) {
+    if (isEntryPoint(F))
+      continue;
+
+    if (!Total.insert(F).second)
+      ++NumCommon;
+  }
+
+  return static_cast<float>(NumCommon) / Total.size();
+}
+
+/// Performs all of the partitioning work on \p M.
+/// \param SML Log Helper
+/// \param M Module to partition.
+/// \param NumParts Number of partitions to create.
+/// \param ModuleCost Total cost of all functions in \p M.
+/// \param FnCosts Map of Function -> Cost
+/// \param WorkList Kernels and their dependencies to process in order.
+/// \returns The created partitions (a vector of size \p NumParts )
+static std::vector<DenseSet<const Function *>>
+doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
+               CostType ModuleCost,
+               const DenseMap<const Function *, CostType> &FnCosts,
+               const SmallVector<KernelWithDependencies> &WorkList) {
+
+  SML << "\n--Partitioning Starts--\n";
+
+  // Calculate a "large kernel threshold". When more than one kernel's total
+  // import cost exceeds this value, we will try to merge it with other,
+  // similarly large kernels.
+  //
+  // e.g. let two kernels X and Y have a import cost of ~10% of the module, we
+  // assign X to a partition as usual, but when we get to Y, we check if it's
+  // worth also putting it in X's partition.
+  const CostType LargeKernelThreshold =
+      LargeKernelFactor ? ((ModuleCost / NumParts) * LargeKernelFactor)
+                        : std::numeric_limits<CostType>::max();
+
+  std::vector<DenseSet<const Function *>> Partitions;
+  Partitions.resize(NumParts);
+
+  // Assign a partition to each kernel, and try to keep the partitions more or
+  // less balanced. We do that through a priority queue sorted in reverse, so we
+  // can always look at the partition with the least content.
+  //
+  // There are some cases where we will be deliberately unbalanced though.
+  //  - Large kernels: we try to merge with existing partitions to reduce code
+  //  duplication.
+  //  - Kernels with indirect or external calls always go in the first partition
+  //  (P0).
+  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
+                              const std::pair<PartitionID, CostType> &b) {
+    // When two partitions have the same cost, assign to the one with the
+    // biggest ID first. This allows us to put things in P0 last, because P0 may
+    // have other stuff added later.
+    if (a.second == b.second)
+      return a.first < b.first;
+    return a.second > b.second;
+  };
+
+  // We can't use priority_queue here because we need to be able to access any
+  // element. This makes this a bit inefficient as we need to sort it again
+  // every time we change it, but it's a very small array anyway (likely under
+  // 64 partitions) so it's a cheap operation.
+  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
+  for (unsigned I = 0; I < NumParts; ++I)
+    BalancingQueue.push_back(std::make_pair(I, 0));
+
+  // Helper function to handle assigning a kernel to a partition. This takes
+  // care of updating the balancing queue.
+  const auto AssignToPartition = [&](PartitionID PID,
+                                     const KernelWithDependencies &KWD) {
+    auto &FnsInPart = Partitions[PID];
+    FnsInPart.insert(KWD.Fn);
+    FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());
+
+    SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> ";
+    if (!KWD.Dependencies.empty()) {
+      SML << KWD.Dependencies.size() << " dependencies added\n";
+    }
+
+    // Update the balancing queue. we scan backwards because in the common case
+    // the partition is at the end.
+    for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
+      if (QueuePID == PID) {
+        CostType NewCost = 0;
+        for (auto *Fn : Partitions[PID])
+          NewCost += FnCosts.at(Fn);
+
+        SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
+        if (Cost) {
+          SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
+              << "% increase)";
+        }
+        SML << '\n';
+
+        Cost = NewCost;
+      }
+    }
+
+    sort(BalancingQueue, ComparePartitions);
+  };
+
+  for (auto &CurKernel : WorkList) {
+    // When a kernel has indirect calls, it must stay in the first partition
+    // alongside every reachable non-entry function. This is a nightmare case
+    // for splitting as it severely limits what we can do.
+    if (CurKernel.HasIndirectCall) {
+      SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
+          << " defaulting to P0\n";
+      AssignToPartition(0, CurKernel);
+      continue;
+    }
+
+    // When a kernel has non duplicatable dependencies, we have to keep it in
+    // the first partition as well. This is a conservative approach, a
+    // finer-grained approach could keep track of which dependencies are
+    // non-duplicatable exactly and just make sure they're grouped together.
+    if (CurKernel.HasNonDuplicatableDependecy) {
+      SML << "Kernel with externally visible dependency "
+          << getName(*CurKernel.Fn) << " defaulting to P0\n";
+      AssignToPartition(0, CurKernel);
+      continue;
+    }
+
+    // Be smart with large kernels to avoid duplicating their dependencies.
+    if (CurKernel.isLarge(LargeKernelThreshold)) {
+      assert(LargeKernelOverlapForMerge >= 0.0f &&
+             LargeKernelOverlapForMerge <= 1.0f);
+      SML << "Large Kernel: " << getName(*CurKernel.Fn)
+          << " - looking for partition with at least "
+          << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";
+
+      bool Assigned = false;
+      for (const auto &[PID, Fns] : enumerate(Partitions)) {
+        float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
+        SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P"
+            << PID << '\n';
+        if (Overlap > LargeKernelOverlapForMerge) {
+          SML << " selecting P" << PID << '\n';
+          AssignToPartition(PID, CurKernel);
+          Assigned = true;
+          // A kernel must be defined in exactly one partition: stop at the
+          // first match instead of also assigning it to every other
+          // partition that overlaps enough.
+          break;
+        }
+      }
+
+      if (Assigned)
+        continue;
+    }
+
+    // Normal "load-balancing", assign to partition with least pressure.
+    auto [PID, CurCost] = BalancingQueue.back();
+    AssignToPartition(PID, CurKernel);
+  }
+
+  // Work is mostly done now, verify the partitioning and add all functions we
+  // may have missed (= unreachable, or we don't understand how they're reached)
+  // to P0.
+  DenseSet<const Function *> AllFunctions;
+  for (const auto &[Idx, Part] : enumerate(Partitions)) {
+    CostType Cost = 0;
+    for (auto *Fn : Part) {
+      // external linkage functions should exclusively be in the first partition
+      // at this stage. In theory, we should only ever see external linkage
+      // functions here if they're kernels, or if they've been added due to a
+      // kernel using indirect calls somewhere in its CallGraph.
+      assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
+      Cost += FnCosts.at(Fn);
+    }
+    SML << "P" << Idx << " has a total cost of " << Cost << " ("
+        << format("%0.2f", (float(Cost) / ModuleCost) * 100)
+        << "% of source module)\n";
+    AllFunctions.insert(Part.begin(), Part.end());
+  }
+
+  // Add missed functions to P0. This will take care of adding things like
+  // external functions with no callers in the module to P0. This should be
+  // fairly rare as AMDGPU internalizes everything in most cases, so unused
+  // internal functions would get removed.
+  for (auto &Fn : M) {
+    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
+      SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
+      Partitions[0].insert(&Fn);
+    }
+  }
+
+  SML << "--Partitioning Done--\n\n";
+
+  return Partitions;
+}
+
+/// Makes \p GV linkable across partitions: local-linkage values become
+/// external with hidden visibility, and unnamed values receive a name.
+static void externalize(GlobalValue &GV) {
+  if (GV.hasLocalLinkage()) {
+    GV.setLinkage(GlobalValue::ExternalLinkage);
+    GV.setVisibility(GlobalValue::HiddenVisibility);
+  }
+
+  // Unnamed entities must be named consistently between modules. setName will
+  // give a distinct name to each such entity.
+  if (!GV.hasName())
+    GV.setName("__llvmsplit_unnamed");
+}
+} // end anonymous namespace
+
+void llvm::splitAMDGPUModule(
+    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
+    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
+
+  SplitModuleLogger SML(M);
+
+  CallGraph CG(M);
+
+  // Externalize functions whose address are taken.
+  //
+  // This is needed because partitioning is purely based on calls, but sometimes
+  // a kernel/function may just look at the address of another local function
+  // and not do anything (no calls). After partitioning, that local function may
+  // end up in a different module (so it's just a declaration in the module
+  // where its address is taken), which emits a "undefined hidden symbol" linker
+  // error.
+  //
+  // Additionally, it guides partitioning to not duplicate this function if it's
+  // called directly at some point.
+  for (auto &Fn : M) {
+    if (Fn.hasAddressTaken()) {
+      if (Fn.hasLocalLinkage()) {
+        // Names go through getName() so that private logs stay anonymized.
+        SML << "[externalize] " << getName(Fn)
+            << " because its address is taken\n";
+      }
+      externalize(Fn);
+    }
+  }
+
+  // Externalize local GVs, which avoids duplicating their initializers, which
+  // in turns helps keep code size in check.
+  if (!NoExternalizeGlobals) {
+    for (auto &GV : M.globals()) {
+      if (GV.hasLocalLinkage())
+        SML << "[externalize] GV " << getName(GV) << '\n';
+      externalize(GV);
+    }
+  }
+
+  // Start by calculating the cost of every function in the module, as well as
+  // the module's overall cost.
+  DenseMap<const Function *, CostType> FnCosts;
+  const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts);
+
+  // Gather every kernel into a WorkList, then sort it by descending total cost
+  // of the kernel so the biggest kernels are seen first.
+  SmallVector<KernelWithDependencies> WorkList;
+  for (auto &Fn : M) {
+    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
+      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+  }
+  sort(WorkList, [&](auto &A, auto &B) {
+    // Sort by total cost, and if the total cost is identical, sort
+    // alphabetically.
+    if (A.TotalCost == B.TotalCost)
+      return A.Fn->getName() < B.Fn->getName();
+    return A.TotalCost > B.TotalCost;
+  });
+
+  if (SML) {
+    SML << "Worklist\n";
+    for (const auto &KWD : WorkList) {
+      SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
+          << " indirect:" << KWD.HasIndirectCall
+          << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy
+          << ")\n";
+      for (const auto *Dep : KWD.Dependencies)
+        SML << " [Dep] " << getName(*Dep) << '\n';
+    }
+  }
+
+  // This performs all of the partitioning work.
+  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
+  assert(Partitions.size() == N);
+
+  // If we didn't externalize GVs, then local GVs need to be conservatively
+  // imported into every module (including their initializers), and then cleaned
+  // up afterwards.
+  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
+    // We conservatively import private/internal GVs into every module and clean
+    // them up afterwards.
+    const auto *Var = dyn_cast<GlobalVariable>(GV);
+    return Var && Var->hasLocalLinkage();
+  };
+
+  SML << "Creating " << N << " modules...\n";
+  unsigned TotalFnImpls = 0;
+  for (unsigned I = 0; I < N; ++I) {
+    const auto &FnsInPart = Partitions[I];
+
+    ValueToValueMapTy VMap;
+    std::unique_ptr<Module> MPart(
+        CloneModule(M, VMap, [&](const GlobalValue *GV) {
+          // Functions go in their assigned partition.
+          if (const auto *Fn = dyn_cast<Function>(GV)) {
+// Check we don't import an external linkage function in any
+// partition other than P0.
+#ifndef NDEBUG
+            if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
+              assert((I == 0) == FnsInPart.contains(Fn));
+            }
+#endif
+            return FnsInPart.contains(Fn);
+          }
+
+          if (NeedsConservativeImport(GV))
+            return true;
+
+          // Everything else goes in the first partition.
+          return I == 0;
+        }));
+
+    // Clean-up conservatively imported GVs without any users.
+    for (auto &GV : make_early_inc_range(MPart->globals())) {
+      if (NeedsConservativeImport(&GV) && GV.use_empty())
+        GV.eraseFromParent();
+    }
+
+    unsigned NumAllFns = 0, NumKernels = 0;
+    for (auto &Cur : *MPart) {
+      if (!Cur.isDeclaration()) {
+        ++NumAllFns;
+        if (isEntryPoint(&Cur))
+          ++NumKernels;
+      }
+    }
+    TotalFnImpls += NumAllFns;
+    SML << " - Module " << I << " with " << NumAllFns << " functions ("
+        << NumKernels << " kernels)\n";
+    ModuleCallback(std::move(MPart));
+  }
+
+  SML << TotalFnImpls << " function definitions across all modules ("
+      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
+      << "% of original module)\n";
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
new file mode 100644
index 000000000000..6171643bd4ad
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
@@ -0,0 +1,30 @@
+//===- AMDGPUSplitModule.h -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AMDGPUSPLITMODULE_H +#define LLVM_TARGET_AMDGPUSPLITMODULE_H + +#include "llvm/ADT/STLFunctionalExtras.h" +#include + +namespace llvm { + +class Module; +class AMDGPUTargetMachine; + +/// Splits the module M into N linkable partitions. The function ModuleCallback +/// is called N times passing each individual partition as the MPart argument. +void splitAMDGPUModule( + const AMDGPUTargetMachine &TM, Module &M, unsigned N, + function_ref MPart)> ModuleCallback); + +} // end namespace llvm + +#endif // LLVM_TARGET_AMDGPUSPLITMODULE_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 20329dea6027..dbbfe34a6386 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -21,6 +21,7 @@ #include "AMDGPUIGroupLP.h" #include "AMDGPUMacroFusion.h" #include "AMDGPURegBankSelect.h" +#include "AMDGPUSplitModule.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" #include "AMDGPUUnifyDivergentExitNodes.h" @@ -815,6 +816,13 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { return AMDGPUAS::FLAT_ADDRESS; } +bool AMDGPUTargetMachine::splitModule( + Module &M, unsigned NumParts, + function_ref MPart)> ModuleCallback) const { + splitAMDGPUModule(*this, M, NumParts, ModuleCallback); + return true; +} + //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index e48cb8fdc657..2cfd232483a8 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -73,6 +73,10 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { getPredicatedAddrSpace(const Value *V) const override; unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override; + + bool splitModule(Module &M, unsigned NumParts, + function_ref MPart)> + ModuleCallback) const override; }; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index ead81b402eb7..c992352cb78d 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -98,6 +98,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPURewriteOutArguments.cpp AMDGPURewriteUndefForPHI.cpp AMDGPUSetWavePriority.cpp + AMDGPUSplitModule.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll new file mode 100644 index 000000000000..8b76237efa32 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll @@ -0,0 +1,46 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels: +; - A does a direct call to HelperA +; - B is storing @HelperA +; - C does a direct call to HelperA +; +; The helper functions will get externalized, which will force A and C into P0 as +; external functions cannot be duplicated. 
+ +; CHECK0: define hidden void @HelperA() +; CHECK0: define amdgpu_kernel void @A() +; CHECK0: declare amdgpu_kernel void @B(ptr) +; CHECK0: define amdgpu_kernel void @C() + +; CHECK1: declare hidden void @HelperA() +; CHECK1: declare amdgpu_kernel void @A() +; CHECK1: declare amdgpu_kernel void @B(ptr) +; CHECK1: declare amdgpu_kernel void @C() + +; CHECK2: declare hidden void @HelperA() +; CHECK2: declare amdgpu_kernel void @A() +; CHECK2: define amdgpu_kernel void @B(ptr %dst) +; CHECK2: declare amdgpu_kernel void @C() + +define internal void @HelperA() { + ret void +} + +define amdgpu_kernel void @A() { + call void @HelperA() + ret void +} + +define amdgpu_kernel void @B(ptr %dst) { + store ptr @HelperA, ptr %dst + ret void +} + +define amdgpu_kernel void @C() { + call void @HelperA() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll new file mode 100644 index 000000000000..46d7d9783aea --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll @@ -0,0 +1,37 @@ +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s + +; 2 kernels: +; - A is isolated +; - B is storing @HelperA/B's address +; +; The helper functions should get externalized (become hidden w/ external linkage) + +; CHECK0: define hidden void @HelperA() +; CHECK0: define hidden void @HelperB() +; CHECK0: define amdgpu_kernel void @A() +; CHECK0: declare amdgpu_kernel void @B(i1, ptr) + +; CHECK1: declare hidden void @HelperA() +; CHECK1: declare hidden void @HelperB() +; CHECK1: declare amdgpu_kernel void @A() +; CHECK1: define amdgpu_kernel void @B(i1 %cond, ptr %dst) + +define internal void @HelperA() { + ret void +} + +define internal void @HelperB() { + ret void +} + +define amdgpu_kernel 
void @A() { + ret void +} + +define amdgpu_kernel void @B(i1 %cond, ptr %dst) { + %addr = select i1 %cond, ptr @HelperA, ptr @HelperB + store ptr %addr, ptr %dst + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll new file mode 100644 index 000000000000..6a07ed51ba1b --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll @@ -0,0 +1,20 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel +; REQUIRES: asserts + +; SHA256 of the kernel names. + +; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c +; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59 +; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55 + +define amdgpu_kernel void @MyCustomKernel0() { + ret void +} + +define amdgpu_kernel void @MyCustomKernel1() { + ret void +} + +define amdgpu_kernel void @MyCustomKernel2() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll new file mode 100644 index 000000000000..c2746d139892 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll @@ -0,0 +1,45 @@ +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s + +; 3 kernels: +; - A calls nothing +; - B calls @PerryThePlatypus +; - C calls @Perry, an alias of @PerryThePlatypus +; +; We should see through the alias and put B/C in the same +; partition. +; +; Additionally, @PerryThePlatypus gets externalized as +; the alias counts as taking its address. 
+ +; CHECK0-NOT: define +; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus +; CHECK0: define hidden void @PerryThePlatypus() +; CHECK0: define amdgpu_kernel void @B +; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define + +@Perry = internal alias ptr(), ptr @PerryThePlatypus + +define internal void @PerryThePlatypus() { + ret void +} + +define amdgpu_kernel void @A() { + ret void +} + +define amdgpu_kernel void @B() { + call void @PerryThePlatypus() + ret void +} + +define amdgpu_kernel void @C() { + call void @Perry() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll new file mode 100644 index 000000000000..4635264aefb3 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll @@ -0,0 +1,54 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels with each their own dependencies should go into 3 +; distinct partitions. The most expensive kernel should be +; seen first and go into the last partition. 
+ +; CHECK0-NOT: define +; CHECK0: define amdgpu_kernel void @C +; CHECK0: define internal void @HelperC +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @A +; CHECK1: define internal void @HelperA +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @B +; CHECK2: define internal void @HelperB +; CHECK2-NOT: define + + +define amdgpu_kernel void @A() { + call void @HelperA() + ret void +} + +define internal void @HelperA() { + ret void +} + +define amdgpu_kernel void @B(ptr %x) { + store i64 42, ptr %x + store i64 43, ptr %x + store i64 44, ptr %x + call void @HelperB() + ret void +} + +define internal void @HelperB() { + ret void +} + +define amdgpu_kernel void @C() { + call void @HelperC() + ret void +} + +define internal void @HelperC() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll new file mode 100644 index 000000000000..bea527f15bba --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll @@ -0,0 +1,50 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels with each their own dependencies should go into 3 +; distinct partitions. 
+ +; CHECK0-NOT: define +; CHECK0: define amdgpu_kernel void @C +; CHECK0: define internal void @HelperC +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @B +; CHECK1: define internal void @HelperB +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @A +; CHECK2: define internal void @HelperA +; CHECK2-NOT: define + + +define amdgpu_kernel void @A() { + call void @HelperA() + ret void +} + +define internal void @HelperA() { + ret void +} + +define amdgpu_kernel void @B() { + call void @HelperB() + ret void +} + +define internal void @HelperB() { + ret void +} + +define amdgpu_kernel void @C() { + call void @HelperC() + ret void +} + +define internal void @HelperC() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll new file mode 100644 index 000000000000..64839f8d8456 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll @@ -0,0 +1,41 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels share a common helper, that helper should be +; cloned in all partitions. 
+ +; CHECK0-NOT: define +; CHECK0: define internal void @Helper +; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define internal void @Helper +; CHECK1: define amdgpu_kernel void @B +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define internal void @Helper +; CHECK2: define amdgpu_kernel void @A +; CHECK2-NOT: define + +define internal void @Helper() { + ret void +} + +define amdgpu_kernel void @A() { + call void @Helper() + ret void +} + +define amdgpu_kernel void @B() { + call void @Helper() + ret void +} + +define amdgpu_kernel void @C() { + call void @Helper() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll new file mode 100644 index 000000000000..435e97a58134 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll @@ -0,0 +1,64 @@ +; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s +; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s + +; Both overridable helper should go in P0. 
+ +; CHECK0-NOT: define +; CHECK0: define available_externally void @OverridableHelper0() +; CHECK0: define internal void @OverridableHelper1() +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define amdgpu_kernel void @B +; CHECK0-NOT: define + +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define internal void @PrivateHelper1() +; CHECK2: define amdgpu_kernel void @D +; CHECK2-NOT: define + +; CHECK3-NOT: define +; CHECK3: define internal void @PrivateHelper0() +; CHECK3: define amdgpu_kernel void @C +; CHECK3-NOT: define + +define available_externally void @OverridableHelper0() { + ret void +} + +define internal void @OverridableHelper1() #0 { + ret void +} + +define internal void @PrivateHelper0() { + ret void +} + +define internal void @PrivateHelper1() { + ret void +} + +define amdgpu_kernel void @A() { + call void @OverridableHelper0() + ret void +} + +define amdgpu_kernel void @B() { + call void @OverridableHelper1() + ret void +} + +define amdgpu_kernel void @C() { + call void @PrivateHelper0() + ret void +} + +define amdgpu_kernel void @D() { + call void @PrivateHelper1() + ret void +} + +attributes #0 = { nobuiltin } diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll new file mode 100644 index 000000000000..9701ac35ce54 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll @@ -0,0 +1,76 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; We have 4 kernels: +; - Each kernel has an internal helper +; - @A and @B's helpers does an indirect call. +; +; We default to putting A/B in P0, alongside a copy +; of all helpers who have their address taken. +; The other kernels can still go into separate partitions. 
+ +; CHECK0-NOT: define +; CHECK0: define hidden void @HelperA +; CHECK0: define hidden void @HelperB +; CHECK0: define hidden void @CallCandidate +; CHECK0-NOT: define {{.*}} @HelperC +; CHECK0-NOT: define {{.*}} @HelperD +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define amdgpu_kernel void @B +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define internal void @HelperD +; CHECK1: define amdgpu_kernel void @D +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define internal void @HelperC +; CHECK2: define amdgpu_kernel void @C +; CHECK2-NOT: define + +@addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate] + +define internal void @HelperA(ptr %call) { + call void %call() + ret void +} + +define internal void @HelperB(ptr %call) { + call void %call() + ret void +} + +define internal void @CallCandidate() { + ret void +} + +define internal void @HelperC() { + ret void +} + +define internal void @HelperD() { + ret void +} + +define amdgpu_kernel void @A(ptr %call) { + call void @HelperA(ptr %call) + ret void +} + +define amdgpu_kernel void @B(ptr %call) { + call void @HelperB(ptr %call) + ret void +} + +define amdgpu_kernel void @C() { + call void @HelperC() + ret void +} + +define amdgpu_kernel void @D() { + call void @HelperD() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll new file mode 100644 index 000000000000..dc2c5c3c07be --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll @@ -0,0 +1,40 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; CHECK0-NOT: define +; CHECK0: define void @ExternalHelper +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define 
amdgpu_kernel void @B +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @D +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @C +; CHECK2-NOT: define + +define void @ExternalHelper() { + ret void +} + +define amdgpu_kernel void @A() { + call void @ExternalHelper() + ret void +} + +define amdgpu_kernel void @B() { + call void @ExternalHelper() + ret void +} + +define amdgpu_kernel void @C() { + ret void +} + +define amdgpu_kernel void @D() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll new file mode 100644 index 000000000000..0fc76934afc5 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll @@ -0,0 +1,42 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-globals +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels use private/internal global variables. +; The GVs should be copied in each partition as needed. 
+ +; CHECK0-NOT: define +; CHECK0: @bar = internal constant ptr +; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: @foo = private constant ptr +; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: @foo = private constant ptr +; CHECK2: @bar = internal constant ptr +; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define + +@foo = private constant ptr poison +@bar = internal constant ptr poison + +define amdgpu_kernel void @A() { + store i32 42, ptr @foo + ret void +} + +define amdgpu_kernel void @B() { + store i32 42, ptr @foo + store i32 42, ptr @bar + ret void +} + +define amdgpu_kernel void @C() { + store i32 42, ptr @bar + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll new file mode 100644 index 000000000000..7564662e7c7c --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll @@ -0,0 +1,44 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; 3 kernels use private/internal global variables. +; The GVs should be copied in each partition as needed. 
+ +; CHECK0-NOT: define +; CHECK0: @foo = hidden constant ptr poison +; CHECK0: @bar = hidden constant ptr poison +; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: @foo = external hidden constant ptr{{$}} +; CHECK1: @bar = external hidden constant ptr{{$}} +; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: @foo = external hidden constant ptr{{$}} +; CHECK2: @bar = external hidden constant ptr{{$}} +; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define + +@foo = private constant ptr poison +@bar = internal constant ptr poison + +define amdgpu_kernel void @A() { + store i32 42, ptr @foo + ret void +} + +define amdgpu_kernel void @B() { + store i32 42, ptr @foo + store i32 42, ptr @bar + ret void +} + +define amdgpu_kernel void @C() { + store i32 42, ptr @bar + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll new file mode 100644 index 000000000000..5dfb95c5fc66 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll @@ -0,0 +1,75 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; Test load balancing logic with 6 kernels. 
+; +; Kernels go from most expensive (A == 6) to least expensive (F == 1) +; +; Load balancing should work like this (current partition cost is in parens) +; +; Initial -> [P0(0), P1(0), P2(0)] +; +; A(6) goes in 2 -> [P2(6), P0(0), P1(0)] +; B(5) goes in 1 -> [P2(6), P1(5), P0(0)] +; C(4) goes in 0 -> [P2(6), P1(5), P0(4)] + +; D(3) goes in 0 -> [P0(7), P2(6), P1(5)] +; E(2) goes in 1 -> [P0(7), P1(7), P2(6)] +; F(1) goes in 2 -> [P0(7), P1(7), P2(7)] + +; CHECK0-NOT: define +; CHECK0: define amdgpu_kernel void @C +; CHECK0: define amdgpu_kernel void @D +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @B +; CHECK1: define amdgpu_kernel void @E +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @A +; CHECK2: define amdgpu_kernel void @F +; CHECK2-NOT: define + + +define amdgpu_kernel void @A(ptr %x) { + store i64 42, ptr %x + store i64 43, ptr %x + store i64 44, ptr %x + store i64 45, ptr %x + store i64 46, ptr %x + ret void +} + +define amdgpu_kernel void @B(ptr %x) { + store i64 42, ptr %x + store i64 43, ptr %x + store i64 44, ptr %x + store i64 45, ptr %x + ret void +} + +define amdgpu_kernel void @C(ptr %x) { + store i64 42, ptr %x + store i64 43, ptr %x + store i64 44, ptr %x + ret void +} + +define amdgpu_kernel void @D(ptr %x) { + store i64 42, ptr %x + store i64 43, ptr %x + ret void +} + +define amdgpu_kernel void @E(ptr %x) { + store i64 42, ptr %x + ret void +} + +define amdgpu_kernel void @F() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll new file mode 100644 index 000000000000..8959acfcae54 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll @@ -0,0 +1,39 @@ +; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: 
llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s +; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s + +; Check that 4 independent kernels get put into 4 different partitions. + +; CHECK0-NOT: define +; CHECK0: define amdgpu_kernel void @D +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define + +; CHECK3-NOT: define +; CHECK3: define amdgpu_kernel void @A +; CHECK3-NOT: define + +define amdgpu_kernel void @A() { + ret void +} + +define amdgpu_kernel void @B() { + ret void +} + +define amdgpu_kernel void @C() { + ret void +} + +define amdgpu_kernel void @D() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll new file mode 100644 index 000000000000..4fdbac7d1789 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll @@ -0,0 +1,98 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=1.2 -amdgpu-module-splitting-large-kernel-merge-overlap=0.5 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0 +; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 %s +; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 %s +; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 %s + +; 2 kernels (A/B) are large and share all their dependencies. +; They should go in the same partition, the remaining kernel should +; go somewhere else, and one partition should be empty. 
+; +; Also check w/o large kernels processing to verify they are indeed handled +; differently. + +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define internal void @HelperC() +; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define internal void @large2() +; CHECK2: define internal void @large1() +; CHECK2: define internal void @large0() +; CHECK2: define internal void @HelperA() +; CHECK2: define internal void @HelperB() +; CHECK2: define amdgpu_kernel void @A +; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define + +; NOLARGEKERNELS-CHECK0-NOT: define +; NOLARGEKERNELS-CHECK0: define internal void @HelperC() +; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C +; NOLARGEKERNELS-CHECK0-NOT: define + +; NOLARGEKERNELS-CHECK1: define internal void @large2() +; NOLARGEKERNELS-CHECK1: define internal void @large1() +; NOLARGEKERNELS-CHECK1: define internal void @large0() +; NOLARGEKERNELS-CHECK1: define internal void @HelperB() +; NOLARGEKERNELS-CHECK1: define amdgpu_kernel void @B + +; NOLARGEKERNELS-CHECK2: define internal void @large2() +; NOLARGEKERNELS-CHECK2: define internal void @large1() +; NOLARGEKERNELS-CHECK2: define internal void @large0() +; NOLARGEKERNELS-CHECK2: define internal void @HelperA() +; NOLARGEKERNELS-CHECK2: define amdgpu_kernel void @A + +define internal void @large2() { + store volatile i32 42, ptr null + call void @large2() + ret void +} + +define internal void @large1() { + call void @large1() + call void @large2() + ret void +} + +define internal void @large0() { + call void @large0() + call void @large1() + call void @large2() + ret void +} + +define internal void @HelperA() { + call void @large0() + ret void +} + +define internal void @HelperB() { + call void @large0() + ret void +} + +define amdgpu_kernel void @A() { + call void @HelperA() + ret void +} + +define amdgpu_kernel void @B() { + call void @HelperB() + ret void +} + +define internal void @HelperC() { + ret 
void +} + +define amdgpu_kernel void @C() { + call void @HelperC() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg b/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg new file mode 100644 index 000000000000..6154a6c1c906 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AMDGPU" in config.root.targets: + config.unsupported = True From 8d4d85de1eeb23bc13a51bffbc12d9f158ca1921 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 23 May 2024 18:31:08 +0800 Subject: [PATCH 023/433] Revert "[Coroutines] Always set the calling convention of generated resuming call from 'llvm.coro.await.suspend.handle' as fast" This reverts commit 31f1590e4fb324c43dc36199587c453e27b6f6fa. It looks like some bots are not happy about the FileChecks --- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 2 +- .../coro-await-suspend-handle-in-ramp.ll | 59 ------------------- 2 files changed, 1 insertion(+), 60 deletions(-) delete mode 100644 llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 5a58a99d2879..1d9cf185b75a 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -227,7 +227,6 @@ static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB, FunctionType *ResumeTy = FunctionType::get( Type::getVoidTy(Ctx), PointerType::getUnqual(Ctx), false); auto *ResumeCall = Builder.CreateCall(ResumeTy, ResumeAddr, {NewCall}); - ResumeCall->setCallingConv(CallingConv::Fast); // We can't insert the 'ret' instruction and adjust the cc until the // function has been split, so remember this for later. @@ -1089,6 +1088,7 @@ void CoroCloner::create() { // Turn symmetric transfers into musttail calls. 
for (CallInst *ResumeCall : Shape.SymmetricTransfers) { ResumeCall = cast<CallInst>(VMap[ResumeCall]); + ResumeCall->setCallingConv(NewF->getCallingConv()); if (TTI.supportsTailCallFor(ResumeCall)) { // FIXME: Could we support symmetric transfer effectively without // musttail? diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll deleted file mode 100644 index 85e8bb52fee3..000000000000 --- a/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll +++ /dev/null @@ -1,59 +0,0 @@ -; Tests lowerings of different versions of coro.await.suspend -; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split),simplifycfg' -S | FileCheck %s - -%Awaiter = type {} - -define void @f() presplitcoroutine { -entry: - %awaiter = alloca %Awaiter - %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) - %size = call i32 @llvm.coro.size.i32() - %alloc = call ptr @malloc(i32 %size) - %hdl = call ptr @llvm.coro.begin(token %id, ptr %alloc) - call void @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) - %suspend.init = call i8 @llvm.coro.suspend(token none, i1 false) - switch i8 %suspend.init, label %ret [ - i8 0, label %step - i8 1, label %cleanup - ] - -; Check the calling convention for resuming function is fastcc -; CHECK: define {{[^@]*}} @f() -; CHECK: entry: -; CHECK: %[[NEXT_HDL:.+]] = call ptr @await_suspend_wrapper_handle( -; CHECK-NEXT: %[[CONT:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[NEXT_HDL]], i8 0) -; CHECK-NEXT: musttail call fastcc void %[[CONT]](ptr %[[NEXT_HDL]]) -step: - br label %cleanup - -cleanup: - %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) - call void @free(ptr %mem) - br label %ret - -ret: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) - ret void -} - -; check that we were haven't accidentally went out of @f body -; CHECK-LABEL: @f.resume( -; CHECK-LABEL: @f.destroy( -; CHECK-LABEL: 
@f.cleanup( - -declare ptr @await_suspend_wrapper_handle(ptr, ptr) - -declare ptr @llvm.coro.free(token, ptr) -declare i32 @llvm.coro.size.i32() -declare i8 @llvm.coro.suspend(token, i1) -declare void @llvm.coro.resume(ptr) -declare void @llvm.coro.destroy(ptr) - -declare token @llvm.coro.id(i32, ptr, ptr, ptr) -declare i1 @llvm.coro.alloc(token) -declare ptr @llvm.coro.begin(token, ptr) -declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) - -declare noalias ptr @malloc(i32) -declare void @free(ptr) From 6552af196812bfad22a8bec4d2cae3d51a84b1e4 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 23 May 2024 10:32:36 +0000 Subject: [PATCH 024/433] [gn build] Port d7c371300083 --- llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index c859b887828f..ab97507311a4 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -186,6 +186,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPURewriteOutArguments.cpp", "AMDGPURewriteUndefForPHI.cpp", "AMDGPUSetWavePriority.cpp", + "AMDGPUSplitModule.cpp", "AMDGPUSubtarget.cpp", "AMDGPUTargetMachine.cpp", "AMDGPUTargetObjectFile.cpp", From 3c23047413957024294872e38c27707c71d05805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Thu, 23 May 2024 12:56:16 +0200 Subject: [PATCH 025/433] [clang][analyzer] Move checker 'cert.pos.34c' (in alpha.security) into 'PutenvStackArray' (#92424) The "cert" package looks not useful and the checker has not a meaningful name with the old naming scheme. Additionally tests and documentation is updated. 
--- clang/docs/analyzer/checkers.rst | 74 +++++++------------ .../clang/StaticAnalyzer/Checkers/Checkers.td | 22 +++--- .../StaticAnalyzer/Checkers/CMakeLists.txt | 2 +- ...hecker.cpp => PutenvStackArrayChecker.cpp} | 26 ++++--- .../Analysis/cert/pos34-c-fp-suppression.cpp | 51 ------------- clang/test/Analysis/cert/pos34-c.cpp | 61 --------------- clang/test/Analysis/putenv-stack-array.c | 70 ++++++++++++++++++ 7 files changed, 119 insertions(+), 187 deletions(-) rename clang/lib/StaticAnalyzer/Checkers/{cert/PutenvWithAutoChecker.cpp => PutenvStackArrayChecker.cpp} (70%) delete mode 100644 clang/test/Analysis/cert/pos34-c-fp-suppression.cpp delete mode 100644 clang/test/Analysis/cert/pos34-c.cpp create mode 100644 clang/test/Analysis/putenv-stack-array.c diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index b4bd9dac1cbc..ac9f0b06f63b 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -2833,6 +2833,31 @@ Warn on mmap() calls that are both writable and executable. // code } +.. _alpha-security-putenv-stack-array: + +alpha.security.PutenvStackArray (C) +""""""""""""""""""""""""""""""""""" +Finds calls to the ``putenv`` function which pass a pointer to a stack-allocated +(automatic) array as the argument. Function ``putenv`` does not copy the passed +string, only a pointer to the data is stored and this data can be read even by +other threads. Content of a stack-allocated array is likely to be overwritten +after returning from the parent function. + +The problem can be solved by using a static array variable or dynamically +allocated memory. Even better is to avoid using ``putenv`` (it has other +problems related to memory leaks) and use ``setenv`` instead. + +The check corresponds to CERT rule +`POS34-C. Do not call putenv() with a pointer to an automatic variable as the argument +`_. + +.. 
code-block:: c + + int f() { + char env[] = "NAME=value"; + return putenv(env); // putenv function should not be called with stack-allocated string + } + .. _alpha-security-ReturnPtrRange: alpha.security.ReturnPtrRange (C) @@ -2859,55 +2884,6 @@ alpha.security.cert SEI CERT checkers which tries to find errors based on their `C coding rules `_. -.. _alpha-security-cert-pos-checkers: - -alpha.security.cert.pos -^^^^^^^^^^^^^^^^^^^^^^^ - -SEI CERT checkers of `POSIX C coding rules `_. - -.. _alpha-security-cert-pos-34c: - -alpha.security.cert.pos.34c -""""""""""""""""""""""""""" -Finds calls to the ``putenv`` function which pass a pointer to an automatic variable as the argument. - -.. code-block:: c - - int func(const char *var) { - char env[1024]; - int retval = snprintf(env, sizeof(env),"TEST=%s", var); - if (retval < 0 || (size_t)retval >= sizeof(env)) { - /* Handle error */ - } - - return putenv(env); // putenv function should not be called with auto variables - } - -Limitations: - - - Technically, one can pass automatic variables to ``putenv``, - but one needs to ensure that the given environment key stays - alive until it's removed or overwritten. - Since the analyzer cannot keep track of which envvars get overwritten - and when, it needs to be slightly more aggressive and warn for such - cases too, leading in some cases to false-positive reports like this: - - .. code-block:: c - - void baz() { - char env[] = "NAME=value"; - putenv(env); // false-positive warning: putenv function should not be called... - // More code... - putenv((char *)"NAME=anothervalue"); - // This putenv call overwrites the previous entry, thus that can no longer dangle. - } // 'env' array becomes dead only here. - -alpha.security.cert.env -^^^^^^^^^^^^^^^^^^^^^^^ - -SEI CERT checkers of `Environment C coding rules `_. 
- alpha.security.taint ^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index d0ba1ce54840..40f443047bd4 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -1035,15 +1035,6 @@ let ParentPackage = ENV in { } // end "security.cert.env" -let ParentPackage = POSAlpha in { - - def PutenvWithAuto : Checker<"34c">, - HelpText<"Finds calls to the 'putenv' function which pass a pointer to " - "an automatic variable as the argument.">, - Documentation; - -} // end "alpha.cert.pos" - let ParentPackage = SecurityAlpha in { def ArrayBoundChecker : Checker<"ArrayBound">, @@ -1054,10 +1045,6 @@ def ArrayBoundCheckerV2 : Checker<"ArrayBoundV2">, HelpText<"Warn about buffer overflows (newer checker)">, Documentation; -def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">, - HelpText<"Check for an out-of-bound pointer being returned to callers">, - Documentation; - def MallocOverflowSecurityChecker : Checker<"MallocOverflow">, HelpText<"Check for overflows in the arguments to malloc()">, Documentation; @@ -1078,6 +1065,15 @@ def MmapWriteExecChecker : Checker<"MmapWriteExec">, ]>, Documentation; +def PutenvStackArray : Checker<"PutenvStackArray">, + HelpText<"Finds calls to the function 'putenv' which pass a pointer to " + "an automatic (stack-allocated) array as the argument.">, + Documentation; + +def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">, + HelpText<"Check for an out-of-bound pointer being returned to callers">, + Documentation; + } // end "alpha.security" //===----------------------------------------------------------------------===// diff --git a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt index 45d3788f105d..cd5a3bdd02e4 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt +++ 
b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt @@ -96,7 +96,7 @@ add_clang_library(clangStaticAnalyzerCheckers PointerSortingChecker.cpp PointerSubChecker.cpp PthreadLockChecker.cpp - cert/PutenvWithAutoChecker.cpp + PutenvStackArrayChecker.cpp RetainCountChecker/RetainCountChecker.cpp RetainCountChecker/RetainCountDiagnostics.cpp ReturnPointerRangeChecker.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PutenvStackArrayChecker.cpp similarity index 70% rename from clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp rename to clang/lib/StaticAnalyzer/Checkers/PutenvStackArrayChecker.cpp index a82f7caf16b2..d59cebf0aa5c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/PutenvStackArrayChecker.cpp @@ -1,4 +1,4 @@ -//== PutenvWithAutoChecker.cpp --------------------------------- -*- C++ -*--=// +//== PutenvStackArrayChecker.cpp ------------------------------- -*- C++ -*--=// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// // -// This file defines PutenvWithAutoChecker which finds calls of ``putenv`` -// function with automatic variable as the argument. +// This file defines PutenvStackArrayChecker which finds calls of ``putenv`` +// function with automatic array variable as the argument. 
// https://wiki.sei.cmu.edu/confluence/x/6NYxBQ // //===----------------------------------------------------------------------===// -#include "../AllocationState.h" +#include "AllocationState.h" #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" #include "clang/StaticAnalyzer/Core/Checker.h" @@ -26,9 +26,9 @@ using namespace clang; using namespace ento; namespace { -class PutenvWithAutoChecker : public Checker { +class PutenvStackArrayChecker : public Checker { private: - BugType BT{this, "'putenv' function should not be called with auto variables", + BugType BT{this, "'putenv' called with stack-allocated string", categories::SecurityError}; const CallDescription Putenv{CDM::CLibrary, {"putenv"}, 1}; @@ -37,8 +37,8 @@ class PutenvWithAutoChecker : public Checker { }; } // namespace -void PutenvWithAutoChecker::checkPostCall(const CallEvent &Call, - CheckerContext &C) const { +void PutenvStackArrayChecker::checkPostCall(const CallEvent &Call, + CheckerContext &C) const { if (!Putenv.matches(Call)) return; @@ -50,7 +50,7 @@ void PutenvWithAutoChecker::checkPostCall(const CallEvent &Call, return; StringRef ErrorMsg = "The 'putenv' function should not be called with " - "arguments that have automatic storage"; + "arrays that have automatic storage"; ExplodedNode *N = C.generateErrorNode(); auto Report = std::make_unique(BT, ErrorMsg, N); @@ -60,8 +60,10 @@ void PutenvWithAutoChecker::checkPostCall(const CallEvent &Call, C.emitReport(std::move(Report)); } -void ento::registerPutenvWithAuto(CheckerManager &Mgr) { - Mgr.registerChecker(); +void ento::registerPutenvStackArray(CheckerManager &Mgr) { + Mgr.registerChecker(); } -bool ento::shouldRegisterPutenvWithAuto(const CheckerManager &) { return true; } +bool ento::shouldRegisterPutenvStackArray(const CheckerManager &) { + return true; +} diff --git a/clang/test/Analysis/cert/pos34-c-fp-suppression.cpp 
b/clang/test/Analysis/cert/pos34-c-fp-suppression.cpp deleted file mode 100644 index d982fcb8a1ba..000000000000 --- a/clang/test/Analysis/cert/pos34-c-fp-suppression.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// RUN: %clang_analyze_cc1 \ -// RUN: -analyzer-checker=alpha.security.cert.pos.34c\ -// RUN: -verify %s - -#include "../Inputs/system-header-simulator.h" -void free(void *memblock); -void *malloc(size_t size); -int putenv(char *); -int rand(); - -namespace test_auto_var_used_good { - -extern char *ex; -int test_extern() { - return putenv(ex); // no-warning: extern storage class. -} - -void foo(void) { - char *buffer = (char *)"huttah!"; - if (rand() % 2 == 0) { - buffer = (char *)malloc(5); - strcpy(buffer, "woot"); - } - putenv(buffer); -} - -void bar(void) { - char *buffer = (char *)malloc(5); - strcpy(buffer, "woot"); - - if (rand() % 2 == 0) { - free(buffer); - buffer = (char *)"blah blah blah"; - } - putenv(buffer); -} - -void baz() { - char env[] = "NAME=value"; - // TODO: False Positive - putenv(env); - // expected-warning@-1 {{The 'putenv' function should not be called with arguments that have automatic storage}} - - /* - DO SOMETHING - */ - - putenv((char *)"NAME=anothervalue"); -} - -} // namespace test_auto_var_used_good diff --git a/clang/test/Analysis/cert/pos34-c.cpp b/clang/test/Analysis/cert/pos34-c.cpp deleted file mode 100644 index f2bd7b393d88..000000000000 --- a/clang/test/Analysis/cert/pos34-c.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// RUN: %clang_analyze_cc1 \ -// RUN: -analyzer-checker=alpha.security.cert.pos.34c\ -// RUN: -verify %s - -// Examples from the CERT rule's page. 
-// https://wiki.sei.cmu.edu/confluence/x/6NYxBQ - -#include "../Inputs/system-header-simulator.h" -void free(void *memblock); -void *malloc(size_t size); -int putenv(char *); -int snprintf(char *str, size_t size, const char *format, ...); - -namespace test_auto_var_used_bad { - -int volatile_memory1(const char *var) { - char env[1024]; - int retval = snprintf(env, sizeof(env), "TEST=%s", var); - if (retval < 0 || (size_t)retval >= sizeof(env)) { - /* Handle error */ - } - - return putenv(env); - // expected-warning@-1 {{The 'putenv' function should not be called with arguments that have automatic storage}} -} - -} // namespace test_auto_var_used_bad - -namespace test_auto_var_used_good { - -int test_static(const char *var) { - static char env[1024]; - - int retval = snprintf(env, sizeof(env), "TEST=%s", var); - if (retval < 0 || (size_t)retval >= sizeof(env)) { - /* Handle error */ - } - - return putenv(env); -} - -int test_heap_memory(const char *var) { - static char *oldenv; - const char *env_format = "TEST=%s"; - const size_t len = strlen(var) + strlen(env_format); - char *env = (char *)malloc(len); - if (env == NULL) { - return -1; - } - if (putenv(env) != 0) { // no-warning: env was dynamically allocated. 
- free(env); - return -1; - } - if (oldenv != NULL) { - free(oldenv); /* avoid memory leak */ - } - oldenv = env; - return 0; -} - -} // namespace test_auto_var_used_good diff --git a/clang/test/Analysis/putenv-stack-array.c b/clang/test/Analysis/putenv-stack-array.c new file mode 100644 index 000000000000..fbbf93259ab8 --- /dev/null +++ b/clang/test/Analysis/putenv-stack-array.c @@ -0,0 +1,70 @@ +// RUN: %clang_analyze_cc1 \ +// RUN: -analyzer-checker=alpha.security.PutenvStackArray \ +// RUN: -verify %s + +#include "Inputs/system-header-simulator.h" +void free(void *); +void *malloc(size_t); +int putenv(char *); +int snprintf(char *, size_t, const char *, ...); + +int test_auto_var(const char *var) { + char env[1024]; + (void)snprintf(env, sizeof(env), "TEST=%s", var); + return putenv(env); // expected-warning{{The 'putenv' function should not be called with arrays that have automatic storage}} +} + +int test_static_var(const char *var) { + static char env[1024]; + (void)snprintf(env, sizeof(env), "TEST=%s", var); + return putenv(env); // no-warning: static array is used +} + +void test_heap_memory(const char *var) { + const char *env_format = "TEST=%s"; + const size_t len = strlen(var) + strlen(env_format); + char *env = (char *)malloc(len); + if (env == NULL) + return; + if (putenv(env) != 0) // no-warning: env was dynamically allocated. + free(env); +} + +typedef struct { + int A; + char Env[1024]; +} Mem; + +int test_auto_var_struct() { + Mem mem; + return putenv(mem.Env); // expected-warning{{The 'putenv' function should not be called with}} +} + +int test_auto_var_subarray() { + char env[1024]; + return putenv(env + 100); // expected-warning{{The 'putenv' function should not be called with}} +} + +int test_constant() { + char *env = "TEST"; + return putenv(env); // no-warning: data is not on the stack +} + +extern char *ext_env; +int test_extern() { + return putenv(ext_env); // no-warning: extern storage class. 
+} + +void test_auto_var_reset() { + char env[] = "NAME=value"; + putenv(env); // expected-warning{{The 'putenv' function should not be called with}} + // ... (do something) + // Even cases like this are likely a bug: + // It looks like that if one string was passed to putenv, + // it should not be deallocated at all, because when reading the + // environment variable a pointer into this string is returned. + // In this case, if another (or the same) thread reads variable "NAME" + // at this point and does not copy the returned string, the data may + // become invalid. + putenv((char *)"NAME=anothervalue"); +} From 964079d6501af7073e4420a221511b3067130ac4 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 23 May 2024 10:56:45 +0000 Subject: [PATCH 026/433] [gn build] Port 3c2304741395 --- .../gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn index 333d74e5c720..3ae50b214eb1 100644 --- a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn @@ -104,6 +104,7 @@ static_library("Checkers") { "PointerSortingChecker.cpp", "PointerSubChecker.cpp", "PthreadLockChecker.cpp", + "PutenvStackArrayChecker.cpp", "RetainCountChecker/RetainCountChecker.cpp", "RetainCountChecker/RetainCountDiagnostics.cpp", "ReturnPointerRangeChecker.cpp", @@ -148,6 +149,5 @@ static_library("Checkers") { "WebKit/UncountedLambdaCapturesChecker.cpp", "WebKit/UncountedLocalVarsChecker.cpp", "cert/InvalidPtrChecker.cpp", - "cert/PutenvWithAutoChecker.cpp", ] } From b6468766f73e7a294fce71da60b9ec90f3900281 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 23 May 2024 12:10:21 +0100 Subject: [PATCH 027/433] [LAA] refactor program logic (NFC) (#92101) Implement NFC improvements spotted during a 
cursory reading of LoopAccessAnalysis. --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 99 +++++++++++------------- 1 file changed, 45 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 2a967f570c4a..bc8b9b8479e4 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -392,9 +392,9 @@ void RuntimePointerChecking::generateChecks( bool RuntimePointerChecking::needsChecking( const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const { - for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I) - for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J) - if (needsChecking(M.Members[I], N.Members[J])) + for (const auto &I : M.Members) + for (const auto &J : N.Members) + if (needsChecking(I, J)) return true; return false; } @@ -408,9 +408,7 @@ static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J, if (!C) return nullptr; - if (C->getValue()->isNegative()) - return J; - return I; + return C->getValue()->isNegative() ? 
J : I; } bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, @@ -508,8 +506,8 @@ void RuntimePointerChecking::groupChecks( DenseMap> PositionMap; for (unsigned Index = 0; Index < Pointers.size(); ++Index) { - auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}}); - Iter.first->second.push_back(Index); + auto [It, _] = PositionMap.insert({Pointers[Index].PointerValue, {}}); + It->second.push_back(Index); } // We need to keep track of what pointers we've already seen so we @@ -608,16 +606,16 @@ void RuntimePointerChecking::printChecks( raw_ostream &OS, const SmallVectorImpl &Checks, unsigned Depth) const { unsigned N = 0; - for (const auto &Check : Checks) { - const auto &First = Check.first->Members, &Second = Check.second->Members; + for (const auto &[Check1, Check2] : Checks) { + const auto &First = Check1->Members, &Second = Check2->Members; OS.indent(Depth) << "Check " << N++ << ":\n"; - OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n"; + OS.indent(Depth + 2) << "Comparing group (" << Check1 << "):\n"; for (unsigned K = 0; K < First.size(); ++K) OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n"; - OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n"; + OS.indent(Depth + 2) << "Against group (" << Check2 << "):\n"; for (unsigned K = 0; K < Second.size(); ++K) OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n"; } @@ -1158,8 +1156,8 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, // First, count how many write and read accesses are in the alias set. Also // collect MemAccessInfos for later. 
SmallVector AccessInfos; - for (const Value *Ptr_ : ASPointers) { - Value *Ptr = const_cast(Ptr_); + for (const Value *ConstPtr : ASPointers) { + Value *Ptr = const_cast(ConstPtr); bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); if (IsWrite) ++NumWritePtrChecks; @@ -1215,9 +1213,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, // We know that we need these checks, so we can now be more aggressive // and add further checks if required (overflow checks). CanDoAliasSetRT = true; - for (auto Retry : Retries) { - MemAccessInfo Access = Retry.first; - Type *AccessTy = Retry.second; + for (const auto &[Access, AccessTy] : Retries) { if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap, DepSetId, TheLoop, RunningDepId, ASId, ShouldCheckWrap, /*Assume=*/true)) { @@ -1289,12 +1285,11 @@ void AccessAnalysis::processMemAccesses() { LLVM_DEBUG(dbgs() << " AST: "; AST.dump()); LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n"); LLVM_DEBUG({ - for (auto A : Accesses) - dbgs() << "\t" << *A.first.getPointer() << " (" - << (A.first.getInt() - ? "write" - : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only" - : "read")) + for (const auto &[A, _] : Accesses) + dbgs() << "\t" << *A.getPointer() << " (" + << (A.getInt() ? "write" + : (ReadOnlyPtr.count(A.getPointer()) ? "read-only" + : "read")) << ")\n"; }); @@ -1323,16 +1318,16 @@ void AccessAnalysis::processMemAccesses() { bool UseDeferred = SetIteration > 0; PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses; - for (const Value *Ptr_ : ASPointers) { - Value *Ptr = const_cast(Ptr_); + for (const Value *ConstPtr : ASPointers) { + Value *Ptr = const_cast(ConstPtr); // For a single memory access in AliasSetTracker, Accesses may contain // both read and write, and they both need to be handled for CheckDeps. 
- for (const auto &AC : S) { - if (AC.first.getPointer() != Ptr) + for (const auto &[AC, _] : S) { + if (AC.getPointer() != Ptr) continue; - bool IsWrite = AC.first.getInt(); + bool IsWrite = AC.getInt(); // If we're using the deferred access set, then it contains only // reads. @@ -1859,10 +1854,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, // (If so, then we have proven (**) because |Dist| >= -1*Dist) const SCEV *NegDist = SE.getNegativeSCEV(CastedDist); Minus = SE.getMinusSCEV(NegDist, CastedProduct); - if (SE.isKnownPositive(Minus)) - return true; - - return false; + return SE.isKnownPositive(Minus); } /// Check the dependence for two accesses with the same stride \p Stride. @@ -2050,7 +2042,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( if (isa(Dist)) { // TODO: Relax requirement that there is a common stride to retry with // non-constant distance dependencies. - FoundNonConstantDistanceDependence |= !!CommonStride; + FoundNonConstantDistanceDependence |= CommonStride.has_value(); LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n"); return Dependence::Unknown; } @@ -2093,11 +2085,10 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( if (HasSameSize) { // Write to the same location with the same size. 
return Dependence::Forward; - } else { - LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " - "different type sizes\n"); - return Dependence::Unknown; } + LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " + "different type sizes\n"); + return Dependence::Unknown; } bool IsTrueDataDependence = (AIsWrite && !BIsWrite); @@ -2343,7 +2334,7 @@ bool MemoryDepChecker::areDepsSafe( } ++OI; } - AI++; + ++AI; } } @@ -2352,8 +2343,8 @@ bool MemoryDepChecker::areDepsSafe( } SmallVector -MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const { - MemAccessInfo Access(Ptr, isWrite); +MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const { + MemAccessInfo Access(Ptr, IsWrite); auto &IndexVector = Accesses.find(Access)->second; SmallVector Insts; @@ -2729,13 +2720,14 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, } void LoopAccessInfo::emitUnsafeDependenceRemark() { - auto Deps = getDepChecker().getDependences(); + const auto *Deps = getDepChecker().getDependences(); if (!Deps) return; - auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) { - return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) != - MemoryDepChecker::VectorizationSafetyStatus::Safe; - }); + const auto *Found = + llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) { + return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) != + MemoryDepChecker::VectorizationSafetyStatus::Safe; + }); if (Found == Deps->end()) return; MemoryDepChecker::Dependence Dep = *Found; @@ -2874,9 +2866,9 @@ static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { // Check that all of the gep indices are uniform except for our induction // operand. 
- for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) - if (i != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp)) + for (unsigned I = 0, E = GEP->getNumOperands(); I != E; ++I) + if (I != InductionOperand && + !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(I)), Lp)) return Ptr; return GEP->getOperand(InductionOperand); } @@ -3072,9 +3064,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, DepChecker = std::make_unique(*PSE, L, MaxTargetVectorWidthInBits); PtrRtChecking = std::make_unique(*DepChecker, SE); - if (canAnalyzeLoop()) { + if (canAnalyzeLoop()) analyzeLoop(AA, LI, TLI, DT); - } } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { @@ -3126,13 +3117,13 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { } const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) { - auto I = LoopAccessInfoMap.insert({&L, nullptr}); + auto [It, Inserted] = LoopAccessInfoMap.insert({&L, nullptr}); - if (I.second) - I.first->second = + if (Inserted) + It->second = std::make_unique(&L, &SE, TTI, TLI, &AA, &DT, &LI); - return *I.first->second; + return *It->second; } bool LoopAccessInfoManager::invalidate( From 1d0e8b24001d854a848a3810b90244a6bc94cf03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 23 May 2024 12:17:47 +0200 Subject: [PATCH 028/433] [clang][Interp] Remove a no longer needed dummy check Since we now have type info for dummy pointers, we don't need this check anymore and can also have the same output for the test case in records.cpp. 
--- clang/lib/AST/Interp/Interp.h | 3 --- clang/test/AST/Interp/records.cpp | 6 ++---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 8430a7de24df..fc496b66445a 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1371,9 +1371,6 @@ inline bool GetPtrVirtBasePop(InterpState &S, CodePtr OpPC, const Pointer &Ptr = S.Stk.pop(); if (!CheckNull(S, OpPC, Ptr, CSK_Base)) return false; - if (Ptr.isDummy()) // FIXME: Once we have type info for dummy pointers, this - // needs to go. - return false; return VirtBaseHelper(S, OpPC, D, Ptr); } diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index 3a5ecd291a56..97ac3e916955 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1335,8 +1335,6 @@ namespace UnnamedBitFields { static_assert(a.c == 'a', ""); } -/// FIXME: This still doesn't work in the new interpreter because -/// we lack type information for dummy pointers. namespace VirtualBases { /// This used to crash. namespace One { @@ -1346,7 +1344,7 @@ namespace VirtualBases { }; class B : public virtual A { public: - int getX() { return x; } // ref-note {{declared here}} + int getX() { return x; } // both-note {{declared here}} }; class DV : virtual public B{}; @@ -1354,7 +1352,7 @@ namespace VirtualBases { void foo() { DV b; int a[b.getX()]; // both-warning {{variable length arrays}} \ - // ref-note {{non-constexpr function 'getX' cannot be used}} + // both-note {{non-constexpr function 'getX' cannot be used}} } } From 55e5842385ef18eaf7b5b6548413f4ee6f555dfc Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 23 May 2024 12:12:22 +0100 Subject: [PATCH 029/433] [mlir][OpenMP] Remove deprecated omp.reduction (#92732) This operation did not model the behaviour of reductions in the openmp standard. It has since been replaced by block arguments on the outer operation. 
See https://github.com/llvm/llvm-project/pull/79308 and https://github.com/llvm/llvm-project/pull/80019 --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 46 ++------ .../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp | 22 +--- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 17 +-- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 107 ------------------ mlir/test/Dialect/OpenMP/invalid.mlir | 3 - mlir/test/Dialect/OpenMP/ops.mlir | 10 -- 6 files changed, 10 insertions(+), 195 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 122abbe7cc97..dc9ac2b9de22 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -152,13 +152,9 @@ def ParallelOp : OpenMP_Op<"parallel", [ variable should be passed into the reduction region by value or by reference in `reduction_vars_byref`. Each reduction is identified by the accumulator it uses and accumulators must not be repeated in the same reduction. The - `omp.reduction` operation accepts the accumulator and a partial value which - is considered to be produced by the thread for the given reduction. If - multiple values are produced for the same accumulator, i.e. there are - multiple `omp.reduction`s, the last value is taken. The reduction - declaration specifies how to combine the values from each thread into the - final value, which is available in the accumulator after all the threads - complete. + reduction declaration specifies how to combine the values from each thread + into the final value, which is available in the accumulator after all the + threads complete. The optional $proc_bind_val attribute controls the thread affinity for the execution of the parallel region. @@ -307,13 +303,9 @@ def SectionsOp : OpenMP_Op<"sections", [AttrSizedOperandSegments, accumulator variables in `reduction_vars` and symbols referring to reduction declarations in the `reductions` attribute. 
Each reduction is identified by the accumulator it uses and accumulators must not be repeated in the same - reduction. The `omp.reduction` operation accepts the accumulator and a - partial value which is considered to be produced by the section for the - given reduction. If multiple values are produced for the same accumulator, - i.e. there are multiple `omp.reduction`s, the last value is taken. The - reduction declaration specifies how to combine the values from each section - into the final value, which is available in the accumulator after all the - sections complete. + reduction. The reduction declaration specifies how to combine the values + from each section into the final value, which is available in the + accumulator after all the sections complete. The $allocators_vars and $allocate_vars parameters are a variadic list of values that specify the memory allocator to be used to obtain storage for private values. @@ -912,11 +904,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, variables in `reduction_vars` or `in_reduction_vars` and symbols referring to reduction declarations in the `reductions` or `in_reductions` attribute. Each reduction is identified by the accumulator it uses and accumulators - must not be repeated in the same reduction. The `omp.reduction` operation - accepts the accumulator and a partial value which is considered to be - produced by the current loop iteration for the given reduction. If multiple - values are produced for the same accumulator, i.e. there are multiple - `omp.reduction`s, the last value is taken. The reduction declaration + must not be repeated in the same reduction. The reduction declaration specifies how to combine the values from each iteration into the final value, which is available in the accumulator after the loop completes. 
@@ -2159,24 +2147,4 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol, let hasRegionVerifier = 1; } -//===----------------------------------------------------------------------===// -// 2.19.5.4 reduction clause -//===----------------------------------------------------------------------===// - -def ReductionOp : OpenMP_Op<"reduction"> { - let summary = "reduction construct"; - let description = [{ - Indicates the value that is produced by the current reduction-participating - entity for a reduction requested in some ancestor. The reduction is - identified by the accumulator, but the value of the accumulator may not be - updated immediately. - }]; - - let arguments= (ins AnyType:$operand, OpenMP_PointerLikeType:$accumulator); - let assemblyFormat = [{ - $operand `,` $accumulator attr-dict `:` type($operand) `,` type($accumulator) - }]; - let hasVerifier = 1; -} - #endif // OPENMP_OPS diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index a206c7b228d2..f6a6d1d7228a 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -185,21 +185,6 @@ struct MapInfoOpConversion : public ConvertOpToLLVMPattern { } }; -struct ReductionOpConversion : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - LogicalResult - matchAndRewrite(omp::ReductionOp curOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (isa(curOp.getAccumulator().getType())) { - // TODO: Support memref type in variable operands - return rewriter.notifyMatchFailure(curOp, "memref is not supported yet"); - } - rewriter.replaceOpWithNewOp( - curOp, TypeRange(), adaptor.getOperands(), curOp->getAttrs()); - return success(); - } -}; - template struct MultiRegionOpConversion : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -246,9 +231,6 @@ void 
mlir::configureOpenMPToLLVMConversionLegality( return typeConverter.isLegal(op->getOperandTypes()) && typeConverter.isLegal(op->getResultTypes()); }); - target.addDynamicallyLegalOp([&](Operation *op) { - return typeConverter.isLegal(op->getOperandTypes()); - }); target.addDynamicallyLegalOp< mlir::omp::AtomicUpdateOp, mlir::omp::CriticalOp, mlir::omp::TargetOp, mlir::omp::TargetDataOp, mlir::omp::LoopNestOp, @@ -275,11 +257,11 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter, [&](omp::MapBoundsType type) -> Type { return type; }); patterns.add< - AtomicReadOpConversion, MapInfoOpConversion, ReductionOpConversion, + AtomicReadOpConversion, MapInfoOpConversion, MultiRegionOpConversion, MultiRegionOpConversion, RegionOpConversion, RegionOpConversion, - RegionOpConversion, ReductionOpConversion, + RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 24a6d5b5d684..110873011fe3 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1789,7 +1789,7 @@ LogicalResult DistributeOp::verify() { } //===----------------------------------------------------------------------===// -// ReductionOp +// DeclareReductionOp //===----------------------------------------------------------------------===// static ParseResult parseAtomicReductionRegion(OpAsmParser &parser, @@ -1881,21 +1881,6 @@ LogicalResult DeclareReductionOp::verifyRegions() { return success(); } -LogicalResult ReductionOp::verify() { - auto *op = (*this)->getParentWithTrait(); - if (!op) - return emitOpError() << "must be used within an operation supporting " - "reduction clause interface"; - while (op) { - for (const auto &var : - cast(op).getAllReductionVars()) - if (var == getAccumulator()) - return success(); - op = op->getParentWithTrait(); - } - 
return emitOpError() << "the accumulator is not used by the parent"; -} - //===----------------------------------------------------------------------===// // TaskOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 9d125b7f1180..6ec4c120c11e 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -334,54 +334,6 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } -/// Returns a reduction declaration that corresponds to the given reduction -/// operation in the given container. Currently only supports reductions inside -/// WsloopOp and ParallelOp but can be easily extended as long as the given -/// construct implements getNumReductionVars. -template -static std::optional -findReductionDeclInContainer(T container, omp::ReductionOp reduction) { - for (unsigned i = 0, e = container.getNumReductionVars(); i < e; ++i) { - if (container.getReductionVars()[i] != reduction.getAccumulator()) - continue; - - SymbolRefAttr reductionSymbol = - cast((*container.getReductions())[i]); - auto declareOp = - SymbolTable::lookupNearestSymbolFrom( - container, reductionSymbol); - return declareOp; - } - return std::nullopt; -} - -/// Searches for a reduction in a provided region and the regions -/// it is nested in -static omp::DeclareReductionOp findReductionDecl(Operation &containerOp, - omp::ReductionOp reduction) { - std::optional declareOp = std::nullopt; - Operation *container = &containerOp; - - while (!declareOp.has_value() && container) { - // Check if current container is supported for reductions searches - if (auto par = dyn_cast(*container)) { - declareOp = findReductionDeclInContainer(par, reduction); - } else if (auto loop = dyn_cast(*container)) { 
- declareOp = findReductionDeclInContainer(loop, reduction); - } else { - break; - } - - // See if we can search parent for reductions as well - container = containerOp.getParentOp(); - } - - assert(declareOp.has_value() && - "reduction operation must be associated with a declaration"); - - return *declareOp; -} - /// Populates `reductions` with reduction declarations used in the given loop. template static void @@ -1786,62 +1738,6 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp, return updateGenStatus; } -/// Converts an OpenMP reduction operation using OpenMPIRBuilder. Expects the -/// mapping between reduction variables and their private equivalents to have -/// been stored on the ModuleTranslation stack. Currently only supports -/// reduction within WsloopOp and ParallelOp, but can be easily extended. -static LogicalResult -convertOmpReductionOp(omp::ReductionOp reductionOp, - llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation) { - // Find the declaration that corresponds to the reduction op. - omp::DeclareReductionOp declaration; - Operation *reductionParent = reductionOp->getParentOp(); - if (dyn_cast(reductionParent) || - dyn_cast(reductionParent)) { - declaration = findReductionDecl(*reductionParent, reductionOp); - } else { - llvm_unreachable("Unhandled reduction container"); - } - assert(declaration && "could not find reduction declaration"); - - // Retrieve the mapping between reduction variables and their private - // equivalents. 
- const DenseMap *reductionVariableMap = nullptr; - moduleTranslation.stackWalk( - [&](const OpenMPVarMappingStackFrame &frame) { - if (frame.mapping.contains(reductionOp.getAccumulator())) { - reductionVariableMap = &frame.mapping; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - assert(reductionVariableMap && "couldn't find private reduction variables"); - // Translate the reduction operation by emitting the body of the corresponding - // reduction declaration. - Region &reductionRegion = declaration.getReductionRegion(); - llvm::Value *privateReductionVar = - reductionVariableMap->lookup(reductionOp.getAccumulator()); - llvm::Value *reductionVal = builder.CreateLoad( - moduleTranslation.convertType(reductionOp.getOperand().getType()), - privateReductionVar); - - moduleTranslation.mapValue(reductionRegion.front().getArgument(0), - reductionVal); - moduleTranslation.mapValue( - reductionRegion.front().getArgument(1), - moduleTranslation.lookupValue(reductionOp.getOperand())); - - SmallVector phis; - if (failed(inlineConvertOmpRegions(reductionRegion, "omp.reduction.body", - builder, moduleTranslation, &phis))) - return failure(); - assert(phis.size() == 1 && "expected one value to be yielded from " - "the reduction body declaration region"); - builder.CreateStore(phis[0], privateReductionVar); - return success(); -} - /// Converts an OpenMP Threadprivate operation into LLVM IR using /// OpenMPIRBuilder. 
static LogicalResult @@ -3350,9 +3246,6 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, .Case([&](omp::ParallelOp op) { return convertOmpParallel(op, builder, moduleTranslation); }) - .Case([&](omp::ReductionOp reductionOp) { - return convertOmpReductionOp(reductionOp, builder, moduleTranslation); - }) .Case([&](omp::MasterOp) { return convertOmpMaster(*op, builder, moduleTranslation); }) diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index db016fe8e7ba..115d164b6cc7 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -648,7 +648,6 @@ func.func @foo(%lb : index, %ub : index, %step : index) { omp.wsloop reduction(@foo %0 -> %prv : !llvm.ptr) { omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { %2 = arith.constant 2.0 : f32 - omp.reduction %2, %1 : f32, !llvm.ptr omp.yield } omp.terminator @@ -678,7 +677,6 @@ func.func @foo(%lb : index, %ub : index, %step : index) { omp.wsloop reduction(@add_f32 %0 -> %prv : !llvm.ptr, @add_f32 %0 -> %prv1 : !llvm.ptr) { omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { %2 = arith.constant 2.0 : f32 - omp.reduction %2, %0 : f32, !llvm.ptr omp.yield } omp.terminator @@ -713,7 +711,6 @@ func.func @foo(%lb : index, %ub : index, %step : index, %mem : memref<1xf32>) { omp.wsloop reduction(@add_f32 %mem -> %prv : memref<1xf32>) { omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { %2 = arith.constant 2.0 : f32 - omp.reduction %2, %mem : f32, memref<1xf32> omp.yield } omp.terminator diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 0d5fd9383a92..caf25a3cb59f 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -1003,8 +1003,6 @@ func.func @omp_teams(%lb : i32, %ub : i32, %if_cond : i1, %num_threads : i32, // CHECK: omp.teams reduction(@add_f32 -> %{{.+}} : !llvm.ptr) { omp.teams reduction(@add_f32 -> %0 : !llvm.ptr) { %1 = 
arith.constant 2.0 : f32 - // CHECK: omp.reduction %{{.+}}, %{{.+}} - omp.reduction %1, %0 : f32, !llvm.ptr // CHECK: omp.terminator omp.terminator } @@ -1028,15 +1026,11 @@ func.func @sections_reduction() { // CHECK: omp.section omp.section { %1 = arith.constant 2.0 : f32 - // CHECK: omp.reduction %{{.+}}, %{{.+}} - omp.reduction %1, %0 : f32, !llvm.ptr omp.terminator } // CHECK: omp.section omp.section { %1 = arith.constant 3.0 : f32 - // CHECK: omp.reduction %{{.+}}, %{{.+}} - omp.reduction %1, %0 : f32, !llvm.ptr omp.terminator } omp.terminator @@ -1130,14 +1124,10 @@ func.func @sections_reduction2() { omp.sections reduction(@add2_f32 -> %0 : memref<1xf32>) { omp.section { %1 = arith.constant 2.0 : f32 - // CHECK: omp.reduction - omp.reduction %1, %0 : f32, memref<1xf32> omp.terminator } omp.section { %1 = arith.constant 2.0 : f32 - // CHECK: omp.reduction - omp.reduction %1, %0 : f32, memref<1xf32> omp.terminator } omp.terminator From 85b9826c34bbefb0bdd42697edfda7d5192c5d1e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 May 2024 12:14:07 +0100 Subject: [PATCH 030/433] [AMDGPU] Regenerate sad.ll test checks Improve test checks for better codegen review of #92576 --- llvm/test/CodeGen/AMDGPU/sad.ll | 369 +++++++++++++++++++++++++++----- 1 file changed, 310 insertions(+), 59 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 1b0306559295..0492c5663e66 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -1,8 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: {{^}}v_sad_u32_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_pat1: +; 
GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_sad_u32 v2, s0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -16,9 +27,18 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, ret void } -; GCN-LABEL: {{^}}v_sad_u32_constant_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20 define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) { +; GCN-LABEL: v_sad_u32_constant_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0x5a +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, 90 %t0 = select i1 %icmp0, i32 %a, i32 90 @@ -32,9 +52,19 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a ret void } -; GCN-LABEL: {{^}}v_sad_u32_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_sad_u32 v2, s0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ 
-46,12 +76,28 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat1: -; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_sub_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_max_u32 s3, s0, s1 +; GCN-NEXT: s_min_u32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s3, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -66,9 +112,25 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_add_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -82,9 +144,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_max_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_max_u32 s3, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b store volatile i32 %t0, ptr addrspace(5) undef @@ -99,9 +179,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_min_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 
s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_min_u32 s3, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -117,9 +215,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_sub_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s3, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v3 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b store volatile i32 %sub0, ptr addrspace(5) undef @@ -132,11 +248,29 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ret void } -; 
GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2: -; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: s_cmp_gt_u32 s{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: v_sad_u32_multi_use_select_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s3, s0, s1 +; GCN-NEXT: s_sub_i32 s6, s1, s0 +; GCN-NEXT: s_cmp_gt_u32 s0, s1 +; GCN-NEXT: s_cselect_b32 s0, s3, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ -149,12 +283,29 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_sad_u32_vector_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; GCN-LABEL: v_sad_u32_vector_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: 
v_mov_b32_e32 v0, s15 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_sad_u32 v3, s11, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_sad_u32 v2, s10, v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_sad_u32 v1, s9, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_sad_u32 v0, s8, v0, v4 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt <4 x i32> %a, %b %t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b @@ -168,12 +319,29 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 ret void } -; GCN-LABEL: {{^}}v_sad_u32_vector_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; GCN-LABEL: v_sad_u32_vector_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_sad_u32 v3, s11, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_sad_u32 v2, s10, v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_sad_u32 v1, s9, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_sad_u32 v0, s8, v0, v4 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_endpgm 
%icmp0 = icmp ugt <4 x i32> %a, %b %sub0 = sub <4 x i32> %a, %b %sub1 = sub <4 x i32> %b, %a @@ -185,10 +353,22 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32 ret void } -; GCN-LABEL: {{^}}v_sad_u32_i16_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) { - +; GCN-LABEL: v_sad_u32_i16_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s4, s6, 0xffff +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_sad_u32 v2, s4, v1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: flat_store_short v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i16 %a, %b %t0 = select i1 %icmp0, i16 %a, i16 %b @@ -202,9 +382,22 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ret void } -; GCN-LABEL: {{^}}v_sad_u32_i16_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { +; GCN-LABEL: v_sad_u32_i16_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: flat_load_ushort v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: flat_load_ushort v1, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_load_ushort v2, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_sad_u32 v2, v0, v1, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_short v[0:1], v2 +; GCN-NEXT: s_endpgm %a = load volatile i16, ptr addrspace(1) undef %b = load volatile i16, ptr addrspace(1) undef %c = load volatile i16, ptr addrspace(1) undef @@ -219,9 +412,22 @@ 
define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ret void } -; GCN-LABEL: {{^}}v_sad_u32_i8_pat1: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) { +; GCN-LABEL: v_sad_u32_i8_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s3, s2, 0xff +; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GCN-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_sad_u32 v2, s3, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_byte v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i8 %a, %b %t0 = select i1 %icmp0, i8 %a, i8 %b @@ -235,9 +441,22 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b ret void } -; GCN-LABEL: {{^}}v_sad_u32_i8_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { +; GCN-LABEL: v_sad_u32_i8_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: flat_load_ubyte v1, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_load_ubyte v2, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_sad_u32 v2, v0, v1, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: flat_store_byte v[0:1], v2 +; GCN-NEXT: s_endpgm %a = load volatile i8, ptr addrspace(1) undef %b = load volatile i8, ptr addrspace(1) undef %c = load volatile i8, ptr addrspace(1) undef @@ -252,15 +471,26 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ret void } -; GCN-LABEL: {{^}}s_sad_u32_i8_pat2: -; GCN: s_load_dword -; GCN-DAG: 
s_bfe_u32 -; GCN-DAG: s_sub_i32 -; GCN-DAG: s_and_b32 -; GCN-DAG: s_sub_i32 -; GCN-DAG: s_lshr_b32 -; GCN: s_add_i32 define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { +; GCN-LABEL: s_sad_u32_i8_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s4, s2, 8 +; GCN-NEXT: s_and_b32 s3, s2, 0xff +; GCN-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GCN-NEXT: s_lshr_b32 s6, s2, 16 +; GCN-NEXT: s_sub_i32 s7, s2, s4 +; GCN-NEXT: s_sub_i32 s2, s4, s2 +; GCN-NEXT: s_cmp_gt_u32 s3, s5 +; GCN-NEXT: s_cselect_b32 s2, s7, s2 +; GCN-NEXT: s_add_i32 s2, s2, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: flat_store_byte v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i8 %a, %b %sub0 = sub i8 %a, %b %sub1 = sub i8 %b, %a @@ -272,12 +502,22 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % ret void } -; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1: -; GCN-DAG: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { +; GCN-LABEL: v_sad_u32_mismatched_operands_pat1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_max_u32 s6, s0, s1 +; GCN-NEXT: s_cmp_le_u32 s0, s1 +; GCN-NEXT: s_cselect_b32 s0, s0, s3 +; GCN-NEXT: s_sub_i32 s0, s6, s0 +; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: 
s_endpgm %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -291,11 +531,22 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2: -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { +; GCN-LABEL: v_sad_u32_mismatched_operands_pat2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s3, s0, s3 +; GCN-NEXT: s_sub_i32 s6, s1, s0 +; GCN-NEXT: s_cmp_lt_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s0, s3, s6 +; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %d %sub1 = sub i32 %b, %a From 27f53b266cbfcbc956687be45cb0fdc80667b98e Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Thu, 23 May 2024 13:21:40 +0200 Subject: [PATCH 031/433] [Offload] Disable flaky test on host-offloading (#93174) While we investigate the issue, we disable the test on host-offloading so the buildbots are back to more useful state. 
Issue is tracked: https://github.com/llvm/llvm-project/issues/93173 --- offload/test/offloading/dynamic_module.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/offload/test/offloading/dynamic_module.c b/offload/test/offloading/dynamic_module.c index f1e9862002a1..9dcf3a1ae649 100644 --- a/offload/test/offloading/dynamic_module.c +++ b/offload/test/offloading/dynamic_module.c @@ -2,6 +2,8 @@ // RUN: %libomptarget-compile-generic %t.so && %libomptarget-run-generic 2>&1 | %fcheck-generic // RUN: %libomptarget-compileopt-generic -DSHARED -fPIC -shared -o %t.so && \ // RUN: %libomptarget-compileopt-generic %t.so && %libomptarget-run-generic 2>&1 | %fcheck-generic +// +// UNSUPPORTED: x86_64-pc-linux-gnu #ifdef SHARED void foo() {} From 4e0f8a4919b1920ed715ca19314e6b3e06a70763 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 23 May 2024 12:22:09 +0100 Subject: [PATCH 032/433] [AMDGPU] Fix EXPENSIVE_CHECKS failure in #89612 --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 99109b23a159..04d9bb5cb18a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2089,7 +2089,7 @@ bool AMDGPUInstructionSelector::selectPOPSExitingWaveID( // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked // mayLoad/mayStore and tablegen complains about the mismatch. 
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst) - .addDef(AMDGPU::SRC_POPS_EXITING_WAVE_ID); + .addReg(AMDGPU::SRC_POPS_EXITING_WAVE_ID); MI.eraseFromParent(); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } From 5b5af52bad647321d19476cea39293802fa3cfd3 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Thu, 23 May 2024 12:46:09 +0100 Subject: [PATCH 033/433] [AArch64] Extend efficient lowering of experimental.cttz.elts (#92114) This patch extends support for more efficient lowering of the experimental.cttz.elts intrinsic to fixed-width vector types, by first creating an SVE predicate register mask from the fixed-width vector. --- .../Target/AArch64/AArch64ISelLowering.cpp | 21 ++- .../Analysis/CostModel/AArch64/cttz_elts.ll | 32 ++-- .../AArch64/intrinsic-cttz-elts-sve.ll | 146 ++++++++++++++++++ 3 files changed, 179 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bbc896dac77f..df17767138d9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1871,9 +1871,11 @@ bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const { if (!Subtarget->hasSVEorSME()) return true; - // We can only use the BRKB + CNTP sequence with legal predicate types. + // We can only use the BRKB + CNTP sequence with legal predicate types. We can + // also support fixed-width predicates. 
return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 && - VT != MVT::nxv2i1; + VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 && + VT != MVT::v4i1 && VT != MVT::v2i1; } void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { @@ -5838,9 +5840,20 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(); } case Intrinsic::experimental_cttz_elts: { - SDValue NewCttzElts = - DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1)); + SDValue CttzOp = Op.getOperand(1); + EVT VT = CttzOp.getValueType(); + assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1"); + if (VT.isFixedLengthVector()) { + // We can use SVE instructions to lower this intrinsic by first creating + // an SVE predicate register mask from the fixed-width vector. + EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT); + SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp); + CttzOp = convertFixedMaskToScalableVector(Mask, DAG); + } + + SDValue NewCttzElts = + DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp); return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType()); } } diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll index cc1532ee33dc..e1a9ee114d26 100644 --- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll +++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll @@ -13,15 +13,15 @@ define void @foo_no_vscale_range() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found 
an estimated cost of 7 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true) -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 7 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) @@ -33,15 +33,15 @@ define void @foo_no_vscale_range() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 
false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 
x i1> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll index 211237542a15..9c72afd84fa7 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll @@ -359,6 +359,152 @@ define i32 @add_i32_ctz_nxv16i1_poison( %a, i32 %b) { ret i32 %add } +; FIXED-WIDTH VECTOR TYPES + +define i32 @ctz_v16i1(<16 x i1> %a) { +; CHECK-LABEL: ctz_v16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.b +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0) + ret i32 %res +} + +define i32 @ctz_v16i1_poison(<16 x i1> %a) { +; CHECK-LABEL: ctz_v16i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; 
CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.b +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1) + ret i32 %res +} + +define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) { +; CHECK-LABEL: add_i64_ctz_v16i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: incp x0, p0.b +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> %a, i1 1) + %add = add i64 %res, %b + ret i64 %add +} + +define i32 @ctz_v8i1(<8 x i1> %a) { +; CHECK-LABEL: ctz_v8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.8b, v0.8b, #7 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.b +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 0) + ret i32 %res +} + +define i32 @ctz_v8i1_poison(<8 x i1> %a) { +; CHECK-LABEL: ctz_v8i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.8b, v0.8b, #7 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.b +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 1) + ret i32 %res +} + +define i32 @ctz_v4i1(<4 x i1> %a) { +; CHECK-LABEL: ctz_v4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.4h, 
v0.4h, #15 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.h +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 0) + ret i32 %res +} + +define i32 @ctz_v4i1_poison(<4 x i1> %a) { +; CHECK-LABEL: ctz_v4i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.4h, v0.4h, #15 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.h +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 1) + ret i32 %res +} + +define i32 @ctz_v2i1(<2 x i1> %a) { +; CHECK-LABEL: ctz_v2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #31 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.s +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 0) + ret i32 %res +} + +define i32 @ctz_v2i1_poison(<2 x i1> %a) { +; CHECK-LABEL: ctz_v2i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #31 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.s +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1) + ret i32 %res +} + declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(, i1) declare i64 
@llvm.experimental.cttz.elts.i64.nxv8i1(, i1) declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(, i1) From 84729c9df30b29d5f4e903ad71235a6aa0c764d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 23 May 2024 13:10:13 +0200 Subject: [PATCH 034/433] [clang][Interp] Don't diagnose ObjCIvarDecls as invalid reads --- clang/lib/AST/Interp/Interp.cpp | 6 ++++++ clang/test/AST/Interp/objc.mm | 8 ++++++++ 2 files changed, 14 insertions(+) create mode 100644 clang/test/AST/Interp/objc.mm diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index cac678352e2c..de5390596d63 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -18,6 +18,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/ASTDiagnostic.h" #include "clang/AST/CXXInheritance.h" +#include "clang/AST/DeclObjC.h" #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "llvm/ADT/APSInt.h" @@ -101,6 +102,11 @@ static void diagnoseNonConstVariable(InterpState &S, CodePtr OpPC, return; } + // Rather random, but this is to match the diagnostic output of the current + // interpreter. 
+ if (isa(VD)) + return; + if (VD->getType()->isIntegralOrEnumerationType()) { S.FFDiag(Loc, diag::note_constexpr_ltor_non_const_int, 1) << VD; S.Note(VD->getLocation(), diag::note_declared_at); diff --git a/clang/test/AST/Interp/objc.mm b/clang/test/AST/Interp/objc.mm new file mode 100644 index 000000000000..44b74d193b66 --- /dev/null +++ b/clang/test/AST/Interp/objc.mm @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -verify=ref,both %s + +@interface A { + int a; + static_assert(a, ""); // both-error {{static assertion expression is not an integral constant expression}} +} +@end From fbd643fb22607b933a4e129ae86a7334b62c9b55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 23 May 2024 14:01:32 +0200 Subject: [PATCH 035/433] [clang][Interp] Don't try to activate root pointers No inline descriptor means we can't do that. --- clang/lib/AST/Interp/Interp.h | 10 ++++++---- clang/test/AST/Interp/cxx98.cpp | 4 ++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index fc496b66445a..bcb6fb4d6521 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1335,17 +1335,19 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) { inline bool FinishInitPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - if (Ptr.canBeInitialized()) + if (Ptr.canBeInitialized()) { Ptr.initialize(); - Ptr.activate(); + Ptr.activate(); + } return true; } inline bool FinishInit(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.peek(); - if (Ptr.canBeInitialized()) + if (Ptr.canBeInitialized()) { Ptr.initialize(); - Ptr.activate(); + Ptr.activate(); + } return true; } diff --git a/clang/test/AST/Interp/cxx98.cpp b/clang/test/AST/Interp/cxx98.cpp index be81735329db..e68e4dbc8d74 100644 --- a/clang/test/AST/Interp/cxx98.cpp +++ b/clang/test/AST/Interp/cxx98.cpp 
@@ -50,3 +50,7 @@ _Static_assert(c0_test == 0, ""); int a = 0; // both-note {{declared here}} _Static_assert(a == 0, ""); // both-error {{static assertion expression is not an integral constant expression}} \ // both-note {{read of non-const variable 'a' is not allowed in a constant expression}} + +struct SelfReference { SelfReference &r; }; +extern SelfReference self_reference_1; +SelfReference self_reference_2 = {self_reference_1}; From 63a4133912b5b737c75c18c609d711de6f70367f Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Thu, 23 May 2024 13:23:48 +0100 Subject: [PATCH 036/433] [flang][debug] Support complex types. (#92559) This PR adds support for conversion of complex types to the corresponding DITypeAttr. Both fir and mlir types are supported. Apart from lit testing, I have also tested the types in the debugger and they work correctly. An exception is 128 bit complex which somehow requires that its name be different from `complex`. I am going to open a separate PR to add (kind=n) in the type names similar to what gfortran does. 
--- .../Transforms/DebugTypeGenerator.cpp | 12 ++++++ flang/test/Integration/debug-complex-1.f90 | 26 +++++++++++++ flang/test/Transforms/debug-complex-1.fir | 39 +++++++++++++++++++ 3 files changed, 77 insertions(+) create mode 100644 flang/test/Integration/debug-complex-1.f90 create mode 100644 flang/test/Transforms/debug-complex-1.fir diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 64c6547e06e0..1e46d5ac255e 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -57,6 +57,18 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, mlir::StringAttr::get(context, logTy.getMnemonic()), kindMapping.getLogicalBitsize(logTy.getFKind()), llvm::dwarf::DW_ATE_boolean); + } else if (fir::isa_complex(Ty)) { + unsigned bitWidth; + if (auto cplxTy = mlir::dyn_cast_or_null(Ty)) { + auto floatTy = mlir::cast(cplxTy.getElementType()); + bitWidth = floatTy.getWidth(); + } else if (auto cplxTy = mlir::dyn_cast_or_null(Ty)) { + bitWidth = kindMapping.getRealBitsize(cplxTy.getFKind()); + } else { + llvm_unreachable("Unhandled complex type"); + } + return genBasicType(context, mlir::StringAttr::get(context, "complex"), + bitWidth * 2, llvm::dwarf::DW_ATE_complex_float); } else { // FIXME: These types are currently unhandled. We are generating a // placeholder type to allow us to test supported bits. diff --git a/flang/test/Integration/debug-complex-1.f90 b/flang/test/Integration/debug-complex-1.f90 new file mode 100644 index 000000000000..c8d0da4c4baa --- /dev/null +++ b/flang/test/Integration/debug-complex-1.f90 @@ -0,0 +1,26 @@ +! 
RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +program mn + complex(kind=4) :: c4 + complex(kind=8) :: c8 + complex(kind=16) :: r + r = fn1(c4, c8) + print *, r +contains + function fn1(a, b) result (c) + complex(kind=4), intent(in) :: a + complex(kind=8), intent(in) :: b + complex(kind=16) :: c + c = a + b + end function +end program + +! CHECK-DAG: ![[C4:.*]] = !DIBasicType(name: "complex", size: 64, encoding: DW_ATE_complex_float) +! CHECK-DAG: ![[C8:.*]] = !DIBasicType(name: "complex", size: 128, encoding: DW_ATE_complex_float) +! CHECK-DAG: ![[C16:.*]] = !DIBasicType(name: "complex", size: 256, encoding: DW_ATE_complex_float) +! CHECK-DAG: !DILocalVariable(name: "c4"{{.*}}type: ![[C4]]) +! CHECK-DAG: !DILocalVariable(name: "c8"{{.*}}type: ![[C8]]) +! CHECK-DAG: !DILocalVariable(name: "r"{{.*}}type: ![[C16]]) +! CHECK-DAG: !DILocalVariable(name: "a"{{.*}}type: ![[C4]]) +! CHECK-DAG: !DILocalVariable(name: "b"{{.*}}type: ![[C8]]) +! CHECK-DAG: !DILocalVariable(name: "c"{{.*}}type: ![[C16]]) diff --git a/flang/test/Transforms/debug-complex-1.fir b/flang/test/Transforms/debug-complex-1.fir new file mode 100644 index 000000000000..a3cbd767d8a5 --- /dev/null +++ b/flang/test/Transforms/debug-complex-1.fir @@ -0,0 +1,39 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + +// check conversion of complex type of different size. Both fir and mlir +// variants are checked. 
+ +module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} { + func.func @test1(%x : !fir.complex<4>) -> !fir.complex<8> { + %1 = fir.convert %x : (!fir.complex<4>) -> !fir.complex<8> + return %1 : !fir.complex<8> + }loc(#loc1) + func.func @test2(%x : !fir.complex<4>) -> complex { + %1 = fir.convert %x : (!fir.complex<4>) -> complex + return %1 : complex + }loc(#loc2) + func.func @test3(%x : !fir.complex<4>) -> !fir.complex<16> { + %1 = fir.convert %x : (!fir.complex<4>) -> !fir.complex<16> + return %1 : !fir.complex<16> + }loc(#loc3) + func.func @test4(%x : !fir.complex<4>) -> complex { + %1 = fir.convert %x : (!fir.complex<4>) -> complex + return %1 : complex + }loc(#loc4) +} +#loc1 = loc("./simple.f90":2:1) +#loc2 = loc("./simple.f90":5:1) +#loc3 = loc("./simple.f90":8:1) +#loc4 = loc("./simple.f90":11:1) + +// CHECK-DAG: #[[CMPX8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[CMPX4:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[CMPX16:.*]] = #llvm.di_basic_type + +// CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX8]], #[[CMPX4]]> +// CHECK-DAG: #[[TY2:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX16]], #[[CMPX4]]> + +// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test1"{{.*}}type = #[[TY1]]> +// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test2"{{.*}}type = #[[TY1]]> +// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test3"{{.*}}type = #[[TY2]]> +// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test4"{{.*}}type = #[[TY2]]> From 7b185719a8bed373b83b92db73eaf8b84af8da72 Mon Sep 17 00:00:00 2001 From: 2LoS Date: Thu, 23 May 2024 14:24:01 +0200 Subject: [PATCH 037/433] Removed redundant template in '__delete_node()' member function of '__forward_list_base' and '__list_imp' classes. 
(#84323) --- libcxx/include/forward_list | 1 - libcxx/include/list | 1 - 2 files changed, 2 deletions(-) diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 5a7521eed410..80dd49fe3d75 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -554,7 +554,6 @@ protected: return __guard.__release_ptr(); } - template _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) { // For the same reason as above, we use the allocator's destroy() method for the value_type, // but not for the node itself. diff --git a/libcxx/include/list b/libcxx/include/list index 90bddcc29db0..610a24e38460 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -567,7 +567,6 @@ protected: return __guard.__release_ptr(); } - template _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) { // For the same reason as above, we use the allocator's destroy() method for the value_type, // but not for the node itself. From 01f143dd39dc14029943dcf6eb2f7bbc2d82d6d4 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 23 May 2024 09:27:40 -0300 Subject: [PATCH 038/433] [clang] fix printing of canonical template template parameters (#93124) --- clang/lib/AST/TemplateBase.cpp | 14 ++++++++++++-- clang/test/SemaCXX/cxx20-ctad-type-alias.cpp | 2 +- clang/test/SemaTemplate/deduction-guide.cpp | 10 +++++----- clang/test/SemaTemplate/make_integer_seq.cpp | 4 ++-- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index 3310d7dc24c5..a7ee973b7f7d 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -538,9 +538,19 @@ void TemplateArgument::print(const PrintingPolicy &Policy, raw_ostream &Out, Out << "nullptr"; break; - case Template: - getAsTemplate().print(Out, Policy, TemplateName::Qualified::Fully); + case Template: { + TemplateName TN = getAsTemplate(); + if (const auto *TD = TN.getAsTemplateDecl(); + TD && 
TD->getDeclName().isEmpty()) { + assert(isa(TD) && + "Unexpected anonymous template"); + const auto *TTP = cast(TD); + Out << "template-parameter-" << TTP->getDepth() << "-" << TTP->getIndex(); + } else { + TN.print(Out, Policy, TemplateName::Qualified::Fully); + } break; + } case TemplateExpansion: getAsTemplateOrTemplatePattern().print(Out, Policy); diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp index 4c6ef5adae7d..b71dfc6ccaf4 100644 --- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp +++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp @@ -284,7 +284,7 @@ class Foo {}; // Verify that template template type parameter TTP is referenced/used in the // template arguments of the RHS. template typename TTP> -using Bar = Foo>; // expected-note {{candidate template ignored: could not match 'Foo>' against 'int'}} +using Bar = Foo>; // expected-note {{candidate template ignored: could not match 'Foo>' against 'int'}} template class Container {}; diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp index 0eaeb49e6b32..c38b647e42f4 100644 --- a/clang/test/SemaTemplate/deduction-guide.cpp +++ b/clang/test/SemaTemplate/deduction-guide.cpp @@ -102,9 +102,9 @@ using CT = C; // CHECK: |-NonTypeTemplateParmDecl {{.*}} 'type-parameter-0-2' depth 0 index 3 V // CHECK: | `-TemplateArgument {{.*}} expr // CHECK: | `-IntegerLiteral {{.*}} 'int' 0 -// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y<>, type-parameter-0-2) -> C' +// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y, type-parameter-0-2) -> C' // CHECK: | |-ParmVarDecl {{.*}} 'A' -// CHECK: | |-ParmVarDecl {{.*}} 'Y<>' +// CHECK: | |-ParmVarDecl {{.*}} 'Y' // CHECK: | `-ParmVarDecl {{.*}} 'type-parameter-0-2' // CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (int, Y, int) -> C' // CHECK: |-TemplateArgument type 'int' @@ -114,12 +114,12 @@ using CT = C; // CHECK: |-ParmVarDecl {{.*}} 'int' // CHECK: |-ParmVarDecl {{.*}} 
'Y' // CHECK: `-ParmVarDecl {{.*}} 'int' -// CHECK: FunctionProtoType {{.*}} 'auto (A, Y<>, type-parameter-0-2) -> C' dependent trailing_return cdecl +// CHECK: FunctionProtoType {{.*}} 'auto (A, Y, type-parameter-0-2) -> C' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'C' dependent // CHECK: |-TemplateTypeParmType {{.*}} 'A' dependent depth 0 index 0 // CHECK: | `-TemplateTypeParm {{.*}} 'A' -// CHECK: |-ElaboratedType {{.*}} 'Y<>' sugar dependent -// CHECK: | `-TemplateSpecializationType {{.*}} 'Y<>' dependent Y +// CHECK: |-ElaboratedType {{.*}} 'Y' sugar dependent +// CHECK: | `-TemplateSpecializationType {{.*}} 'Y' dependent Y // CHECK: | `-TemplateArgument template // CHECK: `-TemplateTypeParmType {{.*}} 'type-parameter-0-2' dependent depth 0 index 2 diff --git a/clang/test/SemaTemplate/make_integer_seq.cpp b/clang/test/SemaTemplate/make_integer_seq.cpp index 3a692f5ae2bf..c5a1e2705368 100644 --- a/clang/test/SemaTemplate/make_integer_seq.cpp +++ b/clang/test/SemaTemplate/make_integer_seq.cpp @@ -61,7 +61,7 @@ using test2 = B; template