From 02b4dd04626c31f225db2bb8e9e0b82cddd28c25 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Thu, 20 Nov 2025 17:34:15 -0800 Subject: [PATCH 01/49] [AArch64] Restrict TBI to ignore top 4 bits for Darwin targets. In order to allow arm64 code to run on MTE environments, we need to make the compiler only assume the top 4 bits can be ignored as MTE occupies the lower 4. rdar://164645323 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 7 ++++++- llvm/test/CodeGen/AArch64/tbi.ll | 14 +++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 30eb19036ddda..3012343386c07 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24759,7 +24759,12 @@ static SDValue performPostLD1Combine(SDNode *N, static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { - APInt DemandedMask = APInt::getLowBitsSet(64, 56); + const auto &Subtarget = DAG.getSubtarget(); + // If MTE is enabled, TBI only applies to the top 4 bits. + // Both arm64 and arm64e processes on Darwin may run with MTE enabled. + unsigned NumIgnoreBits = + Subtarget.hasMTE() || Subtarget.isTargetDarwin() ? 4 : 8; + APInt DemandedMask = APInt::getLowBitsSet(64, 64 - NumIgnoreBits); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); diff --git a/llvm/test/CodeGen/AArch64/tbi.ll b/llvm/test/CodeGen/AArch64/tbi.ll index 285726a485b87..a7c9b4cddf808 100644 --- a/llvm/test/CodeGen/AArch64/tbi.ll +++ b/llvm/test/CodeGen/AArch64/tbi.ll @@ -7,7 +7,7 @@ ; TBI-NOT: and x ; NO_TBI: and x define i32 @ld_and32(i64 %p) { - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr %load = load i32, ptr %cast ret i32 %load @@ -18,7 +18,7 @@ define i32 @ld_and32(i64 %p) { ; TBI-NOT: and x ; NO_TBI: and x define i32 @ld_and_plus_offset(i64 %p) { - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr %gep = getelementptr i32, ptr %cast, i64 4 %load = load i32, ptr %gep @@ -40,7 +40,7 @@ define i32 @ld_and32_wider(i64 %p) { ; TBI-NOT: and x ; NO_TBI: and x define i64 @ld_and64(i64 %p) { - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr %load = load i64, ptr %cast ret i64 %load @@ -50,7 +50,7 @@ define i64 @ld_and64(i64 %p) { ; TBI-NOT: and x ; NO_TBI: and x define void @st_and32(i64 %p, i32 %v) { - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr store i32 %v, ptr %cast ret void @@ -62,7 +62,7 @@ define void @st_and32(i64 %p, i32 %v) { ; NO_TBI: and x define i32 @ld_ro(i64 %a, i64 %b) { %p = add i64 %a, %b - %and = and i64 %p, 72057594037927935 + %and = and i64 %p, 1152921504606846975 %cast = inttoptr i64 %and to ptr %load = load i32, ptr %cast ret i32 %load @@ -73,7 +73,7 @@ define i32 @ld_ro(i64 %a, i64 %b) { ; TBI-NOT: and x ; NO_TBI: and x define i32 @ld_ro2(i64 %a, i64 %b) { - %and = and i64 %a, 72057594037927935 + %and = and i64 %a, 1152921504606846975 %p = add i64 %and, %b %cast = inttoptr i64 %p to ptr %load = load i32, ptr %cast @@ -85,7 +85,7 @@ define i32 @ld_ro2(i64 %a, i64 %b) { ; TBI-NOT: and x ; NO_TBI: and x define i32 @ld_indirect_and(i64 %r1, i64 %r2) { - %and = and i64 %r1, 72057594037927935 + %and = and i64 %r1, 
1152921504606846975
   %p = or i64 %and, %r2
   %cast = inttoptr i64 %p to ptr
   %load = load i32, ptr %cast

From 6dafa096f417378e7e7b5540fca14d8a0b4b4e59 Mon Sep 17 00:00:00 2001
From: jimingham
Date: Wed, 10 Dec 2025 16:26:25 -0800
Subject: [PATCH 02/49] Add a _regexp-break-add and some more tests for the b
 alias. (#171236)

This commit leaves "b" aliased to the old _regexp-break for now. The two
variants are identical except that `_regexp-break` allows you to say
`(lldb) b <args>`, which gets translated to `break set <args>`, so switching
people to `_regexp-break-add` would be a surprising behavior change.

It would be wrong for `_regexp-break-add` to have one branch that calls
`break set`, so to avoid that surprise I'll add the new command and let
people who want to play with `break add` instead of `break set` set the
alias to the new one by hand for now.
---
 .../source/Interpreter/CommandInterpreter.cpp | 88 +++++++++++++++++++
 .../TestRegexpBreakCommand.py                 | 28 ++++--
 .../API/terminal/TestEditlineCompletions.py   |  2 +-
 3 files changed, 111 insertions(+), 7 deletions(-)

diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index afc1753e21c46..0198ddcfa31e0 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -316,6 +316,11 @@ void CommandInterpreter::Initialize() {
     AddAlias("continue", cmd_obj_sp);
   }

+  // At this point, I'm leaving "b" command aliased to "_regexp-break". There's
+  // a catch-all regexp in the command that takes any unrecognized input and
+  // runs it as `break set <args>` and switching the command to break add
+  // would change that behavior. People who want to use the break add for the
+  // "b" alias can do so in their .lldbinit.
   cmd_obj_sp = GetCommandSPExact("_regexp-break");
   if (cmd_obj_sp)
     AddAlias("b", cmd_obj_sp)->SetSyntax(cmd_obj_sp->GetSyntax());
@@ -668,6 +673,89 @@ void CommandInterpreter::LoadCommandDictionary() {
     }
   }

+  // clang-format off
+  // FIXME: It would be simpler to just use the linespecs directly here, but
+  //        the `b` alias allows "foo.c : 12 : 45" but the linespec parser
+  //        is more rigorous, and doesn't strip spaces, so the two are not equivalent.
+ const char *break_add_regexes[][2] = { + {"^(.*[^[:space:]])[[:space:]]*:[[:space:]]*([[:digit:]]+)[[:space:]]*:[[:space:]]*([[:digit:]]+)[[:space:]]*$", + "breakpoint add file --file '%1' --line %2 --column %3"}, + {"^(.*[^[:space:]])[[:space:]]*:[[:space:]]*([[:digit:]]+)[[:space:]]*$", + "breakpoint add file --file '%1' --line %2"}, + {"^/([^/]+)/$", "breakpoint add pattern -- %1"}, + {"^([[:digit:]]+)[[:space:]]*$", + "breakpoint add file --line %1"}, + {"^\\*?(0x[[:xdigit:]]+)[[:space:]]*$", + "breakpoint add address %1"}, + {"^[\"']?([-+]?\\[.*\\])[\"']?[[:space:]]*$", + "breakpoint add name '%1'"}, + {"^(-.*)$", + "breakpoint add name '%1'"}, + {"^(.*[^[:space:]])`(.*[^[:space:]])[[:space:]]*$", + "breakpoint add name '%2' --shlib '%1'"}, + {"^\\&(.*[^[:space:]])[[:space:]]*$", + "breakpoint add name '%1' --skip-prologue=0"}, + {"^[\"']?(.*[^[:space:]\"'])[\"']?[[:space:]]*$", + "breakpoint add name '%1'"}}; + // clang-format on + + size_t num_add_regexes = std::size(break_add_regexes); + + std::unique_ptr break_add_regex_cmd_up( + new CommandObjectRegexCommand( + *this, "_regexp-break-add", + "Set a breakpoint using one of several shorthand formats, or list " + "the existing breakpoints if no arguments are provided.", + "\n" + "_regexp-break-add ::\n" + " main.c:12:21 // Break at line 12 and column " + "21 of main.c\n\n" + "_regexp-break-add :\n" + " main.c:12 // Break at line 12 of " + "main.c\n\n" + "_regexp-break-add \n" + " 12 // Break at line 12 of current " + "file\n\n" + "_regexp-break-add 0x
\n" + " 0x1234000 // Break at address " + "0x1234000\n\n" + "_regexp-break-add \n" + " main // Break in 'main' after the " + "prologue\n\n" + "_regexp-break-add &\n" + " &main // Break at first instruction " + "in 'main'\n\n" + "_regexp-break-add `\n" + " libc.so`malloc // Break in 'malloc' from " + "'libc.so'\n\n" + "_regexp-break-add //\n" + " /break here/ // Break on source lines in " + "current file\n" + " // containing text 'break " + "here'.\n" + "_regexp-break-add\n" + " // List the existing " + "breakpoints\n", + lldb::eSymbolCompletion | lldb::eSourceFileCompletion, false)); + + if (break_add_regex_cmd_up) { + bool success = true; + for (size_t i = 0; i < num_add_regexes; i++) { + success = break_add_regex_cmd_up->AddRegexCommand( + break_add_regexes[i][0], break_add_regexes[i][1]); + if (!success) + break; + } + success = + break_add_regex_cmd_up->AddRegexCommand("^$", "breakpoint list --full"); + + if (success) { + CommandObjectSP break_add_regex_cmd_sp(break_add_regex_cmd_up.release()); + m_command_dict[std::string(break_add_regex_cmd_sp->GetCommandName())] = + break_add_regex_cmd_sp; + } + } + std::unique_ptr tbreak_regex_cmd_up( new CommandObjectRegexCommand( *this, "_regexp-tbreak", diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestRegexpBreakCommand.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestRegexpBreakCommand.py index 235a41d1adef3..930d497032171 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestRegexpBreakCommand.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestRegexpBreakCommand.py @@ -10,10 +10,15 @@ class RegexpBreakCommandTestCase(TestBase): - def test(self): + def test_set_version(self): """Test _regexp-break command.""" self.build() - self.regexp_break_command() + self.regexp_break_command("_regexp-break") + + def test_add_version(self): + """Test _regexp-break-add command.""" + self.build() + self.regexp_break_command("_regexp-break-add") def setUp(self): # Call super's setUp(). @@ -22,12 +27,12 @@ def setUp(self): self.source = "main.c" self.line = line_number(self.source, "// Set break point at this line.") - def regexp_break_command(self): + def regexp_break_command(self, cmd_name): """Test the super consie "b" command, which is analias for _regexp-break.""" exe = self.getBuildArtifact("a.out") self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) - break_results = lldbutil.run_break_set_command(self, "b %d" % self.line) + break_results = lldbutil.run_break_set_command(self, f"{cmd_name} {self.line}") lldbutil.check_breakpoint_result( self, break_results, @@ -37,7 +42,7 @@ def regexp_break_command(self): ) break_results = lldbutil.run_break_set_command( - self, "b %s:%d" % (self.source, self.line) + self, f"{cmd_name} {self.source}:{self.line}" ) lldbutil.check_breakpoint_result( self, @@ -50,7 +55,7 @@ def regexp_break_command(self): # Check breakpoint with full file path. full_path = os.path.join(self.getSourceDir(), self.source) break_results = lldbutil.run_break_set_command( - self, "b %s:%d" % (full_path, self.line) + self, f"{cmd_name} {full_path}:{self.line}" ) lldbutil.check_breakpoint_result( self, @@ -60,6 +65,17 @@ def regexp_break_command(self): num_locations=1, ) + # Check breakpoint with symbol name. I'm also passing in + # the module so I can check the number of locations. 
+ exe_spec = lldb.SBFileSpec(exe) + exe_filename = exe_spec.basename + cmd = f"{cmd_name} {exe_filename}`main" + print(f"About to run: '{cmd}'") + break_results = lldbutil.run_break_set_command(self, cmd) + lldbutil.check_breakpoint_result( + self, break_results, symbol_name="main", num_locations=1 + ) + self.runCmd("run", RUN_SUCCEEDED) # The stop reason of the thread should be breakpoint. diff --git a/lldb/test/API/terminal/TestEditlineCompletions.py b/lldb/test/API/terminal/TestEditlineCompletions.py index b4ea0f39ec10c..ac1d3f90e2970 100644 --- a/lldb/test/API/terminal/TestEditlineCompletions.py +++ b/lldb/test/API/terminal/TestEditlineCompletions.py @@ -72,11 +72,11 @@ def test_completion_pagination(self): self.child.expect("Available completions:") self.child.expect(" _regexp-attach") self.child.expect(" _regexp-break") + self.child.expect(" _regexp-break-add") self.child.expect(" _regexp-bt") self.child.expect(" _regexp-display") self.child.expect(" _regexp-down") self.child.expect(" _regexp-env") - self.child.expect(" _regexp-jump") self.child.expect("More") @skipIfAsan From 3abaed8086fae30726b1ffb6fbbad69b1d42a7d0 Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Wed, 10 Dec 2025 16:38:44 -0800 Subject: [PATCH 03/49] [TableGen] Replace reachable assertion with error in *ByHwMode Having duplicate mode entries previously asserted (or silently replaced the last value with a new one in release builds). Report an error with a helpful message instead. Pull Request: https://github.com/llvm/llvm-project/pull/171715 --- llvm/test/TableGen/RegClassByHwModeErrors.td | 21 +++++++++ llvm/utils/TableGen/Common/InfoByHwMode.cpp | 47 ++++++++++++-------- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/llvm/test/TableGen/RegClassByHwModeErrors.td b/llvm/test/TableGen/RegClassByHwModeErrors.td index 0ee6370ccd0ce..c7731312e28a6 100644 --- a/llvm/test/TableGen/RegClassByHwModeErrors.td +++ b/llvm/test/TableGen/RegClassByHwModeErrors.td @@ -9,6 +9,8 @@ // RUN: %t/compress-regclass-by-hwmode-2.td -o /dev/null 2>&1 | FileCheck %t/compress-regclass-by-hwmode-2.td --implicit-check-not="error:" // RUN: not llvm-tblgen --gen-dag-isel -I %p/../../include -I %t -I %S \ // RUN: %t/vt-by-hwmode-missing.td -o /dev/null 2>&1 | FileCheck %t/vt-by-hwmode-missing.td --implicit-check-not="error:" +// RUN: not llvm-tblgen --gen-dag-isel -I %p/../../include -I %t -I %S \ +// RUN: %t/multiple-entries-for-same-mode.td -o /dev/null 2>&1 | FileCheck %t/multiple-entries-for-same-mode.td --implicit-check-not="error:" //--- Common.td include "Common/RegClassByHwModeCommon.td" @@ -119,3 +121,22 @@ def TEST : TestInstruction { def MyTargetISA : InstrInfo; def MyTarget : Target { let InstructionSet = MyTargetISA; } + + +//--- multiple-entries-for-same-mode.td +include "Common.td" +/// We should get an error if the same mode is listed more than once +defvar Ptr64Alias = Ptr64; +def BadRegClass : RegClassByHwMode<[Ptr32, Ptr64, Ptr64Alias], [XRegs, YRegs, YRegs]>; +// CHECK: [[#@LINE-1]]:5: error: duplicate RegisterClass entry for HwMode Ptr64: YRegs +// Need at least one CompressPat use of the bad reg class to trigger the error: +def USE_BAD_REG_CLASS : TestInstruction { + let OutOperandList = (outs BadRegClass:$dst); + let InOperandList = (ins BadRegClass:$src1, BadRegClass:$src2); + let AsmString = "bad $dst"; + let Pattern = [(set BadRegClass:$dst, (add BadRegClass:$src1, BadRegClass:$src2))]; +} +def MyTargetISA : InstrInfo; +def MyTarget : Target { + let InstructionSet = MyTargetISA; +} diff --git 
a/llvm/utils/TableGen/Common/InfoByHwMode.cpp b/llvm/utils/TableGen/Common/InfoByHwMode.cpp index a3f8909c36090..4ab27a610249d 100644 --- a/llvm/utils/TableGen/Common/InfoByHwMode.cpp +++ b/llvm/utils/TableGen/Common/InfoByHwMode.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include @@ -32,10 +33,12 @@ std::string llvm::getModeName(unsigned Mode) { ValueTypeByHwMode::ValueTypeByHwMode(const Record *R, const CodeGenHwModes &CGH) : InfoByHwMode(R) { const HwModeSelect &MS = CGH.getHwModeSelect(R); - for (const HwModeSelect::PairType &P : MS.Items) { - auto I = Map.try_emplace(P.first, MVT(llvm::getValueType(P.second))); - assert(I.second && "Duplicate entry?"); - (void)I; + for (auto [ModeID, VT] : MS.Items) { + assert(VT && VT->isSubClassOf("ValueType")); + if (!Map.try_emplace(ModeID, MVT(llvm::getValueType(VT))).second) + PrintFatalError(R->getLoc(), "duplicate ValueType entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + VT->getName()); } if (R->isSubClassOf("PtrValueType")) PtrAddrSpace = R->getValueAsInt("AddrSpace"); @@ -143,10 +146,12 @@ RegSizeInfoByHwMode::RegSizeInfoByHwMode(const Record *R, const CodeGenHwModes &CGH) : InfoByHwMode(R) { const HwModeSelect &MS = CGH.getHwModeSelect(R); - for (const HwModeSelect::PairType &P : MS.Items) { - auto I = Map.try_emplace(P.first, RegSizeInfo(P.second)); - assert(I.second && "Duplicate entry?"); - (void)I; + for (auto [ModeID, RegInfo] : MS.Items) { + assert(RegInfo && RegInfo->isSubClassOf("RegInfo")); + if (!Map.try_emplace(ModeID, RegSizeInfo(RegInfo)).second) + PrintFatalError(R->getLoc(), "duplicate RegInfo entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + RegInfo->getName()); } } @@ -198,7 +203,9 @@ RegClassByHwMode::RegClassByHwMode(const Record *R, const CodeGenHwModes &CGH, "Register class must subclass RegisterClass"); const CodeGenRegisterClass *RegClass = RegBank.getRegClass(RegClassRec); if (!Map.try_emplace(ModeID, RegClass).second) - llvm_unreachable("duplicate entry"); + PrintFatalError(R->getLoc(), "duplicate RegisterClass entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + RegClass->getName()); } } @@ -211,10 +218,12 @@ SubRegRangeByHwMode::SubRegRangeByHwMode(const Record *R, const CodeGenHwModes &CGH) : InfoByHwMode(R) { const HwModeSelect &MS = CGH.getHwModeSelect(R); - for (const HwModeSelect::PairType &P : MS.Items) { - auto I = Map.try_emplace(P.first, SubRegRange(P.second)); - assert(I.second && "Duplicate entry?"); - (void)I; + for (auto [ModeID, Range] : MS.Items) { + assert(Range && Range->isSubClassOf("SubRegRange")); + if (!Map.try_emplace(ModeID, SubRegRange(Range)).second) + PrintFatalError(R->getLoc(), "duplicate SubRegRange entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + Range->getName()); } } @@ -222,12 +231,14 @@ EncodingInfoByHwMode::EncodingInfoByHwMode(const Record *R, const CodeGenHwModes &CGH) : InfoByHwMode(R) { const HwModeSelect &MS = CGH.getHwModeSelect(R); - for (const HwModeSelect::PairType &P : MS.Items) { - assert(P.second && P.second->isSubClassOf("InstructionEncoding") && + for (auto [ModeID, Encoding] : MS.Items) { + assert(Encoding && Encoding->isSubClassOf("InstructionEncoding") && "Encoding must subclass InstructionEncoding"); - auto I = Map.try_emplace(P.first, P.second); - assert(I.second && "Duplicate entry?"); - (void)I; + if (!Map.try_emplace(ModeID, Encoding).second) + 
PrintFatalError(R->getLoc(), + "duplicate InstructionEncoding entry for HwMode " + + CGH.getModeName(ModeID, true) + ": " + + Encoding->getName()); } } From c5470e0f1f51291ac2b59c27a8278494e9f67ebd Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Wed, 10 Dec 2025 16:39:14 -0800 Subject: [PATCH 04/49] [RISC-V][MC] Fix tied operand register class mismatch in P-extension I have a change to validate the operand classes emitted in the AsmParser and that caused llvm/test/MC/RISCV/rv32p-valid.s to fail due to the rd_wb register using a different register class from rd: `PWADDA_H operand 1 register X6 is not a member of register class GPRPair` This happens because tablegen's AsmMatcherEmitter emits code to literally copy over the tied registers and does not feed them through the equivalent of RISCVAsmParser::validateTargetOperandClass() which would allow adjusting these operand classes. Ideally we would handle this in tablegen (or at least add an error), but the tied operand handling logic is rather complex and I don't understand it yet. For now just update the rd register class to match rd_wb. Pull Request: https://github.com/llvm/llvm-project/pull/171738 --- llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index bba9f961b9639..7250a48bfe895 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -437,7 +437,7 @@ class RVPTernary_rrr f, bits<2> w, bits<3> funct3, string opcodestr> let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class RVPWideningTernary_rrr f, bits<2> w, string opcodestr> : RVPWideningBase { + (ins GPRPairRV32:$rd, GPR:$rs1, GPR:$rs2), opcodestr> { let Inst{30-27} = f; let Constraints = "$rd = $rd_wb"; From 62aaa3a9ac04763b8a5ba91d6f3d8754e4db7e85 Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Wed, 10 Dec 2025 19:40:22 -0500 Subject: [PATCH 05/49] [compiler-rt] follow-up to 166837, rename COMPILER_RT_FORCE_TEST_BUILTINS_DIR to COMPILER_RT_TEST_BUILTINS_DIR (#171741) Co-authored-by: David Tenty --- llvm/runtimes/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 658db4f2babc9..c8febd79b2484 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -265,7 +265,7 @@ function(runtime_default_target) list(APPEND test_targets runtimes-test-depends check-runtimes check-builtins) # The default runtimes target can run tests the default builtins target - list(APPEND ARG_CMAKE_ARGS "-DCOMPILER_RT_FORCE_TEST_BUILTINS_DIR=${LLVM_BINARY_DIR}/runtimes/builtins-bins/") + list(APPEND ARG_CMAKE_ARGS "-DCOMPILER_RT_TEST_BUILTINS_DIR=${LLVM_BINARY_DIR}/runtimes/builtins-bins/") endif() set_enable_per_target_runtime_dir() @@ -376,7 +376,7 @@ function(runtime_register_target name) # If a builtins-${name} target exists, we'll test those builtins # with this runtimes build if(TARGET builtins-${name}) - list(APPEND ARG_CMAKE_ARGS "-DCOMPILER_RT_FORCE_TEST_BUILTINS_DIR=${LLVM_BINARY_DIR}/runtimes/builtins-${name}-bins/") + list(APPEND ARG_CMAKE_ARGS "-DCOMPILER_RT_TEST_BUILTINS_DIR=${LLVM_BINARY_DIR}/runtimes/builtins-${name}-bins/") set(check-builtins-${name} check-builtins) list(APPEND ${name}_test_targets check-builtins-${name}) list(APPEND test_targets check-builtins-${name}) From 16e6055273e14bd9b8402a6c0e7f3793e287ea0e Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: 
Wed, 10 Dec 2025 16:49:59 -0800 Subject: [PATCH 06/49] =?UTF-8?q?Revert=20"[SelectionDAG]=20Salvage=20debu?= =?UTF-8?q?ginfo=20when=20combining=20load=20and=20sext=E2=80=A6=20(#17174?= =?UTF-8?q?5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … instrs. (#169779)" This reverts commit 2b958b9ee24b8ea36dcc777b2d1bcfb66c4972b6. I might have broken the sanitizer-x86_64-linux bot /home/b/sanitizer-x86_64-linux/build/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_linux.cpp clang++: /home/b/sanitizer-x86_64-linux/build/llvm-project/llvm/include/llvm/ADT/ArrayRef.h:248: const T &llvm::ArrayRef::operator[](size_t) const [T = llvm::DbgValueLocEntry]: Assertion `Index < Length && "Invalid index!"' failed. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 41 +---------- .../X86/selectionDAG-load-sext-trunc.ll | 70 ------------------- .../DebugInfo/X86/selectionDAG-load-sext.ll | 61 ---------------- 3 files changed, 2 insertions(+), 170 deletions(-) delete mode 100644 llvm/test/DebugInfo/X86/selectionDAG-load-sext-trunc.ll delete mode 100644 llvm/test/DebugInfo/X86/selectionDAG-load-sext.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 569ab4cfa3efe..6a99d4e29b64f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -51,7 +51,6 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" @@ -79,7 +78,6 @@ #include #include "MatchContext.h" -#include "SDNodeDbgValue.h" using namespace llvm; using namespace llvm::SDPatternMatch; @@ -14467,44 +14465,10 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc); - unsigned Opcode = N->getOpcode(); - bool IsSigned = Opcode == ISD::SIGN_EXTEND; // If the load value is used only by N, replace it via CombineTo N. - SDValue OldLoadVal(LN0, 0); - SDValue OldExtValue(N, 0); - bool NoReplaceTrunc = OldLoadVal.hasOneUse(); - - // Because we are replacing a load and a s|z ext with a load-s|z ext - // instruction, the dbg_value attached to the load will be of a smaller bit - // width, and we have to add a DW_OP_LLVM_convert expression to get the - // correct size. - auto SalvageToOldLoadSize = [&](SDValue From, SDValue To, bool IsSigned) { - for (SDDbgValue *Dbg : DAG.GetDbgValues(From.getNode())) { - unsigned VarBitsFrom = From->getValueSizeInBits(0); - unsigned VarBitsTo = To->getValueSizeInBits(0); - - // Build a convert expression for the s|z extend. - const DIExpression *OldE = Dbg->getExpression(); - auto *NewE = - DIExpression::appendExt(OldE, VarBitsFrom, VarBitsTo, IsSigned); - - // Create a new SDDbgValue that points at the widened node with the - // fragment. 
- Dbg->setIsInvalidated(); - Dbg->setIsEmitted(); - SDDbgValue *NewDV = DAG.getDbgValue( - Dbg->getVariable(), NewE, To.getNode(), To.getResNo(), - Dbg->isIndirect(), Dbg->getDebugLoc(), Dbg->getOrder()); - DAG.AddDbgValue(NewDV, /*isParametet*/ false); - } - }; - + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + Combiner.CombineTo(N, ExtLoad); if (NoReplaceTrunc) { - if (LN0->getHasDebugValue()) - SalvageToOldLoadSize(OldLoadVal, ExtLoad, IsSigned); - - if (N->getHasDebugValue()) - DAG.transferDbgValues(OldExtValue, ExtLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); Combiner.recursivelyDeleteUnusedNodes(LN0); } else { @@ -14512,7 +14476,6 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1)); } - Combiner.CombineTo(N, ExtLoad); return SDValue(N, 0); // Return N so it doesn't get rechecked! } diff --git a/llvm/test/DebugInfo/X86/selectionDAG-load-sext-trunc.ll b/llvm/test/DebugInfo/X86/selectionDAG-load-sext-trunc.ll deleted file mode 100644 index 46afb9c598a74..0000000000000 --- a/llvm/test/DebugInfo/X86/selectionDAG-load-sext-trunc.ll +++ /dev/null @@ -1,70 +0,0 @@ -; This test checks that after SelectionDAG runs, it preserves the debug info that is lost due to the DAGCombiner combining a load and a sext instruction, where the #dbg_value is pointing to the result of the load. -; However, in this case, the load has multiple uses. - -; RUN: llc %s -mtriple=x86_64-unkown-linux -start-before=x86-isel -stop-after=x86-isel -o - | FileCheck %s --check-prefix=MIR -; RUN: llc -O2 %s -start-before=x86-isel -mtriple=x86_64-unkown-linux --filetype=obj -o %t.o -; RUN: llvm-dwarfdump %t.o --name Idx | FileCheck %s --check-prefix=DUMP -; RUN: llvm-dwarfdump %t.o --name Idx2 | FileCheck %s --check-prefix=DUMP2 - -; MIR: ![[IDX:[0-9]+]] = !DILocalVariable(name: "Idx" -; MIR: ![[IDX2:[0-9]+]] = !DILocalVariable(name: "Idx2" -; MIR: name: _Z8useValuei -; MIR: name: main -; MIR: debugValueSubstitutions -; MIR-NEXT: - { srcinst: [[INSTR_NUM2:[0-9]+]], srcop: 0, dstinst: [[INSTR_NUM:[0-9]+]], dstop: 0, subreg: 6 } -; MIR-LABEL: bb.0 (%ir-block.0) -; MIR: %{{[0-9a-f]+}}{{.*}} = MOVSX64rm32 ${{.*}}, 1, $noreg, @GlobArr, $noreg, debug-instr-number [[INSTR_NUM]] -; MIR-NEXT: {{.*}} = COPY %0.sub_32bit -; MIR-NEXT DBG_INSTR_REF ![[IDX]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref([[INSTR_NUM2]], 0) -; MIR-NEXT DBG_INSTR_REF ![[IDX2]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref([[INSTR_NUM]], 0) - -; DUMP: DW_AT_location (indexed ({{[0-9a-f]+}}x{{[0-9a-f]+}}) loclist = 0x{{[0-9a-f]+}}: -; DUMP-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): DW_OP_reg3 RBX) - -; DUMP2: DW_AT_location (indexed ({{[0-9a-f]+}}x{{[0-9a-f]+}}) loclist = 0x{{[0-9a-f]+}}: -; DUMP2-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): DW_OP_reg3 RBX) - - - - @GlobArr = dso_local local_unnamed_addr global [5 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5], align 16, !dbg !0 - @__const.main.Data = private unnamed_addr constant [7 x i32] [i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70], align 16 - define dso_local void @_Z8useValuei(i32 noundef %0) local_unnamed_addr #0 !dbg !22 { - ret void, !dbg !28 - } - define dso_local noundef i32 @main() local_unnamed_addr #1 !dbg !29 { - %1 = load i32, ptr @GlobArr - #dbg_value(i32 %1, !43, !DIExpression(), !52) - %2 = sext i32 %1 to i64 - #dbg_value(i64 %2, !57, !DIExpression(), !52) - tail call void @_Z8useValuei(i32 noundef %1), 
!dbg !56 - %3 = getelementptr inbounds i32, ptr @__const.main.Data, i64 %2 - %4 = load i32, ptr %3 - tail call void @_Z8useValuei(i32 noundef %4), !dbg !56 - ret i32 0 - } - !llvm.dbg.cu = !{!2} - !llvm.module.flags = !{!10, !11, !16} - !0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) - !1 = distinct !DIGlobalVariable(type: !6, isDefinition: true) - !2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, emissionKind: FullDebug, nameTableKind: None) - !3 = !DIFile(filename: "/tmp/test.cpp", directory: "/Users/srastogi/Development/llvm-project/build_ninja", checksumkind: CSK_MD5, checksum: "0fe735937e606b4db3e3b2e9253eff90") - !6 = !DICompositeType(tag: DW_TAG_array_type, elements: !8) - !7 = !DIBasicType() - !8 = !{} - !10 = !{i32 7, !"Dwarf Version", i32 5} - !11 = !{i32 2, !"Debug Info Version", i32 3} - !16 = !{i32 7, !"debug-info-assignment-tracking", i1 true} - !22 = distinct !DISubprogram(type: !23, unit: !2, keyInstructions: true) - !23 = !DISubroutineType(types: !24) - !24 = !{} - !28 = !DILocation(scope: !22, atomRank: 1) - !29 = distinct !DISubprogram(type: !30, unit: !2, keyInstructions: true) - !30 = !DISubroutineType(types: !31) - !31 = !{} - !38 = distinct !DILexicalBlock(scope: !29, line: 5, column: 3) - !43 = !DILocalVariable(name: "Idx", scope: !44, type: !7) - !44 = distinct !DILexicalBlock(scope: !38, line: 5, column: 3) - !46 = distinct !DILexicalBlock(scope: !44, line: 5, column: 27) - !52 = !DILocation(scope: !44) - !56 = !DILocation(scope: !46) - !57 = !DILocalVariable(name: "Idx2", scope: !44, type: !7) diff --git a/llvm/test/DebugInfo/X86/selectionDAG-load-sext.ll b/llvm/test/DebugInfo/X86/selectionDAG-load-sext.ll deleted file mode 100644 index 7e61780a6ab13..0000000000000 --- a/llvm/test/DebugInfo/X86/selectionDAG-load-sext.ll +++ /dev/null @@ -1,61 +0,0 @@ -; This test checks that after SelectionDAG runs, it preserves the debug info that is lost due to the DAGCombiner combining a load and a sext instruction, where the #dbg_value is pointing to the result of the load. 
-; RUN: llc %s -mtriple=x86_64-unkown-linux -start-before=x86-isel -stop-after=x86-isel -o - | FileCheck %s --check-prefix=MIR -; RUN: llc -O2 %s -start-before=x86-isel -mtriple=x86_64-unkown-linux --filetype=obj -o %t.o -; RUN: llvm-dwarfdump %t.o --name Idx | FileCheck %s --check-prefix=DUMP -; RUN: llvm-dwarfdump %t.o --name Idx2 | FileCheck %s --check-prefix=DUMP2 - -; MIR: ![[IDX:[0-9]+]] = !DILocalVariable(name: "Idx" -; MIR: ![[IDX2:[0-9]+]] = !DILocalVariable(name: "Idx2" -; MIR-LABEL: bb.0 -; MIR: %{{[0-9a-f]+}}{{.*}} = MOVSX64rm32 ${{.*}}, 1, $noreg, @GlobArr, $noreg, debug-instr-number [[INSTR_NUM:[0-9]+]] -; MIR-NEXT: DBG_INSTR_REF ![[IDX]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_convert, 32, DW_ATE_signed, DW_OP_LLVM_convert, 64, DW_ATE_signed, DW_OP_stack_value), dbg-instr-ref([[INSTR_NUM]], 0) -; MIR-NEXT: DBG_INSTR_REF ![[IDX2]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref([[INSTR_NUM]], 0) - -; DUMP: DW_AT_location (indexed ({{[0-9a-f]+}}x{{[0-9a-f]+}}) loclist = 0x{{[0-9a-f]+}}: -; DUMP-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): DW_OP_breg0 RAX+0, DW_OP_convert (0x{{[0-9a-f]+}}) "DW_ATE_signed_32", DW_OP_convert (0x{{[0-9a-f]+}}) "DW_ATE_signed_64", DW_OP_stack_value) - -; DUMP2: DW_AT_location (indexed ({{[0-9a-f]+}}x{{[0-9a-f]+}}) loclist = 0x{{[0-9a-f]+}}: -; DUMP2-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): DW_OP_reg0 RAX) - - - @GlobArr = dso_local local_unnamed_addr global [5 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5], align 16, !dbg !0 - @__const.main.Data = private unnamed_addr constant [7 x i32] [i32 10, i32 20, i32 30, i32 40, i32 50, i32 60, i32 70], align 16 - define dso_local void @_Z8useValuei(i32 noundef %0) local_unnamed_addr #0 !dbg !22 { - ret void, !dbg !28 - } - define dso_local noundef i32 @main() local_unnamed_addr #1 !dbg !29 { - %1 = load i32, ptr @GlobArr - #dbg_value(i32 %1, !43, !DIExpression(), !52) - %2 = sext i32 %1 to i64 - #dbg_value(i64 %2, !57, !DIExpression(), !52) - %3 = getelementptr inbounds i32, ptr @__const.main.Data, i64 %2 - %4 = load i32, ptr %3 - tail call void @_Z8useValuei(i32 noundef %4), !dbg !56 - ret i32 0 - } - !llvm.dbg.cu = !{!2} - !llvm.module.flags = !{!10, !11, !16} - !0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) - !1 = distinct !DIGlobalVariable(type: !6, isDefinition: true) - !2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, emissionKind: FullDebug, nameTableKind: None) - !3 = !DIFile(filename: "/tmp/test.cpp", directory: "/Users/srastogi/Development/llvm-project/build_ninja", checksumkind: CSK_MD5, checksum: "0fe735937e606b4db3e3b2e9253eff90") - !6 = !DICompositeType(tag: DW_TAG_array_type, elements: !8) - !7 = !DIBasicType() - !8 = !{} - !10 = !{i32 7, !"Dwarf Version", i32 5} - !11 = !{i32 2, !"Debug Info Version", i32 3} - !16 = !{i32 7, !"debug-info-assignment-tracking", i1 true} - !22 = distinct !DISubprogram(type: !23, unit: !2, keyInstructions: true) - !23 = !DISubroutineType(types: !24) - !24 = !{} - !28 = !DILocation(scope: !22, atomRank: 1) - !29 = distinct !DISubprogram(type: !30, unit: !2, keyInstructions: true) - !30 = !DISubroutineType(types: !31) - !31 = !{} - !38 = distinct !DILexicalBlock(scope: !29, line: 5, column: 3) - !43 = !DILocalVariable(name: "Idx", scope: !44, type: !7) - !44 = distinct !DILexicalBlock(scope: !38, line: 5, column: 3) - !46 = distinct !DILexicalBlock(scope: !44, line: 5, column: 27) - !52 = !DILocation(scope: !44) - !56 = !DILocation(scope: !46) - !57 = !DILocalVariable(name: "Idx2", scope: !44, type: !7) From 
ba73d60c43c1c2af5111669f5ec938f1c501fd4a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 10 Dec 2025 17:46:10 -0800 Subject: [PATCH 07/49] [RISCV] Use sew and vec_policy for Rivos vector instruction operands. (#171721) This enables MachineVerifier and MachineIR printing support for these operands. --- llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td index 3a6ce3ce1d469..39a7aeda94707 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td @@ -156,7 +156,7 @@ foreach m = MxList in { let BaseInstr = RI_VEXTRACT in def PseudoRI_VEXTRACT_ # mx : RISCVVPseudo<(outs GPR:$rd), - (ins m.vrclass:$rs2, uimm5:$idx, ixlenimm:$sew), + (ins m.vrclass:$rs2, uimm5:$idx, sew:$sew), []>; let HasVLOp = 1, BaseInstr = RI_VINSERT, HasVecPolicyOp = 1, @@ -164,7 +164,7 @@ foreach m = MxList in { def PseudoRI_VINSERT_ # mx : RISCVVPseudo<(outs m.vrclass:$rd), (ins m.vrclass:$rs1, GPR:$rs2, uimm5:$idx, AVL:$vl, - ixlenimm:$sew, ixlenimm:$policy), + sew:$sew, vec_policy:$policy), []>; } } From 76ae530407a42f50fa946ccaca5c1039fff5c7ca Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Thu, 11 Dec 2025 02:04:25 +0000 Subject: [PATCH 08/49] [PPC] XFAIL ppc/fixtfti_test.c and ppc/fixunstfti_test.c and track them under issue 171751 --- compiler-rt/test/builtins/Unit/ppc/fixtfti_test.c | 1 + compiler-rt/test/builtins/Unit/ppc/fixunstfti_test.c | 1 + 2 files changed, 2 insertions(+) diff --git a/compiler-rt/test/builtins/Unit/ppc/fixtfti_test.c b/compiler-rt/test/builtins/Unit/ppc/fixtfti_test.c index 8b86d7879f7e7..f0c9fcd30c07e 100644 --- a/compiler-rt/test/builtins/Unit/ppc/fixtfti_test.c +++ b/compiler-rt/test/builtins/Unit/ppc/fixtfti_test.c @@ -1,3 +1,4 @@ +// XFAIL: * // REQUIRES: target-is-powerpc64le // RUN: %clang_builtins %s %librt -o %t && %run %t diff --git a/compiler-rt/test/builtins/Unit/ppc/fixunstfti_test.c b/compiler-rt/test/builtins/Unit/ppc/fixunstfti_test.c index 0eee31db1b5dd..fe706986a345d 100644 --- a/compiler-rt/test/builtins/Unit/ppc/fixunstfti_test.c +++ b/compiler-rt/test/builtins/Unit/ppc/fixunstfti_test.c @@ -1,3 +1,4 @@ +// XFAIL: * // REQUIRES: target-is-powerpc64le // RUN: %clang_builtins %s %librt -o %t && %run %t From 1f07f7c75a987865a6b428c502983de436c8ca41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 10 Dec 2025 18:05:12 -0800 Subject: [PATCH 09/49] [flang][cuda] Add support for allocate with device source (#171743) Add support for allocate statement with a source that is a device variable. 
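For illustration, the pattern this enables looks like the new test added to
flang/test/Lower/CUDA/cuda-allocatable.cuf (a minimal sketch of the user-facing
Fortran, not of the lowering itself):

  attributes(global) subroutine from_device_source()
    real, device, allocatable :: a(:)
    real, allocatable :: b(:)
    ! The source= variable lives on the device, so lowering now emits
    ! cuf.allocate with the device_source attribute and the runtime copies
    ! the source data with MemmoveDeviceToHost instead of hitting a TODO.
    allocate(b, source=a)
  end subroutine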
--- flang-rt/lib/cuda/allocatable.cpp | 24 ++++++++++++------- flang-rt/lib/cuda/pointer.cpp | 14 ++++++----- .../flang/Optimizer/Dialect/CUF/CUFOps.td | 5 ++-- .../include/flang/Runtime/CUDA/allocatable.h | 6 +++-- flang/include/flang/Runtime/CUDA/pointer.h | 6 +++-- flang/lib/Lower/Allocatable.cpp | 13 +++++++--- .../CUDA/CUFAllocationConversion.cpp | 6 +++-- flang/test/Fir/CUDA/cuda-allocate.fir | 7 ++++-- .../CUDA/TODO/cuda-allocate-source-device.cuf | 9 ------- flang/test/Lower/CUDA/cuda-allocatable.cuf | 9 +++++++ 10 files changed, 63 insertions(+), 36 deletions(-) delete mode 100644 flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp index ff1a225d66ce9..662703dfb6321 100644 --- a/flang-rt/lib/cuda/allocatable.cpp +++ b/flang-rt/lib/cuda/allocatable.cpp @@ -57,26 +57,34 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream, int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { int stat{RTNAME(CUFAllocatableAllocate)( alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - alloc, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(alloc, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { - int stat{RTNAME(CUFAllocatableAllocateSync)( - alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { + int stat; + if (sourceIsDevice) { + stat = RTNAME(CUFAllocatableAllocate)( + alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine); + } else { + stat = RTNAME(CUFAllocatableAllocateSync)( + alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine); + } if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - alloc, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(alloc, source, terminator, + sourceIsDevice ? 
&MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp index d3f5cfe8e96a1..f07b1a9b60924 100644 --- a/flang-rt/lib/cuda/pointer.cpp +++ b/flang-rt/lib/cuda/pointer.cpp @@ -56,26 +56,28 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream, int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { int stat{RTNAME(CUFPointerAllocate)( pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - pointer, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(pointer, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { int stat{RTNAME(CUFPointerAllocateSync)( pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - pointer, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(pointer, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index 636879f28a2fb..34ac21c51b933 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -100,8 +100,9 @@ def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments, Optional:$stream, Arg, "", [MemWrite]>:$pinned, Arg, "", [MemRead]>:$source, - cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat, - UnitAttr:$hasDoubleDescriptor, UnitAttr:$pointer); + OptionalAttr:$data_attr, UnitAttr:$hasStat, + UnitAttr:$hasDoubleDescriptor, UnitAttr:$pointer, + UnitAttr:$device_source); let results = (outs AnyIntegerType:$stat); diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h index 6c97afa9e10e8..97f24bc34bfb8 100644 --- a/flang/include/flang/Runtime/CUDA/allocatable.h +++ b/flang/include/flang/Runtime/CUDA/allocatable.h @@ -34,14 +34,16 @@ int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr, int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); /// Perform allocation of the descriptor with synchronization of it when /// necessary. Assign data from source. 
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); /// Perform deallocation of the descriptor with synchronization of it when /// necessary. diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h index bdfc3268e0814..b845fd59114d4 100644 --- a/flang/include/flang/Runtime/CUDA/pointer.h +++ b/flang/include/flang/Runtime/CUDA/pointer.h @@ -34,14 +34,16 @@ int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr, int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); /// Perform allocation of the descriptor with synchronization of it when /// necessary. Assign data from source. int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); } // extern "C" diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index c9a9d935bd615..030439550cd15 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -629,9 +629,10 @@ class AllocateStmtHelper { unsigned allocatorIdx = Fortran::lower::getAllocatorIdx(alloc.getSymbol()); fir::ExtendedValue exv = isSource ? sourceExv : moldExv; + bool sourceIsDevice = false; if (const Fortran::semantics::Symbol *sym{GetLastSymbol(sourceExpr)}) if (Fortran::semantics::IsCUDADevice(*sym)) - TODO(loc, "CUDA Fortran: allocate with device source"); + sourceIsDevice = true; // Generate a sequence of runtime calls. errorManager.genStatCheck(builder, loc); @@ -651,7 +652,7 @@ class AllocateStmtHelper { genSetDeferredLengthParameters(alloc, box); genAllocateObjectBounds(alloc, box); mlir::Value stat; - if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) { + if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()) || sourceIsDevice) { stat = genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol()); } else { @@ -798,13 +799,19 @@ class AllocateStmtHelper { // Keep return type the same as a standard AllocatableAllocate call. mlir::Type retTy = fir::runtime::getModel()(builder.getContext()); + bool isSourceDevice = false; + if (const Fortran::semantics::Symbol *sym{GetLastSymbol(sourceExpr)}) + if (Fortran::semantics::IsCUDADevice(*sym)) + isSourceDevice = true; + bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr()); return cuf::AllocateOp::create( builder, loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr, errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr, doubleDescriptors ? builder.getUnitAttr() : nullptr, - box.isPointer() ? builder.getUnitAttr() : nullptr) + box.isPointer() ? builder.getUnitAttr() : nullptr, + isSourceDevice ? 
builder.getUnitAttr() : nullptr) .getResult(); } diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp index 6579c2362cd87..4444fc61239ea 100644 --- a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp @@ -99,7 +99,6 @@ static mlir::LogicalResult convertOpToCall(OpTy op, mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true) : builder.createBool(loc, false); - mlir::Value errmsg; if (op.getErrmsg()) { errmsg = op.getErrmsg(); @@ -116,12 +115,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op, loc, fir::ReferenceType::get( mlir::IntegerType::get(op.getContext(), 1))); if (op.getSource()) { + mlir::Value isDeviceSource = op.getDeviceSource() + ? builder.createBool(loc, true) + : builder.createBool(loc, false); mlir::Value stream = op.getStream() ? op.getStream() : builder.createNullConstant(loc, fTy.getInput(2)); args = fir::runtime::createArguments( builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned, - hasStat, errmsg, sourceFile, sourceLine); + hasStat, errmsg, sourceFile, sourceLine, isDeviceSource); } else { mlir::Value stream = op.getStream() ? op.getStream() diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 9d0d181609ada..5184561a03e67 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -128,11 +128,14 @@ func.func @_QPallocate_source() { %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"} + %devsource = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"} %4 = fir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref>>>) -> !fir.ref>>> %5 = cuf.alloc !fir.box>> {bindc_name = "a_d", data_attr = #cuf.cuda, uniq_name = "_QFallocate_sourceEa_d"} -> !fir.ref>>> %7 = fir.declare %5 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref>>>) -> !fir.ref>>> %8 = fir.load %4 : !fir.ref>>> %22 = cuf.allocate %7 : !fir.ref>>> source(%8 : !fir.box>>) {data_attr = #cuf.cuda} -> i32 + %9 = fir.load %devsource : !fir.ref>>> + %23 = cuf.allocate %7 : !fir.ref>>> source(%9 : !fir.box>>) {device_source} -> i32 return } @@ -142,8 +145,8 @@ func.func @_QPallocate_source() { // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref>>> // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box>>) -> !fir.box -// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 - +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, !fir.ref, i1, !fir.box, !fir.ref, i32, i1) -> i32 +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %true{{.*}}) fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda} : !fir.box>> { %c0 = arith.constant 0 : index diff --git a/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf 
b/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf deleted file mode 100644 index 3e59e2f01119e..0000000000000 --- a/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf +++ /dev/null @@ -1,9 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fcuda -o - %s 2>&1 | FileCheck %s - -program main - implicit none - integer, device, allocatable :: a_d(:) - integer, allocatable :: a(:) -! CHECK: not yet implemented: CUDA Fortran: allocate with device source - allocate(a, source=a_d) -end program diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf index 43e716532ecca..52303d126b8dc 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -261,3 +261,12 @@ end subroutine ! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda, hasDoubleDescriptor} -> i32 ! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda, hasDoubleDescriptor} -> i32 ! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 + +attributes(global) subroutine from_device_source() + real, device, allocatable :: a(:) + real, allocatable :: b(:) + allocate(b, source=a) +end subroutine + +! CHECK-LABEL: func.func @_QPfrom_device_source() +! CHECK: cuf.allocate{{.*}}device_source From 3fdce799cc184bf1b7c60a6845026df6c6e7630b Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Wed, 10 Dec 2025 21:27:21 -0500 Subject: [PATCH 10/49] [libunwind] fix building on Haiku i386 (#171586) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérôme Duval --- libunwind/src/UnwindCursor.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index 33fcd841b2ab0..afa0cae790377 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -41,7 +41,8 @@ #define _LIBUNWIND_CHECK_LINUX_SIGRETURN 1 #endif -#if defined(_LIBUNWIND_TARGET_HAIKU) && defined(_LIBUNWIND_TARGET_X86_64) +#if defined(_LIBUNWIND_TARGET_HAIKU) && \ + (defined(_LIBUNWIND_TARGET_I386) || defined(_LIBUNWIND_TARGET_X86_64)) #include #include #define _LIBUNWIND_CHECK_HAIKU_SIGRETURN 1 @@ -1366,7 +1367,7 @@ class UnwindCursor : public AbstractUnwindCursor{ bool _unwindInfoMissing; bool _isSignalFrame; #if defined(_LIBUNWIND_CHECK_LINUX_SIGRETURN) || \ - defined(_LIBUNWIND_TARGET_HAIKU) + defined(_LIBUNWIND_CHECK_HAIKU_SIGRETURN) bool _isSigReturn = false; #endif #ifdef _LIBUNWIND_TRACE_RET_INJECT From 2614af08c1875e7928c6036237e434943730631a Mon Sep 17 00:00:00 2001 From: mitchell Date: Thu, 11 Dec 2025 11:12:47 +0800 Subject: [PATCH 11/49] [Tooling] Fix misleading progress report when files have multiple compile commands (#169640) This patch fixes an issue in progress reporting where the processed item counter could exceed the total item count, leading to confusing outputs like [22/18]. 
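With this change, a file that has several compile commands gets a secondary
(command/total-commands) counter next to the per-file counter, so the file
counter can no longer overrun the total. Roughly the output shape the new
unit tests expect (paths here are illustrative):

  [1/3] Processing file /tmp/test1.cpp.
  [2/3] (1/2) Processing file /tmp/test2.cpp.
  [2/3] (2/2) Processing file /tmp/test2.cpp.
  [3/3] Processing file /tmp/test3.cpp.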
Closes [#169168](https://github.com/llvm/llvm-project/issues/169168) --- clang/lib/Tooling/Tooling.cpp | 31 +++--- clang/unittests/Tooling/ToolingTest.cpp | 133 ++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 12 deletions(-) diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index 1d55f615de8a9..46b2cc1ac99c1 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -97,7 +97,7 @@ static bool ignoreExtraCC1Commands(const driver::Compilation *Compilation) { OffloadCompilation = true; if (Jobs.size() > 1) { - for (auto *A : Actions){ + for (auto *A : Actions) { // On MacOSX real actions may end up being wrapped in BindArchAction if (isa(A)) A = *A->input_begin(); @@ -414,8 +414,8 @@ bool ToolInvocation::run() { Driver->BuildCompilation(llvm::ArrayRef(Argv))); if (!Compilation) return false; - const llvm::opt::ArgStringList *const CC1Args = getCC1Arguments( - &*Diagnostics, Compilation.get()); + const llvm::opt::ArgStringList *const CC1Args = + getCC1Arguments(&*Diagnostics, Compilation.get()); if (!CC1Args) return false; std::unique_ptr Invocation( @@ -498,9 +498,7 @@ void ClangTool::appendArgumentsAdjuster(ArgumentsAdjuster Adjuster) { ArgsAdjuster = combineAdjusters(std::move(ArgsAdjuster), std::move(Adjuster)); } -void ClangTool::clearArgumentsAdjusters() { - ArgsAdjuster = nullptr; -} +void ClangTool::clearArgumentsAdjusters() { ArgsAdjuster = nullptr; } static void injectResourceDir(CommandLineArguments &Args, const char *Argv0, void *MainAddr) { @@ -555,8 +553,9 @@ int ClangTool::run(ToolAction *Action) { } size_t NumOfTotalFiles = AbsolutePaths.size(); - unsigned ProcessedFileCounter = 0; + unsigned CurrentFileIndex = 0; for (llvm::StringRef File : AbsolutePaths) { + ++CurrentFileIndex; // Currently implementations of CompilationDatabase::getCompileCommands can // change the state of the file system (e.g. prepare generated headers), so // this method needs to run right before we invoke the tool, as the next @@ -571,6 +570,7 @@ int ClangTool::run(ToolAction *Action) { FileSkipped = true; continue; } + unsigned CurrentCommandIndexForFile = 0; for (CompileCommand &CompileCommand : CompileCommandsForFile) { // If the 'directory' field of the compilation database is empty, display // an error and use the working directory instead. @@ -617,13 +617,20 @@ int ClangTool::run(ToolAction *Action) { // pass in made-up names here. Make sure this works on other platforms. injectResourceDir(CommandLine, "clang_tool", &StaticSymbol); + ++CurrentCommandIndexForFile; + // FIXME: We need a callback mechanism for the tool writer to output a // customized message for each file. 
- if (NumOfTotalFiles > 1) - llvm::errs() << "[" + std::to_string(++ProcessedFileCounter) + "/" + - std::to_string(NumOfTotalFiles) + - "] Processing file " + File - << ".\n"; + if (NumOfTotalFiles > 1 || CompileCommandsForFile.size() > 1) { + llvm::errs() << "[" << std::to_string(CurrentFileIndex) << "/" + << std::to_string(NumOfTotalFiles) << "]"; + if (CompileCommandsForFile.size() > 1) { + llvm::errs() << " (" << std::to_string(CurrentCommandIndexForFile) + << "/" << std::to_string(CompileCommandsForFile.size()) + << ")"; + } + llvm::errs() << " Processing file " << File << ".\n"; + } ToolInvocation Invocation(std::move(CommandLine), Action, Files.get(), PCHContainerOps); Invocation.setDiagnosticConsumer(DiagConsumer); diff --git a/clang/unittests/Tooling/ToolingTest.cpp b/clang/unittests/Tooling/ToolingTest.cpp index 25e1d67eb2294..9a7559405c43c 100644 --- a/clang/unittests/Tooling/ToolingTest.cpp +++ b/clang/unittests/Tooling/ToolingTest.cpp @@ -20,8 +20,10 @@ #include "clang/Testing/CommandLineArgs.h" #include "clang/Tooling/ArgumentsAdjusters.h" #include "clang/Tooling/CompilationDatabase.h" +#include "clang/Tooling/JSONCompilationDatabase.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/JSON.h" #include "llvm/Support/Path.h" #include "llvm/Support/TargetSelect.h" #include "llvm/TargetParser/Host.h" @@ -1034,5 +1036,136 @@ TEST(runToolOnCode, TestResetDiagnostics) { "void func() { long x; Foo f(x); }")); } +namespace { +struct TestCommand { + llvm::StringRef File; + llvm::StringRef Command; +}; + +std::string runToolWithProgress(llvm::ArrayRef Commands, + llvm::StringRef BaseDir) { + std::string ErrorMessage; + + llvm::json::Array Entries; + for (const auto &Cmd : Commands) { + Entries.push_back(llvm::json::Object{ + {"directory", BaseDir}, {"command", Cmd.Command}, {"file", Cmd.File}}); + } + std::string DatabaseContent; + llvm::raw_string_ostream OS(DatabaseContent); + OS << llvm::json::Value(std::move(Entries)); + + std::unique_ptr Database( + JSONCompilationDatabase::loadFromBuffer(DatabaseContent, ErrorMessage, + JSONCommandLineSyntax::Gnu)); + if (!Database) { + ADD_FAILURE() << "Failed to load compilation database: " << ErrorMessage; + return ""; + } + + std::vector AbsoluteFiles; + for (const auto &Cmd : Commands) { + SmallString<32> NativeFile(BaseDir); + llvm::sys::path::append(NativeFile, Cmd.File); + llvm::sys::path::native(NativeFile); + std::string AbsPath = std::string(NativeFile); + if (AbsoluteFiles.empty() || AbsoluteFiles.back() != AbsPath) { + AbsoluteFiles.push_back(AbsPath); + } + } + + ClangTool Tool(*Database, AbsoluteFiles); + for (const auto &F : AbsoluteFiles) { + Tool.mapVirtualFile(F, "int x;"); + } + + testing::internal::CaptureStderr(); + Tool.run(newFrontendActionFactory().get()); + return testing::internal::GetCapturedStderr(); +} +} // namespace + +TEST(ClangToolTest, ProgressReportSingleFile) { + SmallString<32> BaseDir; + llvm::sys::path::system_temp_directory(false, BaseDir); + llvm::sys::path::native(BaseDir, llvm::sys::path::Style::posix); + + EXPECT_TRUE( + runToolWithProgress({{"test.cpp", "clang++ -c test.cpp"}}, BaseDir) + .empty()); +} + +TEST(ClangToolTest, ProgressReportMultipleFiles) { + SmallString<32> BaseDir; + llvm::sys::path::system_temp_directory(false, BaseDir); + llvm::sys::path::native(BaseDir, llvm::sys::path::Style::posix); + + std::string Output = + runToolWithProgress({{"test1.cpp", "clang++ -c test1.cpp"}, + {"test2.cpp", "clang++ -c test2.cpp"}}, + BaseDir); + + SmallString<32> 
NativeFile1(BaseDir); + llvm::sys::path::append(NativeFile1, "test1.cpp"); + llvm::sys::path::native(NativeFile1); + SmallString<32> NativeFile2(BaseDir); + llvm::sys::path::append(NativeFile2, "test2.cpp"); + llvm::sys::path::native(NativeFile2); + + std::string Expected = "[1/2] Processing file " + std::string(NativeFile1) + + ".\n" + "[2/2] Processing file " + + std::string(NativeFile2) + ".\n"; + EXPECT_EQ(Output, Expected); +} + +TEST(ClangToolTest, ProgressReportMultipleCommands) { + SmallString<32> BaseDir; + llvm::sys::path::system_temp_directory(false, BaseDir); + llvm::sys::path::native(BaseDir, llvm::sys::path::Style::posix); + + std::string Output = + runToolWithProgress({{"test.cpp", "clang++ -c test.cpp -DCMD1"}, + {"test.cpp", "clang++ -c test.cpp -DCMD2"}}, + BaseDir); + + SmallString<32> NativeFile(BaseDir); + llvm::sys::path::append(NativeFile, "test.cpp"); + llvm::sys::path::native(NativeFile); + std::string Expected = + "[1/1] (1/2) Processing file " + std::string(NativeFile) + ".\n" + + "[1/1] (2/2) Processing file " + std::string(NativeFile) + ".\n"; + EXPECT_EQ(Output, Expected); +} + +TEST(ClangToolTest, ProgressReportMixed) { + SmallString<32> BaseDir; + llvm::sys::path::system_temp_directory(false, BaseDir); + llvm::sys::path::native(BaseDir, llvm::sys::path::Style::posix); + + std::string Output = + runToolWithProgress({{"test1.cpp", "clang++ -c test1.cpp"}, + {"test2.cpp", "clang++ -c test2.cpp -DCMD1"}, + {"test2.cpp", "clang++ -c test2.cpp -DCMD2"}, + {"test3.cpp", "clang++ -c test3.cpp"}}, + BaseDir); + + SmallString<32> NativeFile1(BaseDir); + llvm::sys::path::append(NativeFile1, "test1.cpp"); + llvm::sys::path::native(NativeFile1); + SmallString<32> NativeFile2(BaseDir); + llvm::sys::path::append(NativeFile2, "test2.cpp"); + llvm::sys::path::native(NativeFile2); + SmallString<32> NativeFile3(BaseDir); + llvm::sys::path::append(NativeFile3, "test3.cpp"); + llvm::sys::path::native(NativeFile3); + + std::string Expected = + "[1/3] Processing file " + std::string(NativeFile1) + ".\n" + + "[2/3] (1/2) Processing file " + std::string(NativeFile2) + ".\n" + + "[2/3] (2/2) Processing file " + std::string(NativeFile2) + ".\n" + + "[3/3] Processing file " + std::string(NativeFile3) + ".\n"; + EXPECT_EQ(Output, Expected); +} + } // end namespace tooling } // end namespace clang From fa9247526a4f8f3cdb946f7ed9a39b4e6691a1aa Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 10 Dec 2025 19:44:22 -0800 Subject: [PATCH 12/49] [RISCV] Add an OperandType for ordering for atomic pseudos. (#171744) --- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 2 ++ llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 3 +++ llvm/lib/Target/RISCV/RISCVInstrInfoA.td | 20 +++++++++++++------ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 40b46f503ca53..74066c86d6ebe 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -433,6 +433,8 @@ enum OperandType : unsigned { OPERAND_RTZARG, // Condition code used by select and short forward branch pseudos. OPERAND_COND_CODE, + // Ordering for atomic pseudos. + OPERAND_ATOMIC_ORDERING, // Vector policy operand. OPERAND_VEC_POLICY, // Vector SEW operand. Stores in log2(SEW). 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index ce5a67bd23a9a..76dc57c45fb0b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3025,6 +3025,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_COND_CODE: Ok = Imm >= 0 && Imm < RISCVCC::COND_INVALID; break; + case RISCVOp::OPERAND_ATOMIC_ORDERING: + Ok = isValidAtomicOrdering(Imm); + break; case RISCVOp::OPERAND_VEC_POLICY: Ok = (Imm & (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC)) == Imm; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 5c81a0990a64f..44798b63376f7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -11,6 +11,14 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. +//===----------------------------------------------------------------------===// + +def ordering : RISCVOp { + let OperandType = "OPERAND_ATOMIC_ORDERING"; +} + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -244,7 +252,7 @@ defm : AMOPat<"atomic_load_umin_i64", "AMOMINU_D", i64, [IsRV64]>; /// Pseudo AMOs class PseudoAMO : Pseudo<(outs GPR:$res, GPR:$scratch), - (ins GPR:$addr, GPR:$incr, ixlenimm:$ordering), []> { + (ins GPR:$addr, GPR:$incr, ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; @@ -253,7 +261,7 @@ class PseudoAMO : Pseudo<(outs GPR:$res, GPR:$scratch), class PseudoMaskedAMO : Pseudo<(outs GPR:$res, GPR:$scratch), - (ins GPR:$addr, GPR:$incr, GPR:$mask, ixlenimm:$ordering), []> { + (ins GPR:$addr, GPR:$incr, GPR:$mask, ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; @@ -263,7 +271,7 @@ class PseudoMaskedAMO class PseudoMaskedAMOMinMax : Pseudo<(outs GPR:$res, GPR:$scratch1, GPR:$scratch2), (ins GPR:$addr, GPR:$incr, GPR:$mask, ixlenimm:$sextshamt, - ixlenimm:$ordering), []> { + ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch1," "@earlyclobber $scratch2"; let mayLoad = 1; @@ -273,7 +281,7 @@ class PseudoMaskedAMOMinMax class PseudoMaskedAMOUMinUMax : Pseudo<(outs GPR:$res, GPR:$scratch1, GPR:$scratch2), - (ins GPR:$addr, GPR:$incr, GPR:$mask, ixlenimm:$ordering), []> { + (ins GPR:$addr, GPR:$incr, GPR:$mask, ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch1," "@earlyclobber $scratch2"; let mayLoad = 1; @@ -419,7 +427,7 @@ defm : PseudoAMOPat<"atomic_load_nand_i64", PseudoAtomicLoadNand64, i64>; class PseudoCmpXchg : Pseudo<(outs GPR:$res, GPR:$scratch), - (ins GPR:$addr, GPR:$cmpval, GPR:$newval, ixlenimm:$ordering), []> { + (ins GPR:$addr, GPR:$cmpval, GPR:$newval, ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; @@ -457,7 +465,7 @@ let Predicates = [HasStdExtZalrsc] in { def PseudoMaskedCmpXchg32 : Pseudo<(outs GPR:$res, GPR:$scratch), (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, - ixlenimm:$ordering), []> { + ordering:$ordering), []> { let Constraints = "@earlyclobber 
$res,@earlyclobber $scratch"; let mayLoad = 1; let mayStore = 1; From 18b61373d85a279ac08c8b77fedfc1cf991b3e4c Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Thu, 11 Dec 2025 03:50:04 +0000 Subject: [PATCH 13/49] [NFC] isOSGlibc: musl is not glibc. (#171734) Previously, `isOSGlibc()` was returning true for musl triples as well. This commit changes `isOSGlibc()` to return false for musl triples, and updates all existing `isOSGlibc()` checks to call `isOSGlibc() || isMusl()`, in order to preserve existing behaviour. --- clang/lib/Basic/Targets/PPC.h | 4 ++-- llvm/include/llvm/TargetParser/Triple.h | 2 +- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 2 +- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 3 ++- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 4 ++-- llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 4 ++-- llvm/lib/Target/X86/X86Subtarget.h | 1 + 7 files changed, 11 insertions(+), 9 deletions(-) diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 8313826d88500..664c9e15d8d18 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -368,7 +368,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { bool supportsCpuSupports() const override { llvm::Triple Triple = getTriple(); // AIX 7.2 is the minimum requirement to support __builtin_cpu_supports(). - return Triple.isOSGlibc() || + return Triple.isOSGlibc() || Triple.isMusl() || (Triple.isOSAIX() && !Triple.isOSVersionLT(MINIMUM_AIX_OS_MAJOR, MINIMUM_AIX_OS_MINOR)); } @@ -376,7 +376,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { bool supportsCpuIs() const override { llvm::Triple Triple = getTriple(); // AIX 7.2 is the minimum requirement to support __builtin_cpu_is(). - return Triple.isOSGlibc() || + return Triple.isOSGlibc() || Triple.isMusl() || (Triple.isOSAIX() && !Triple.isOSVersionLT(MINIMUM_AIX_OS_MAJOR, MINIMUM_AIX_OS_MINOR)); } diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 11b76cd183108..9480e7b36dc2c 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -769,7 +769,7 @@ class Triple { bool isOSGlibc() const { return (getOS() == Triple::Linux || getOS() == Triple::KFreeBSD || getOS() == Triple::Hurd) && - !isAndroid(); + !isAndroid() && !isMusl(); } /// Tests whether the OS is AIX. diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 28a52ab8b1ae6..87256352faccd 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -2423,7 +2423,7 @@ static bool targetSupportsPAuthRelocation(const Triple &TT, const MCExpr *Target, const MCExpr *DSExpr) { // No released version of glibc supports PAuth relocations. 
- if (TT.isOSGlibc()) + if (TT.isOSGlibc() || TT.isMusl()) return false; // We emit PAuth constants as IRELATIVE relocations in cases where the diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 366a7b6d0135a..99bef417eaa89 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -3180,7 +3180,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case PPC::PPCLdFixedAddr: { - assert(Subtarget.getTargetTriple().isOSGlibc() && + assert((Subtarget.getTargetTriple().isOSGlibc() || + Subtarget.getTargetTriple().isMusl()) && "Only targets with Glibc expected to contain PPCLdFixedAddr"); int64_t Offset = 0; const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index e7903a72d85bb..9791c1999086b 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1875,8 +1875,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, // For more information see http://people.redhat.com/drepper/tls.pdf if (isNullConstant(Address) && AM.Segment.getNode() == nullptr && !IndirectTlsSegRefs && - (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || - Subtarget->isTargetFuchsia())) { + (Subtarget->isTargetGlibc() || Subtarget->isTargetMusl() || + Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) { if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) return true; switch (N->getPointerInfo().getAddrSpace()) { diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 8db3e501f9b7e..ae9d0a162011f 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -546,8 +546,8 @@ unsigned X86TargetLowering::getAddressSpace() const { } static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { - return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || - TargetTriple.isAndroid(); + return TargetTriple.isOSGlibc() || TargetTriple.isMusl() || + TargetTriple.isOSFuchsia() || TargetTriple.isAndroid(); } static Constant* SegmentOffset(IRBuilderBase &IRB, diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 868f41375b96b..3b920bc4ef7c1 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -293,6 +293,7 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); } bool isTargetHurd() const { return TargetTriple.isOSHurd(); } bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); } + bool isTargetMusl() const { return TargetTriple.isMusl(); } bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } From e795b8bdea598fb76f50519ae7d164eff8d1efb7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 10 Dec 2025 20:43:46 -0800 Subject: [PATCH 14/49] [RISCV] Use GPR instead of ixlenimm for sextshamt in PseudoMaskedAMOMinMax. NFC (#171736) This operand is always a register. 
--- llvm/lib/Target/RISCV/RISCVInstrInfoA.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index 44798b63376f7..f5fd9acd8b303 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -270,8 +270,8 @@ class PseudoMaskedAMO class PseudoMaskedAMOMinMax : Pseudo<(outs GPR:$res, GPR:$scratch1, GPR:$scratch2), - (ins GPR:$addr, GPR:$incr, GPR:$mask, ixlenimm:$sextshamt, - ordering:$ordering), []> { + (ins GPR:$addr, GPR:$incr, GPR:$mask, GPR:$sextshamt, + ordering:$ordering), []> { let Constraints = "@earlyclobber $res,@earlyclobber $scratch1," "@earlyclobber $scratch2"; let mayLoad = 1; From 3b04094f36bf224d499e6a289b07ae193937e977 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 10 Dec 2025 20:47:31 -0800 Subject: [PATCH 15/49] [RISCV] Add Xsfmm vlte and vste intrinsics to getTgtMemIntrinsics. (#171747) Replace dyn_cast with cast. The dyn_cast can never fail now. Previously it never succeeded. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 4 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 54 +++++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 8bfdbef39708a..b6b716be35c3e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2608,8 +2608,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { MachineSDNode *TileLoad = CurDAG->getMachineNode(PseudoInst, DL, Node->getVTList(), Operands); - if (auto *MemOp = dyn_cast(Node)) - CurDAG->setNodeMemRefs(TileLoad, {MemOp->getMemOperand()}); + CurDAG->setNodeMemRefs(TileLoad, + {cast(Node)->getMemOperand()}); ReplaceNode(Node, TileLoad); return; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 29fc2ddb818b5..a9819c65c2170 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2196,6 +2196,60 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 5, /*IsStore*/ true, /*IsUnitStrided*/ false); + case Intrinsic::riscv_sf_vlte8: + case Intrinsic::riscv_sf_vlte16: + case Intrinsic::riscv_sf_vlte32: + case Intrinsic::riscv_sf_vlte64: + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = I.getArgOperand(1); + switch (Intrinsic) { + case Intrinsic::riscv_sf_vlte8: + Info.memVT = MVT::i8; + Info.align = Align(1); + break; + case Intrinsic::riscv_sf_vlte16: + Info.memVT = MVT::i16; + Info.align = Align(2); + break; + case Intrinsic::riscv_sf_vlte32: + Info.memVT = MVT::i32; + Info.align = Align(4); + break; + case Intrinsic::riscv_sf_vlte64: + Info.memVT = MVT::i64; + Info.align = Align(8); + break; + } + Info.size = MemoryLocation::UnknownSize; + Info.flags |= MachineMemOperand::MOLoad; + return true; + case Intrinsic::riscv_sf_vste8: + case Intrinsic::riscv_sf_vste16: + case Intrinsic::riscv_sf_vste32: + case Intrinsic::riscv_sf_vste64: + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = I.getArgOperand(1); + switch (Intrinsic) { + case Intrinsic::riscv_sf_vste8: + Info.memVT = MVT::i8; + Info.align = Align(1); + break; + case Intrinsic::riscv_sf_vste16: + Info.memVT = MVT::i16; + Info.align = Align(2); + break; + case Intrinsic::riscv_sf_vste32: + Info.memVT = MVT::i32; + Info.align = Align(4); + break; + case Intrinsic::riscv_sf_vste64: + 
Info.memVT = MVT::i64; + Info.align = Align(8); + break; + } + Info.size = MemoryLocation::UnknownSize; + Info.flags |= MachineMemOperand::MOStore; + return true; } } From 71bfdd13040328bc83b520d09eee847fd2b7f82c Mon Sep 17 00:00:00 2001 From: Sirraide Date: Thu, 11 Dec 2025 05:54:09 +0100 Subject: [PATCH 16/49] [Clang] Add support for the C `_Defer` TS (#162848) This implements WG14 N3734 (https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3734.pdf), aka `_Defer`; it is currently only supported in C if `-fdefer-ts` is passed. --- .../clang-tidy/bugprone/BranchCloneCheck.cpp | 6 + clang/docs/ReleaseNotes.rst | 5 + clang/include/clang/AST/RecursiveASTVisitor.h | 1 + clang/include/clang/AST/Stmt.h | 52 ++ .../clang/Basic/DiagnosticParseKinds.td | 2 + .../clang/Basic/DiagnosticSemaKinds.td | 16 + clang/include/clang/Basic/IdentifierTable.h | 3 +- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Basic/StmtNodes.td | 1 + clang/include/clang/Basic/TokenKinds.def | 4 + clang/include/clang/Options/Options.td | 8 + clang/include/clang/Parse/Parser.h | 10 + clang/include/clang/Sema/Sema.h | 8 + .../include/clang/Serialization/ASTBitCodes.h | 1 + clang/lib/AST/Stmt.cpp | 16 + clang/lib/AST/StmtPrinter.cpp | 5 + clang/lib/AST/StmtProfile.cpp | 2 + clang/lib/Basic/IdentifierTable.cpp | 2 + clang/lib/CodeGen/CGStmt.cpp | 85 +++ clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/lib/Driver/ToolChains/Clang.cpp | 4 + clang/lib/Frontend/InitPreprocessor.cpp | 5 + clang/lib/Headers/stddefer.h | 19 + clang/lib/Parse/ParseStmt.cpp | 26 + clang/lib/Sema/JumpDiagnostics.cpp | 27 +- clang/lib/Sema/SemaExceptionSpec.cpp | 1 + clang/lib/Sema/SemaExpr.cpp | 28 + clang/lib/Sema/SemaStmt.cpp | 45 +- clang/lib/Sema/TreeTransform.h | 8 + clang/lib/Serialization/ASTReaderStmt.cpp | 10 + clang/lib/Serialization/ASTWriterStmt.cpp | 7 + clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 1 + clang/test/AST/ast-dump-defer-ts.c | 27 + clang/test/AST/ast-print-defer-ts.c | 33 + clang/test/CodeGen/defer-ts-musttail.c | 7 + clang/test/CodeGen/defer-ts-nested-cleanups.c | 179 +++++ clang/test/CodeGen/defer-ts-seh.c | 44 ++ clang/test/CodeGen/defer-ts.c | 652 ++++++++++++++++++ clang/test/Lexer/defer-keyword.cpp | 5 + clang/test/Parser/defer-ts.c | 58 ++ clang/test/Parser/defer-ts.cpp | 5 + clang/test/Preprocessor/defer-ts.c | 9 + clang/test/Sema/defer-ts-seh.c | 17 + clang/test/Sema/defer-ts-sjlj.c | 52 ++ clang/test/Sema/defer-ts.c | 172 +++++ clang/tools/libclang/CXCursor.cpp | 5 + 46 files changed, 1667 insertions(+), 8 deletions(-) create mode 100644 clang/lib/Headers/stddefer.h create mode 100644 clang/test/AST/ast-dump-defer-ts.c create mode 100644 clang/test/AST/ast-print-defer-ts.c create mode 100644 clang/test/CodeGen/defer-ts-musttail.c create mode 100644 clang/test/CodeGen/defer-ts-nested-cleanups.c create mode 100644 clang/test/CodeGen/defer-ts-seh.c create mode 100644 clang/test/CodeGen/defer-ts.c create mode 100644 clang/test/Lexer/defer-keyword.cpp create mode 100644 clang/test/Parser/defer-ts.c create mode 100644 clang/test/Parser/defer-ts.cpp create mode 100644 clang/test/Preprocessor/defer-ts.c create mode 100644 clang/test/Sema/defer-ts-seh.c create mode 100644 clang/test/Sema/defer-ts-sjlj.c create mode 100644 clang/test/Sema/defer-ts.c diff --git a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp index 4f33670a8500a..6618341296aaf 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp @@ -237,6 +237,12 @@ static bool isIdenticalStmt(const ASTContext &Ctx, const Stmt *Stmt1, return false; return true; } + case Stmt::DeferStmtClass: { + const auto *DefStmt1 = cast(Stmt1); + const auto *DefStmt2 = cast(Stmt2); + return isIdenticalStmt(Ctx, DefStmt1->getBody(), DefStmt2->getBody(), + IgnoreSideEffects); + } case Stmt::CompoundStmtClass: { const auto *CompStmt1 = cast(Stmt1); const auto *CompStmt2 = cast(Stmt2); diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 005e858821804..899a4ee0dee0e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -208,6 +208,11 @@ Resolutions to C++ Defect Reports C Language Changes ------------------ +- Implemented the ``defer`` draft Technical Specification + (`WG14 N3734 `_); it is enabled in C mode by + passing ``-fdefer-ts``. Note, the details of this feature are subject to change given that the Technical + Specification is not yet ratified. + C2y Feature Support ^^^^^^^^^^^^^^^^^^^ - No longer triggering ``-Wstatic-in-inline`` in C2y mode; use of a static diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 8f427427d71ed..c3ac310bf5402 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2561,6 +2561,7 @@ DEF_TRAVERSE_STMT(DefaultStmt, {}) DEF_TRAVERSE_STMT(DoStmt, {}) DEF_TRAVERSE_STMT(ForStmt, {}) DEF_TRAVERSE_STMT(GotoStmt, {}) +DEF_TRAVERSE_STMT(DeferStmt, {}) DEF_TRAVERSE_STMT(IfStmt, {}) DEF_TRAVERSE_STMT(IndirectGotoStmt, {}) DEF_TRAVERSE_STMT(LabelStmt, {}) diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h index e1cca34d2212c..d56de08eaf279 100644 --- a/clang/include/clang/AST/Stmt.h +++ b/clang/include/clang/AST/Stmt.h @@ -317,6 +317,16 @@ class alignas(void *) Stmt { SourceLocation KeywordLoc; }; + class DeferStmtBitfields { + friend class DeferStmt; + + LLVM_PREFERRED_TYPE(StmtBitfields) + unsigned : NumStmtBits; + + /// The location of the "defer". + SourceLocation DeferLoc; + }; + //===--- Expression bitfields classes ---===// class ExprBitfields { @@ -1318,6 +1328,7 @@ class alignas(void *) Stmt { LoopControlStmtBitfields LoopControlStmtBits; ReturnStmtBitfields ReturnStmtBits; SwitchCaseBitfields SwitchCaseBits; + DeferStmtBitfields DeferStmtBits; // Expressions ExprBitfields ExprBits; @@ -3211,6 +3222,47 @@ class ReturnStmt final } }; +/// DeferStmt - This represents a deferred statement. +class DeferStmt : public Stmt { + friend class ASTStmtReader; + + /// The deferred statement. 
+ Stmt *Body; + + DeferStmt(EmptyShell Empty); + DeferStmt(SourceLocation DeferLoc, Stmt *Body); + +public: + static DeferStmt *CreateEmpty(ASTContext &Context, EmptyShell Empty); + static DeferStmt *Create(ASTContext &Context, SourceLocation DeferLoc, + Stmt *Body); + + SourceLocation getDeferLoc() const { return DeferStmtBits.DeferLoc; } + void setDeferLoc(SourceLocation DeferLoc) { + DeferStmtBits.DeferLoc = DeferLoc; + } + + Stmt *getBody() { return Body; } + const Stmt *getBody() const { return Body; } + void setBody(Stmt *S) { + assert(S && "defer body must not be null"); + Body = S; + } + + SourceLocation getBeginLoc() const { return getDeferLoc(); } + SourceLocation getEndLoc() const { return Body->getEndLoc(); } + + child_range children() { return child_range(&Body, &Body + 1); } + + const_child_range children() const { + return const_child_range(&Body, &Body + 1); + } + + static bool classof(const Stmt *S) { + return S->getStmtClass() == DeferStmtClass; + } +}; + /// AsmStmt is the base class for GCCAsmStmt and MSAsmStmt. class AsmStmt : public Stmt { protected: diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 9401377002223..442a90ec2472d 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -350,6 +350,8 @@ def err_address_of_label_outside_fn : Error< "use of address-of-label extension outside of a function body">; def err_asm_operand_wide_string_literal : Error< "cannot use %select{unicode|wide}0 string literal in 'asm'">; +def err_defer_ts_labeled_stmt : Error< + "substatement of defer must not be a label">; def err_asm_expected_string : Error< "expected string literal %select{or parenthesized constant expression |}0in 'asm'">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 28803829f387d..c79c208a07acd 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -6848,6 +6848,7 @@ def note_protected_by_objc_weak_init : Note< "jump bypasses initialization of __weak variable">; def note_protected_by_non_trivial_c_struct_init : Note< "jump bypasses initialization of variable of non-trivial C struct type">; +def note_protected_by_defer_stmt : Note<"jump bypasses defer statement">; def note_enters_block_captures_cxx_obj : Note< "jump enters lifetime of block which captures a destructible C++ object">; def note_enters_block_captures_strong : Note< @@ -6861,6 +6862,7 @@ def note_enters_compound_literal_scope : Note< "jump enters lifetime of a compound literal that is non-trivial to destruct">; def note_enters_statement_expression : Note< "jump enters a statement expression">; +def note_enters_defer_stmt : Note<"jump enters a defer statement">; def note_exits_cleanup : Note< "jump exits scope of variable with __attribute__((cleanup))">; @@ -6906,6 +6908,16 @@ def note_exits_block_captures_non_trivial_c_struct : Note< "to destroy">; def note_exits_compound_literal_scope : Note< "jump exits lifetime of a compound literal that is non-trivial to destruct">; +def note_exits_defer_stmt : Note<"jump exits a defer statement">; +def err_jump_out_of_defer_stmt : Error< + "cannot %enum_select{" + "%Break{break out of a}|" + "%Continue{continue loop outside of enclosing}|" + "%Return{return from a}|" + "%SEHLeave{__leave a}" + "}0 defer statement">; +def err_defer_invalid_sjlj : Error< + "cannot use %0 inside a defer statement">; 
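As a usage-level illustration of the two new diagnostics above (a minimal sketch, not taken from the patch's Sema test files; the function name and the jump buffer are invented for this example), code along the following lines is rejected when compiled with `-std=c23 -fdefer-ts`:

    #include <setjmp.h>

    static jmp_buf env; /* hypothetical jump buffer, for illustration only */

    void bad_jumps(void) {
      for (int i = 0; i < 4; i++) {
        _Defer {
          if (i == 0)
            break;       /* error: cannot break out of a defer statement */
        }
      }
      _Defer {
        longjmp(env, 1); /* error: cannot use 'longjmp' inside a defer statement */
      }
    }

Jumps that stay entirely within the deferred block (for example, a break whose target loop is itself inside the _Defer body) remain valid; only jumps that would leave the deferred statement, and calls into the setjmp/longjmp family, are diagnosed.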
def err_func_returning_qualified_void : ExtWarn< "function cannot return qualified void type %0">, @@ -11020,6 +11032,8 @@ def err_switch_explicit_conversion : Error< def err_switch_incomplete_class_type : Error< "switch condition has incomplete class type %0">; +// TODO: It ought to be possible to refactor these to be a single warning that +// uses %enum_select. def warn_empty_if_body : Warning< "if statement has empty body">, InGroup; def warn_empty_for_body : Warning< @@ -11030,6 +11044,8 @@ def warn_empty_while_body : Warning< "while loop has empty body">, InGroup; def warn_empty_switch_body : Warning< "switch statement has empty body">, InGroup; +def warn_empty_defer_body : Warning< + "defer statement has empty body">, InGroup; def note_empty_body_on_separate_line : Note< "put the semicolon on a separate line to silence this warning">; diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h index b27492d19a65b..043c184323876 100644 --- a/clang/include/clang/Basic/IdentifierTable.h +++ b/clang/include/clang/Basic/IdentifierTable.h @@ -77,7 +77,8 @@ enum TokenKey : unsigned { KEYNOZOS = 0x4000000, KEYHLSL = 0x8000000, KEYFIXEDPOINT = 0x10000000, - KEYMAX = KEYFIXEDPOINT, // The maximum key + KEYDEFERTS = 0x20000000, + KEYMAX = KEYDEFERTS, // The maximum key KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20, KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL & ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded. diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index e515c0cee79eb..093d2709e59f9 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -194,6 +194,7 @@ LANGOPT(NoSignedZero , 1, 0, Benign, "Permit Floating Point optimization wi LANGOPT(AllowRecip , 1, 0, Benign, "Permit Floating Point reciprocal") LANGOPT(ApproxFunc , 1, 0, Benign, "Permit Floating Point approximation") LANGOPT(NamedLoops , 1, 0, Benign, "Permit named break/continue") +LANGOPT(DeferTS , 1, 0, Benign, "C '_Defer' Technical Specification") ENUM_LANGOPT(ComplexRange, ComplexRangeKind, 3, CX_None, NotCompatible, "Enable use of range reduction for complex arithmetics.") diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index bf3686bb372d5..2d740425a3cb0 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -17,6 +17,7 @@ def ForStmt : StmtNode; def GotoStmt : StmtNode; def IndirectGotoStmt : StmtNode; def ReturnStmt : StmtNode; +def DeferStmt : StmtNode; def DeclStmt : StmtNode; def SwitchCase : StmtNode; def CaseStmt : StmtNode; diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 564d6010181cc..8240d395d3e8f 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -293,6 +293,7 @@ PUNCTUATOR(greatergreatergreater, ">>>") // CHAR8SUPPORT - This is a keyword if 'char8_t' is a built-in type // KEYFIXEDPOINT - This is a keyword according to the N1169 fixed point // extension. 
+// KEYDEFERTS - This is a keyword if the C '_Defer' TS is enabled // KEYZOS - This is a keyword in C/C++ on z/OS // KEYWORD(auto , KEYALL) @@ -441,6 +442,9 @@ KEYWORD(_Float16 , KEYALL) C23_KEYWORD(typeof , KEYGNU) C23_KEYWORD(typeof_unqual , 0) +// '_Defer' TS +KEYWORD(_Defer , KEYDEFERTS) + // ISO/IEC JTC1 SC22 WG14 N1169 Extension KEYWORD(_Accum , KEYFIXEDPOINT) KEYWORD(_Fract , KEYFIXEDPOINT) diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index e55146f0c7823..e704d9e6275ec 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -1671,6 +1671,14 @@ defm named_loops PosFlag, NegFlag>; +// C '_Defer' TS +defm defer_ts : BoolFOption<"defer-ts", + LangOpts<"DeferTS">, DefaultFalse, + PosFlag, + NegFlag>, + ShouldParseIf; + // C++ Coroutines defm coroutines : BoolFOption<"coroutines", LangOpts<"Coroutines">, Default, diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 58eb1c0a7c114..47eedf216a44b 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -7500,6 +7500,16 @@ class Parser : public CodeCompletionHandler { StmtResult ParseBreakOrContinueStatement(bool IsContinue); + /// ParseDeferStatement + /// \verbatim + /// defer-statement: + /// '_Defer' deferred-block + /// + /// deferred-block: + /// unlabeled-statement + /// \endverbatim + StmtResult ParseDeferStatement(SourceLocation *TrailingElseLoc); + StmtResult ParsePragmaLoopHint(StmtVector &Stmts, ParsedStmtContext StmtCtx, SourceLocation *TrailingElseLoc, ParsedAttributes &Attrs, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index d14b5dc5ffaa4..97b6bb3d1b3a8 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10935,6 +10935,10 @@ class Sema final : public SemaBase { /// Stack of active SEH __finally scopes. Can be empty. SmallVector CurrentSEHFinally; + /// Stack of '_Defer' statements that are currently being parsed, as well + /// as the locations of their '_Defer' keywords. Can be empty. 
+ SmallVector, 2> CurrentDefer; + StmtResult ActOnExprStmt(ExprResult Arg, bool DiscardedValue = true); StmtResult ActOnExprStmtError(); @@ -11081,6 +11085,10 @@ class Sema final : public SemaBase { StmtResult ActOnBreakStmt(SourceLocation BreakLoc, Scope *CurScope, LabelDecl *Label, SourceLocation LabelLoc); + void ActOnStartOfDeferStmt(SourceLocation DeferLoc, Scope *CurScope); + void ActOnDeferStmtError(Scope *CurScope); + StmtResult ActOnEndOfDeferStmt(Stmt *Body, Scope *CurScope); + struct NamedReturnInfo { const VarDecl *Candidate; diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index d7d429eacd67a..b48f02c601889 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -2061,6 +2061,7 @@ enum StmtCode { // HLSL Constructs EXPR_HLSL_OUT_ARG, + STMT_DEFER, }; /// The kinds of designators that can occur in a diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp index 11ece494490de..10aacd75a650a 100644 --- a/clang/lib/AST/Stmt.cpp +++ b/clang/lib/AST/Stmt.cpp @@ -1499,3 +1499,19 @@ const Stmt *LoopControlStmt::getNamedLoopOrSwitch() const { return nullptr; return getLabelDecl()->getStmt()->getInnermostLabeledStmt(); } + +DeferStmt::DeferStmt(EmptyShell Empty) : Stmt(DeferStmtClass, Empty) {} +DeferStmt::DeferStmt(SourceLocation DeferLoc, Stmt *Body) + : Stmt(DeferStmtClass) { + setDeferLoc(DeferLoc); + setBody(Body); +} + +DeferStmt *DeferStmt::CreateEmpty(ASTContext &Context, EmptyShell Empty) { + return new (Context) DeferStmt(Empty); +} + +DeferStmt *DeferStmt::Create(ASTContext &Context, SourceLocation DeferLoc, + Stmt *Body) { + return new (Context) DeferStmt(DeferLoc, Body); +} diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index ff8ca01ec5477..9bc5ee0c7f40e 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -491,6 +491,11 @@ void StmtPrinter::VisitBreakStmt(BreakStmt *Node) { if (Policy.IncludeNewlines) OS << NL; } +void StmtPrinter::VisitDeferStmt(DeferStmt *Node) { + Indent() << "_Defer"; + PrintControlledStmt(Node->getBody()); +} + void StmtPrinter::VisitReturnStmt(ReturnStmt *Node) { Indent() << "return"; if (Node->getRetValue()) { diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 4a8c638c85331..b6395a17547f7 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -323,6 +323,8 @@ void StmtProfiler::VisitReturnStmt(const ReturnStmt *S) { VisitStmt(S); } +void StmtProfiler::VisitDeferStmt(const DeferStmt *S) { VisitStmt(S); } + void StmtProfiler::VisitGCCAsmStmt(const GCCAsmStmt *S) { VisitStmt(S); ID.AddBoolean(S->isVolatile()); diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp index d1c959b9687c4..9b4019834c4be 100644 --- a/clang/lib/Basic/IdentifierTable.cpp +++ b/clang/lib/Basic/IdentifierTable.cpp @@ -164,6 +164,8 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts, return KS_Unknown; case KEYFIXEDPOINT: return LangOpts.FixedPoint ? KS_Enabled : KS_Disabled; + case KEYDEFERTS: + return LangOpts.DeferTS ? 
KS_Enabled : KS_Disabled; default: llvm_unreachable("Unknown KeywordStatus flag"); } diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 36be3295950b8..c050fd41ac0e9 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -114,6 +114,7 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::ContinueStmtClass: case Stmt::DefaultStmtClass: case Stmt::CaseStmtClass: + case Stmt::DeferStmtClass: case Stmt::SEHLeaveStmtClass: case Stmt::SYCLKernelCallStmtClass: llvm_unreachable("should have emitted these statements as simple"); @@ -539,6 +540,9 @@ bool CodeGenFunction::EmitSimpleStmt(const Stmt *S, case Stmt::CaseStmtClass: EmitCaseStmt(cast(*S), Attrs); break; + case Stmt::DeferStmtClass: + EmitDeferStmt(cast(*S)); + break; case Stmt::SEHLeaveStmtClass: EmitSEHLeaveStmt(cast(*S)); break; @@ -2000,6 +2004,87 @@ void CodeGenFunction::EmitDefaultStmt(const DefaultStmt &S, EmitStmt(S.getSubStmt()); } +namespace { +struct EmitDeferredStatement final : EHScopeStack::Cleanup { + const DeferStmt &Stmt; + EmitDeferredStatement(const DeferStmt *Stmt) : Stmt(*Stmt) {} + + void Emit(CodeGenFunction &CGF, Flags) override { + // Take care that any cleanups pushed by the body of a '_Defer' statement + // don't clobber the current cleanup slot value. + // + // Assume we have a scope that pushes a cleanup; when that scope is exited, + // we need to run that cleanup; this is accomplished by emitting the cleanup + // into a separate block and then branching to that block at scope exit. + // + // Where this gets complicated is if we exit the scope in multiple different + // ways; e.g. in a 'for' loop, we may exit the scope of its body by falling + // off the end (in which case we need to run the cleanup and then branch to + // the increment), or by 'break'ing out of the loop (in which case we need + // to run the cleanup and then branch to the loop exit block); in both cases + // we first branch to the cleanup block to run the cleanup, but the block we + // need to jump to *after* running the cleanup is different. + // + // This is accomplished using a local integer variable called the 'cleanup + // slot': before branching to the cleanup block, we store a value into that + // slot. Then, in the cleanup block, after running the cleanup, we load the + // value of that variable and 'switch' on it to branch to the appropriate + // continuation block. + // + // The problem that arises once '_Defer' statements are involved is that the + // body of a '_Defer' is an arbitrary statement which itself can create more + // cleanups. This means we may end up overwriting the cleanup slot before we + // ever have a chance to 'switch' on it, which means that once we *do* get + // to the 'switch', we end up in whatever block the cleanup code happened to + // pick as the default 'switch' exit label! + // + // That is, what is normally supposed to happen is something like: + // + // 1. Store 'X' to cleanup slot. + // 2. Branch to cleanup block. + // 3. Execute cleanup. + // 4. Read value from cleanup slot. + // 5. Branch to the block associated with 'X'. + // + // But if we encounter a _Defer' statement that contains a cleanup, then + // what might instead happen is: + // + // 1. Store 'X' to cleanup slot. + // 2. Branch to cleanup block. + // 3. Execute cleanup; this ends up pushing another cleanup, so: + // 3a. Store 'Y' to cleanup slot. + // 3b. Run steps 2–5 recursively. + // 4. Read value from cleanup slot, which is now 'Y' instead of 'X'. + // 5. 
Branch to the block associated with 'Y'... which doesn't even + // exist because the value 'Y' is only meaningful for the inner + // cleanup. The result is we just branch 'somewhere random'. + // + // The rest of the cleanup code simply isn't prepared to handle this case + // because most other cleanups can't push more cleanups, and thus, emitting + // other cleanups generally cannot clobber the cleanup slot. + // + // To prevent this from happening, save the current cleanup slot value and + // restore it after emitting the '_Defer' statement. + llvm::Value *SavedCleanupDest = nullptr; + if (CGF.NormalCleanupDest.isValid()) + SavedCleanupDest = + CGF.Builder.CreateLoad(CGF.NormalCleanupDest, "cleanup.dest.saved"); + + CGF.EmitStmt(Stmt.getBody()); + + if (SavedCleanupDest && CGF.HaveInsertPoint()) + CGF.Builder.CreateStore(SavedCleanupDest, CGF.NormalCleanupDest); + + // Cleanups must end with an insert point. + CGF.EnsureInsertPoint(); + } +}; +} // namespace + +void CodeGenFunction::EmitDeferStmt(const DeferStmt &S) { + EHStack.pushCleanup(NormalAndEHCleanup, &S); +} + /// CollectStatementsForCase - Given the body of a 'switch' statement and a /// constant value that is being switched on, see if we can dead code eliminate /// the body of the switch to a simple series of statements to emit. Basically, diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 664ee1547ccf1..10238ffd3971c 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3622,6 +3622,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitDefaultStmt(const DefaultStmt &S, ArrayRef Attrs); void EmitCaseStmt(const CaseStmt &S, ArrayRef Attrs); void EmitCaseStmtRange(const CaseStmt &S, ArrayRef Attrs); + void EmitDeferStmt(const DeferStmt &S); void EmitAsmStmt(const AsmStmt &S); const BreakContinue *GetDestForLoopControlStmt(const LoopControlStmt &S); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 542b70b3e9d4c..7119614634552 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7006,6 +7006,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, types::isCXX(InputType)) CmdArgs.push_back("-fcoro-aligned-allocation"); + if (Args.hasFlag(options::OPT_fdefer_ts, options::OPT_fno_defer_ts, + /*Default=*/false)) + CmdArgs.push_back("-fdefer-ts"); + Args.AddLastArg(CmdArgs, options::OPT_fdouble_square_bracket_attributes, options::OPT_fno_double_square_bracket_attributes); diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index fd464d68b5b42..8253fad9e5503 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -498,6 +498,11 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, Builder.defineMacro("__STDC_EMBED_EMPTY__", llvm::itostr(static_cast(EmbedResult::Empty))); + // We define this to '1' here to indicate that we only support '_Defer' + // as a keyword. 
+ if (LangOpts.DeferTS) + Builder.defineMacro("__STDC_DEFER_TS25755__", "1"); + if (LangOpts.ObjC) Builder.defineMacro("__OBJC__"); diff --git a/clang/lib/Headers/stddefer.h b/clang/lib/Headers/stddefer.h new file mode 100644 index 0000000000000..162876ddfa395 --- /dev/null +++ b/clang/lib/Headers/stddefer.h @@ -0,0 +1,19 @@ +/*===---- stddefer.h - Standard header for 'defer' -------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_STDDEFER_H +#define __CLANG_STDDEFER_H + +/* Provide 'defer' if '_Defer' is supported. */ +#ifdef __STDC_DEFER_TS25755__ +#define __STDC_VERSION_STDDEFER_H__ 202602L +#define defer _Defer +#endif + +#endif /* __CLANG_STDDEFER_H */ diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 7e73d89c2a18c..78ce4b76d29ae 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -28,6 +28,7 @@ #include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/TypoCorrection.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include using namespace clang; @@ -312,6 +313,8 @@ StmtResult Parser::ParseStatementOrDeclarationAfterAttributes( Res = ParseReturnStatement(); SemiError = "co_return"; break; + case tok::kw__Defer: // C defer TS: defer-statement + return ParseDeferStatement(TrailingElseLoc); case tok::kw_asm: { for (const ParsedAttr &AL : CXX11Attrs) @@ -2370,6 +2373,29 @@ StmtResult Parser::ParseReturnStatement() { return Actions.ActOnReturnStmt(ReturnLoc, R.get(), getCurScope()); } +StmtResult Parser::ParseDeferStatement(SourceLocation *TrailingElseLoc) { + assert(Tok.is(tok::kw__Defer)); + SourceLocation DeferLoc = ConsumeToken(); + + Actions.ActOnStartOfDeferStmt(DeferLoc, getCurScope()); + + auto OnError = llvm::make_scope_exit( + [&] { Actions.ActOnDeferStmtError(getCurScope()); }); + + StmtResult Res = ParseStatement(TrailingElseLoc); + if (!Res.isUsable()) + return StmtError(); + + // The grammar specifically calls for an unlabeled-statement here. + if (auto *L = dyn_cast(Res.get())) { + Diag(L->getIdentLoc(), diag::err_defer_ts_labeled_stmt); + return StmtError(); + } + + OnError.release(); + return Actions.ActOnEndOfDeferStmt(Res.get(), getCurScope()); +} + StmtResult Parser::ParsePragmaLoopHint(StmtVector &Stmts, ParsedStmtContext StmtCtx, SourceLocation *TrailingElseLoc, diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp index 36704c3826dfd..36c9d9afb37f1 100644 --- a/clang/lib/Sema/JumpDiagnostics.cpp +++ b/clang/lib/Sema/JumpDiagnostics.cpp @@ -590,6 +590,27 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, break; } + case Stmt::DeferStmtClass: { + auto *D = cast(S); + + { + // Disallow jumps over defer statements. + unsigned NewParentScope = Scopes.size(); + Scopes.emplace_back(ParentScope, diag::note_protected_by_defer_stmt, 0, + D->getDeferLoc()); + origParentScope = NewParentScope; + } + + // Disallow jumps into or out of defer statements. 
+ { + unsigned NewParentScope = Scopes.size(); + Scopes.emplace_back(ParentScope, diag::note_enters_defer_stmt, + diag::note_exits_defer_stmt, D->getDeferLoc()); + BuildScopeInformation(D->getBody(), NewParentScope); + } + return; + } + case Stmt::CaseStmtClass: case Stmt::DefaultStmtClass: case Stmt::LabelStmtClass: @@ -972,7 +993,7 @@ void JumpScopeChecker::CheckJump(Stmt *From, Stmt *To, SourceLocation DiagLoc, // Common case: exactly the same scope, which is fine. if (FromScope == ToScope) return; - // Warn on gotos out of __finally blocks. + // Warn on gotos out of __finally blocks and defer statements. if (isa(From) || isa(From)) { // If FromScope > ToScope, FromScope is more nested and the jump goes to a // less nested scope. Check if it crosses a __finally along the way. @@ -990,6 +1011,10 @@ void JumpScopeChecker::CheckJump(Stmt *From, Stmt *To, SourceLocation DiagLoc, S.Diag(From->getBeginLoc(), diag::err_goto_into_protected_scope); S.Diag(Scopes[I].Loc, diag::note_acc_branch_out_of_compute_construct); return; + } else if (Scopes[I].OutDiag == diag::note_exits_defer_stmt) { + S.Diag(From->getBeginLoc(), diag::err_goto_into_protected_scope); + S.Diag(Scopes[I].Loc, diag::note_exits_defer_stmt); + return; } } } diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index a0483c3027199..b5ff1dbd26d68 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1538,6 +1538,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::SEHTryStmtClass: case Stmt::SwitchStmtClass: case Stmt::WhileStmtClass: + case Stmt::DeferStmtClass: return canSubStmtsThrow(*this, S); case Stmt::DeclStmtClass: { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index a8c2e39b49923..5836587a6ffa5 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6860,6 +6860,34 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl, FunctionDecl *FDecl = dyn_cast_or_null(NDecl); unsigned BuiltinID = (FDecl ? FDecl->getBuiltinID() : 0); + auto IsSJLJ = [&] { + switch (BuiltinID) { + case Builtin::BI__builtin_longjmp: + case Builtin::BI__builtin_setjmp: + case Builtin::BI__sigsetjmp: + case Builtin::BI_longjmp: + case Builtin::BI_setjmp: + case Builtin::BIlongjmp: + case Builtin::BIsetjmp: + case Builtin::BIsiglongjmp: + case Builtin::BIsigsetjmp: + return true; + default: + return false; + } + }; + + // Forbid any call to setjmp/longjmp and friends inside a '_Defer' statement. + if (!CurrentDefer.empty() && IsSJLJ()) { + // Note: If we ever start supporting '_Defer' in C++ we'll have to check + // for more than just blocks (e.g. lambdas, nested classes...). + Scope *DeferParent = CurrentDefer.back().first; + Scope *Block = CurScope->getBlockParent(); + if (DeferParent->Contains(*CurScope) && + (!Block || !DeferParent->Contains(*Block))) + Diag(Fn->getExprLoc(), diag::err_defer_invalid_sjlj) << FDecl; + } + // Functions with 'interrupt' attribute cannot be called directly. 
if (FDecl) { if (FDecl->hasAttr()) { diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 6bb1a27d1800c..1b1643250d05e 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3267,12 +3267,23 @@ Sema::ActOnIndirectGotoStmt(SourceLocation GotoLoc, SourceLocation StarLoc, return new (Context) IndirectGotoStmt(GotoLoc, StarLoc, E); } -static void CheckJumpOutOfSEHFinally(Sema &S, SourceLocation Loc, - const Scope &DestScope) { +static void CheckJumpOutOfSEHFinallyOrDefer(Sema &S, SourceLocation Loc, + const Scope &DestScope, + unsigned DeferJumpKind) { if (!S.CurrentSEHFinally.empty() && DestScope.Contains(*S.CurrentSEHFinally.back())) { S.Diag(Loc, diag::warn_jump_out_of_seh_finally); } + + if (!S.CurrentDefer.empty()) { + Scope *Parent = S.CurrentDefer.back().first; + assert(Parent); + + // Note: We don't create a new scope for defer statements, so 'Parent' + // is actually the scope that contains the '_Defer'. + if (DestScope.Contains(*Parent) || &DestScope == Parent) + S.Diag(Loc, diag::err_jump_out_of_defer_stmt) << DeferJumpKind; + } } static Scope *FindLabeledBreakContinueScope(Sema &S, Scope *CurScope, @@ -3346,7 +3357,8 @@ StmtResult Sema::ActOnContinueStmt(SourceLocation ContinueLoc, Scope *CurScope, Diag(ContinueLoc, diag::err_acc_branch_in_out_compute_construct) << /*branch*/ 0 << /*out of */ 0); - CheckJumpOutOfSEHFinally(*this, ContinueLoc, *S); + CheckJumpOutOfSEHFinallyOrDefer(*this, ContinueLoc, *S, + diag::DeferJumpKind::Continue); return new (Context) ContinueStmt(ContinueLoc, LabelLoc, Target); } @@ -3387,7 +3399,8 @@ StmtResult Sema::ActOnBreakStmt(SourceLocation BreakLoc, Scope *CurScope, Diag(BreakLoc, diag::err_acc_branch_in_out_compute_construct) << /*branch*/ 0 << /*out of */ 0); - CheckJumpOutOfSEHFinally(*this, BreakLoc, *S); + CheckJumpOutOfSEHFinallyOrDefer(*this, BreakLoc, *S, + diag::DeferJumpKind::Break); return new (Context) BreakStmt(BreakLoc, LabelLoc, Target); } @@ -3932,11 +3945,30 @@ Sema::ActOnReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp, CurScope->updateNRVOCandidate(VD); - CheckJumpOutOfSEHFinally(*this, ReturnLoc, *CurScope->getFnParent()); + CheckJumpOutOfSEHFinallyOrDefer(*this, ReturnLoc, *CurScope->getFnParent(), + diag::DeferJumpKind::Return); return R; } +void Sema::ActOnStartOfDeferStmt(SourceLocation DeferLoc, Scope *CurScope) { + CurrentDefer.emplace_back(CurScope, DeferLoc); +} + +void Sema::ActOnDeferStmtError([[maybe_unused]] Scope *CurScope) { + assert(!CurrentDefer.empty() && CurrentDefer.back().first == CurScope); + CurrentDefer.pop_back(); +} + +StmtResult Sema::ActOnEndOfDeferStmt(Stmt *Body, + [[maybe_unused]] Scope *CurScope) { + assert(!CurrentDefer.empty() && CurrentDefer.back().first == CurScope); + SourceLocation DeferLoc = CurrentDefer.pop_back_val().second; + DiagnoseEmptyStmtBody(DeferLoc, Body, diag::warn_empty_defer_body); + setFunctionHasBranchProtectedScope(); + return DeferStmt::Create(Context, DeferLoc, Body); +} + static bool CheckSimplerImplicitMovesMSVCWorkaround(const Sema &S, const Expr *E) { if (!E || !S.getLangOpts().CPlusPlus23 || !S.getLangOpts().MSVCCompat) @@ -4554,7 +4586,8 @@ Sema::ActOnSEHLeaveStmt(SourceLocation Loc, Scope *CurScope) { SEHTryParent = SEHTryParent->getParent(); if (!SEHTryParent) return StmtError(Diag(Loc, diag::err_ms___leave_not_in___try)); - CheckJumpOutOfSEHFinally(*this, Loc, *SEHTryParent); + CheckJumpOutOfSEHFinallyOrDefer(*this, Loc, *SEHTryParent, + diag::DeferJumpKind::SEHLeave); return new (Context) SEHLeaveStmt(Loc); } 
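To make the semantics implemented by the Sema changes above concrete, here is a minimal usage sketch (not one of the patch's test cases; it assumes an invocation like `clang -std=c23 -fdefer-ts`, and the acquire/release helpers are hypothetical). Each deferred body runs when the enclosing block exits, whether by reaching the end of the block or by an early return, and multiple deferred bodies run in reverse order:

    #include <stdio.h>

    /* 'acquire' and 'release' are made-up placeholders for this sketch. */
    static void acquire(const char *name) { printf("acquire %s\n", name); }
    static void release(const char *name) { printf("release %s\n", name); }

    int use_resources(void) {
      acquire("a");
      _Defer release("a");   /* deferred until the block exits */

      acquire("b");
      _Defer { release("b"); }

      /* ... work that may return early ... */
      return 0;              /* prints "release b", then "release a" */
    }

With the new <stddefer.h> header added by this patch, the same statements can be spelled with the lowercase `defer` macro instead of the `_Defer` keyword.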
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 8e5dbeb792348..d5b6fdd7dc405 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -8552,6 +8552,14 @@ TreeTransform::TransformBreakStmt(BreakStmt *S) { BreakStmt(S->getKwLoc(), S->getLabelLoc(), cast(LD)); } +template +StmtResult TreeTransform::TransformDeferStmt(DeferStmt *S) { + StmtResult Result = getDerived().TransformStmt(S->getBody()); + if (!Result.isUsable()) + return StmtError(); + return DeferStmt::Create(getSema().Context, S->getDeferLoc(), Result.get()); +} + template StmtResult TreeTransform::TransformReturnStmt(ReturnStmt *S) { diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index eef97a8588f0b..495517ccb31f3 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -335,6 +335,12 @@ void ASTStmtReader::VisitContinueStmt(ContinueStmt *S) { void ASTStmtReader::VisitBreakStmt(BreakStmt *S) { VisitLoopControlStmt(S); } +void ASTStmtReader::VisitDeferStmt(DeferStmt *S) { + VisitStmt(S); + S->setDeferLoc(readSourceLocation()); + S->setBody(Record.readSubStmt()); +} + void ASTStmtReader::VisitReturnStmt(ReturnStmt *S) { VisitStmt(S); @@ -3146,6 +3152,10 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = new (Context) BreakStmt(Empty); break; + case STMT_DEFER: + S = DeferStmt::CreateEmpty(Context, Empty); + break; + case STMT_RETURN: S = ReturnStmt::CreateEmpty( Context, /* HasNRVOCandidate=*/Record[ASTStmtReader::NumStmtFields]); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index acf345392aa1a..a457e627799c9 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -330,6 +330,13 @@ void ASTStmtWriter::VisitBreakStmt(BreakStmt *S) { Code = serialization::STMT_BREAK; } +void ASTStmtWriter::VisitDeferStmt(DeferStmt *S) { + VisitStmt(S); + Record.AddSourceLocation(S->getDeferLoc()); + Record.AddStmt(S->getBody()); + Code = serialization::STMT_DEFER; +} + void ASTStmtWriter::VisitReturnStmt(ReturnStmt *S) { VisitStmt(S); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index a759aee47b8ea..d3de632179e1d 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1874,6 +1874,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::NullStmtClass: case Stmt::SwitchStmtClass: case Stmt::WhileStmtClass: + case Stmt::DeferStmtClass: case Expr::MSDependentExistsStmtClass: llvm_unreachable("Stmt should not be in analyzer evaluation loop"); case Stmt::ImplicitValueInitExprClass: diff --git a/clang/test/AST/ast-dump-defer-ts.c b/clang/test/AST/ast-dump-defer-ts.c new file mode 100644 index 0000000000000..eba057f93c9c2 --- /dev/null +++ b/clang/test/AST/ast-dump-defer-ts.c @@ -0,0 +1,27 @@ +// Test without serialization: +// RUN: %clang_cc1 -std=c23 -fdefer-ts -ast-dump %s -triple x86_64-linux-gnu \ +// RUN: | FileCheck %s +// +// Test with serialization: +// RUN: %clang_cc1 -std=c23 -fdefer-ts -triple x86_64-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -std=c23 -fdefer-ts -triple x86_64-linux-gnu -include-pch %t -ast-dump-all /dev/null \ +// RUN: | FileCheck %s + +static inline void f() { + _Defer 3; + _Defer { 4; } + _Defer _Defer if (true) {} +} + +// CHECK-LABEL: f 'void (void)' static inline +// CHECK-NEXT: `-CompoundStmt {{.*}} +// CHECK-NEXT: 
|-DeferStmt {{.*}} +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 3 +// CHECK-NEXT: |-DeferStmt {{.*}} +// CHECK-NEXT: | `-CompoundStmt {{.*}} +// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 4 +// CHECK-NEXT: `-DeferStmt {{.*}} +// CHECK-NEXT: `-DeferStmt {{.*}} +// CHECK-NEXT: `-IfStmt {{.*}} +// CHECK-NEXT: |-CXXBoolLiteralExpr {{.*}} 'bool' true +// CHECK-NEXT: `-CompoundStmt {{.*}} diff --git a/clang/test/AST/ast-print-defer-ts.c b/clang/test/AST/ast-print-defer-ts.c new file mode 100644 index 0000000000000..bcc217a597778 --- /dev/null +++ b/clang/test/AST/ast-print-defer-ts.c @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -std=c23 -fdefer-ts -ast-print %s | FileCheck %s + +void g(); + +// CHECK: void f +void f() { + // CHECK-NEXT: _Defer + // CHECK-NEXT: g(); + // CHECK-NEXT: _Defer + // CHECK-NEXT: _Defer + // CHECK-NEXT: g(); + // CHECK-NEXT: _Defer { + // CHECK-NEXT: } + // CHECK-NEXT: _Defer { + // CHECK-NEXT: int x; + // CHECK-NEXT: } + // CHECK-NEXT: _Defer + // CHECK-NEXT: if (1) { + // CHECK-NEXT: } + _Defer + g(); + _Defer + _Defer + g(); + _Defer { + } + _Defer { + int x; + } + _Defer + if (1) { + } +} diff --git a/clang/test/CodeGen/defer-ts-musttail.c b/clang/test/CodeGen/defer-ts-musttail.c new file mode 100644 index 0000000000000..5622fecbb4fed --- /dev/null +++ b/clang/test/CodeGen/defer-ts-musttail.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -std=c23 -fdefer-ts -emit-llvm %s -o /dev/null -verify + +int bar() { return 12; } +int foo() { + _Defer {}; + [[clang::musttail]] return bar(); // expected-error {{cannot compile this tail call skipping over cleanups yet}} +} diff --git a/clang/test/CodeGen/defer-ts-nested-cleanups.c b/clang/test/CodeGen/defer-ts-nested-cleanups.c new file mode 100644 index 0000000000000..d831b4380b929 --- /dev/null +++ b/clang/test/CodeGen/defer-ts-nested-cleanups.c @@ -0,0 +1,179 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -std=c23 -fdefer-ts -emit-llvm %s -o - -O1 -disable-llvm-passes | FileCheck %s + +// Test that cleanups emitted in a '_Defer' don't clobber the cleanup slot; we +// test this using lifetime intrinsics, which are emitted starting at -O1. 
+ +void g(); + +// CHECK-LABEL: define {{.*}} void @f1() +// CHECK: entry: +// CHECK-NEXT: %i = alloca i32, align 4 +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: %j = alloca i32, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %i) +// CHECK-NEXT: store i32 0, ptr %i, align 4 +// CHECK-NEXT: br label %for.cond +// CHECK: for.cond: +// CHECK-NEXT: %0 = load i32, ptr %i, align 4 +// CHECK-NEXT: %cmp = icmp eq i32 %0, 1 +// CHECK-NEXT: br i1 %cmp, label %if.then, label %if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %j) +// CHECK-NEXT: store i32 0, ptr %j, align 4 +// CHECK-NEXT: br label %for.cond1 +// CHECK: for.cond1: +// CHECK-NEXT: %1 = load i32, ptr %j, align 4 +// CHECK-NEXT: %cmp2 = icmp ne i32 %1, 1 +// CHECK-NEXT: br i1 %cmp2, label %for.body, label %for.cond.cleanup +// CHECK: for.cond.cleanup: +// CHECK-NEXT: store i32 5, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr %j) +// CHECK-NEXT: br label %for.end +// CHECK: for.body: +// CHECK-NEXT: call void @g() +// CHECK-NEXT: br label %for.inc +// CHECK: for.inc: +// CHECK-NEXT: %2 = load i32, ptr %j, align 4 +// CHECK-NEXT: %inc = add nsw i32 %2, 1 +// CHECK-NEXT: store i32 %inc, ptr %j, align 4 +// CHECK-NEXT: br label %for.cond1 +// CHECK: for.end: +// CHECK-NEXT: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %cleanup6 [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: br label %for.inc4 +// CHECK: for.inc4: +// CHECK-NEXT: %3 = load i32, ptr %i, align 4 +// CHECK-NEXT: %inc5 = add nsw i32 %3, 1 +// CHECK-NEXT: store i32 %inc5, ptr %i, align 4 +// CHECK-NEXT: br label %for.cond +// CHECK: cleanup6: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr %i) +// CHECK-NEXT: br label %for.end7 +// CHECK: for.end7: +// CHECK-NEXT: ret void +void f1() { + for (int i = 0;; i++) { + _Defer { + for (int j = 0; j != 1; j++) { + g(); + } + } + if (i == 1) break; + } +} + +// CHECK-LABEL: define {{.*}} void @f2() +// CHECK: entry: +// CHECK-NEXT: %i = alloca i32, align 4 +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: %j = alloca i32, align 4 +// CHECK-NEXT: %k = alloca i32, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %i) +// CHECK-NEXT: store i32 0, ptr %i, align 4 +// CHECK-NEXT: br label %for.cond +// CHECK: for.cond: +// CHECK-NEXT: %0 = load i32, ptr %i, align 4 +// CHECK-NEXT: %cmp = icmp eq i32 %0, 1 +// CHECK-NEXT: br i1 %cmp, label %if.then, label %if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %j) +// CHECK-NEXT: store i32 0, ptr %j, align 4 +// CHECK-NEXT: br label %for.cond1 +// CHECK: for.cond1: +// CHECK-NEXT: %1 = load i32, ptr %j, align 4 +// 
CHECK-NEXT: %cmp2 = icmp eq i32 %1, 1 +// CHECK-NEXT: br i1 %cmp2, label %if.then3, label %if.end4 +// CHECK: if.then3: +// CHECK-NEXT: store i32 5, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup5 +// CHECK: if.end4: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup5 +// CHECK: cleanup5: +// CHECK-NEXT: %cleanup.dest.saved6 = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr %k) +// CHECK-NEXT: store i32 0, ptr %k, align 4 +// CHECK-NEXT: br label %for.cond7 +// CHECK: for.cond7: +// CHECK-NEXT: %2 = load i32, ptr %k, align 4 +// CHECK-NEXT: %cmp8 = icmp ne i32 %2, 1 +// CHECK-NEXT: br i1 %cmp8, label %for.body, label %for.cond.cleanup +// CHECK: for.cond.cleanup: +// CHECK-NEXT: store i32 8, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr %k) +// CHECK-NEXT: br label %for.end +// CHECK: for.body: +// CHECK-NEXT: call void @g() +// CHECK-NEXT: br label %for.inc +// CHECK: for.inc: +// CHECK-NEXT: %3 = load i32, ptr %k, align 4 +// CHECK-NEXT: %inc = add nsw i32 %3, 1 +// CHECK-NEXT: store i32 %inc, ptr %k, align 4 +// CHECK-NEXT: br label %for.cond7 +// CHECK: for.end: +// CHECK-NEXT: store i32 %cleanup.dest.saved6, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %cleanup12 [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: br label %for.inc10 +// CHECK: for.inc10: +// CHECK-NEXT: %4 = load i32, ptr %j, align 4 +// CHECK-NEXT: %inc11 = add nsw i32 %4, 1 +// CHECK-NEXT: store i32 %inc11, ptr %j, align 4 +// CHECK-NEXT: br label %for.cond1 +// CHECK: cleanup12: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr %j) +// CHECK-NEXT: br label %for.end13 +// CHECK: for.end13: +// CHECK-NEXT: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest14 = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest14, label %cleanup18 [ +// CHECK-NEXT: i32 0, label %cleanup.cont15 +// CHECK-NEXT: ] +// CHECK: cleanup.cont15: +// CHECK-NEXT: br label %for.inc16 +// CHECK: for.inc16: +// CHECK-NEXT: %5 = load i32, ptr %i, align 4 +// CHECK-NEXT: %inc17 = add nsw i32 %5, 1 +// CHECK-NEXT: store i32 %inc17, ptr %i, align 4 +// CHECK-NEXT: br label %for.cond +// CHECK: cleanup18: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr %i) +// CHECK-NEXT: br label %for.end19 +// CHECK: for.end19: +// CHECK-NEXT: ret void +void f2() { + for (int i = 0;; i++) { + _Defer { + for (int j = 0;; j++) { + _Defer { + for (int k = 0; k != 1; k++) { + g(); + } + } + if (j == 1) break; + } + } + if (i == 1) break; + } +} diff --git a/clang/test/CodeGen/defer-ts-seh.c b/clang/test/CodeGen/defer-ts-seh.c new file mode 100644 index 0000000000000..a91816f50d8d5 --- /dev/null +++ b/clang/test/CodeGen/defer-ts-seh.c @@ -0,0 +1,44 @@ +// RUN: %clang_cc1 -triple x86_64-windows-msvc -std=c23 -fdefer-ts -fms-compatibility -emit-llvm %s -o - | FileCheck %s + +void g(); +void h(); + +void f() { + __try { + _Defer h(); + g(); + } __finally { + + } +} + +// CHECK-LABEL: define {{.*}} void @f() {{.*}} personality ptr @__C_specific_handler +// CHECK: entry: +// CHECK: invoke void @g() #4 +// CHECK: to label %invoke.cont unwind label %ehcleanup +// CHECK: invoke.cont: +// CHECK: invoke void @h() #4 +// CHECK: to label %invoke.cont1 unwind label %ehcleanup3 +// CHECK: invoke.cont1: 
+// CHECK: %0 = call ptr @llvm.localaddress() +// CHECK: call void @"?fin$0@0@f@@"(i8 {{.*}} 0, ptr {{.*}} %0) +// CHECK: ret void +// CHECK: ehcleanup: +// CHECK: %1 = cleanuppad within none [] +// CHECK: invoke void @h() #4 [ "funclet"(token %1) ] +// CHECK: to label %invoke.cont2 unwind label %ehcleanup3 +// CHECK: invoke.cont2: +// CHECK: cleanupret from %1 unwind label %ehcleanup3 +// CHECK: ehcleanup3: +// CHECK: %2 = cleanuppad within none [] +// CHECK: %3 = call ptr @llvm.localaddress() +// CHECK: call void @"?fin$0@0@f@@"(i8 {{.*}} 1, ptr {{.*}} %3) [ "funclet"(token %2) ] +// CHECK: cleanupret from %2 unwind to caller + +// CHECK-LABEL: define {{.*}} void @"?fin$0@0@f@@"(i8 {{.*}} %abnormal_termination, ptr {{.*}} %frame_pointer) +// CHECK: entry: +// CHECK: %frame_pointer.addr = alloca ptr, align 8 +// CHECK: %abnormal_termination.addr = alloca i8, align 1 +// CHECK: store ptr %frame_pointer, ptr %frame_pointer.addr, align 8 +// CHECK: store i8 %abnormal_termination, ptr %abnormal_termination.addr, align 1 +// CHECK: ret void diff --git a/clang/test/CodeGen/defer-ts.c b/clang/test/CodeGen/defer-ts.c new file mode 100644 index 0000000000000..79b09064d330c --- /dev/null +++ b/clang/test/CodeGen/defer-ts.c @@ -0,0 +1,652 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -std=c23 -fdefer-ts -emit-llvm %s -o - | FileCheck %s + +#define defer _Defer + +void a(); +void b(); +void c(); +void x(int q); +bool q(int q); +[[noreturn]] void noreturn(); + +// CHECK-LABEL: define {{.*}} void @f1() +void f1() { + // CHECK: call void @c() + // CHECK: call void @b() + // CHECK: call void @a() + defer a(); + defer b(); + defer c(); +} + +// CHECK-LABEL: define {{.*}} void @f2() +void f2() { + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: call void @x(i32 {{.*}} 2) + // CHECK: call void @x(i32 {{.*}} 3) + // CHECK: call void @x(i32 {{.*}} 4) + // CHECK: call void @x(i32 {{.*}} 5) + defer x(5); + { + defer x(4); + { + defer x(2); + defer x(1); + } + x(3); + } +} + +// CHECK-LABEL: define {{.*}} void @f3(i1 {{.*}} %ret) +void f3(bool ret) { + // CHECK: entry: + // CHECK: %ret.addr = alloca i8, align 1 + // CHECK: %cleanup.dest.slot = alloca i32, align 4 + // CHECK: %storedv = zext i1 %ret to i8 + // CHECK: store i8 %storedv, ptr %ret.addr, align 1 + // CHECK: %0 = load i8, ptr %ret.addr, align 1 + // CHECK: %loadedv = trunc i8 %0 to i1 + // CHECK: br i1 %loadedv, label %if.then, label %if.end + // CHECK: if.then: + // CHECK: store i32 1, ptr %cleanup.dest.slot, align 4 + // CHECK: br label %cleanup + // CHECK: if.end: + // CHECK: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 + // CHECK: store i32 0, ptr %cleanup.dest.slot, align 4 + // CHECK: br label %cleanup + // CHECK: cleanup: + // CHECK: %cleanup.dest.saved1 = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: call void @x(i32 {{.*}} 2) + // CHECK: store i32 %cleanup.dest.saved1, ptr %cleanup.dest.slot, align 4 + // CHECK: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: switch i32 %cleanup.dest, label %unreachable [ + // CHECK: i32 0, label %cleanup.cont + // CHECK: i32 1, label %cleanup.cont + // CHECK: ] + // CHECK: cleanup.cont: + // CHECK: ret void + // CHECK: unreachable: + // CHECK: unreachable + defer x(2); + if (ret) return; + defer x(1); +} + +// CHECK-LABEL: define {{.*}} void @ts_g() +void ts_g() { + // CHECK-NEXT: entry: + // CHECK-NEXT: ret void + // CHECK-NEXT: } + return; 
+ defer x(42); +} + +// CHECK-LABEL: define {{.*}} void @ts_h() +void ts_h() { + // CHECK-NEXT: entry: + // CHECK-NEXT: br label %b + // CHECK-EMPTY: + goto b; + { + defer x(42); + } + + // CHECK-NEXT: b: + // CHECK-NEXT: ret void + // CHECK-NEXT: } + b: +} + +// CHECK-LABEL: define {{.*}} void @ts_i() +void ts_i() { + // CHECK: entry: + // CHECK: %cleanup.dest.slot = alloca i32, align 4 + // CHECK: store i32 2, ptr %cleanup.dest.slot, align 4 + // CHECK: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: call void @x(i32 {{.*}} 42) + // CHECK: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 + // CHECK: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: switch i32 %cleanup.dest, label %unreachable [ + // CHECK: i32 2, label %b + // CHECK: ] + // CHECK: b: + // CHECK: ret void + // CHECK: unreachable: + // CHECK: unreachable + { + defer { x(42); } + goto b; + } + b: +} + + +// CHECK-LABEL: define {{.*}} void @ts_m() +void ts_m() { + // CHECK: entry: + // CHECK: br label %b + // CHECK: b: + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: ret void + goto b; + { + b: + defer x(1); + } +} + +// CHECK-LABEL: define {{.*}} void @ts_p() +void ts_p() { + // CHECK: entry: + // CHECK: br label %b + // CHECK: b: + // CHECK: ret void + { + goto b; + defer x(42); + } + b: +} + +// CHECK-LABEL: define {{.*}} void @ts_r() +void ts_r() { + // CHECK: entry: + // CHECK: br label %b + // CHECK: b: + // CHECK: call void @x(i32 {{.*}} 42) + // CHECK: br label %b + { + b: + defer x(42); + } + goto b; +} + +// CHECK-LABEL: define {{.*}} i32 @return_value() +int return_value() { + // CHECK: entry: + // CHECK: %r = alloca i32, align 4 + // CHECK: %p = alloca ptr, align 8 + // CHECK: store i32 4, ptr %r, align 4 + // CHECK: store ptr %r, ptr %p, align 8 + // CHECK: %0 = load ptr, ptr %p, align 8 + // CHECK: %1 = load i32, ptr %0, align 4 + // CHECK: %2 = load ptr, ptr %p, align 8 + // CHECK: store i32 5, ptr %2, align 4 + // CHECK: ret i32 %1 + int r = 4; + int* p = &r; + defer { *p = 5; } + return *p; +} + +void* malloc(__SIZE_TYPE__ size); +void free(void* ptr); +int use_buffer(__SIZE_TYPE__ size, void* ptr); + +// CHECK-LABEL: define {{.*}} i32 @malloc_free_example() +int malloc_free_example() { + // CHECK: entry: + // CHECK: %size = alloca i32, align 4 + // CHECK: %buf = alloca ptr, align 8 + // CHECK: store i32 20, ptr %size, align 4 + // CHECK: %call = call ptr @malloc(i64 {{.*}} 20) + // CHECK: store ptr %call, ptr %buf, align 8 + // CHECK: %0 = load ptr, ptr %buf, align 8 + // CHECK: %call1 = call i32 @use_buffer(i64 {{.*}} 20, ptr {{.*}} %0) + // CHECK: %1 = load ptr, ptr %buf, align 8 + // CHECK: call void @free(ptr {{.*}} %1) + // CHECK: ret i32 %call1 + const int size = 20; + void* buf = malloc(size); + defer { free(buf); } + return use_buffer(size, buf); +} + +// CHECK-LABEL: define {{.*}} void @sequencing_1() +void sequencing_1() { + // CHECK: entry: + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: call void @x(i32 {{.*}} 2) + // CHECK: call void @x(i32 {{.*}} 3) + // CHECK: ret void + { + defer { + x(3); + } + if (true) + defer x(1); + x(2); + } +} + +// CHECK-LABEL: define {{.*}} void @sequencing_2() +void sequencing_2() { + // CHECK: entry: + // CHECK: %arr = alloca [3 x i32], align 4 + // CHECK: %i = alloca i32, align 4 + // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %arr, ptr align 4 @__const.sequencing_2.arr, i64 12, i1 false) + // CHECK: store i32 0, ptr %i, align 4 + // CHECK: br label %for.cond + // CHECK: for.cond: + // 
CHECK: %0 = load i32, ptr %i, align 4 + // CHECK: %cmp = icmp ult i32 %0, 3 + // CHECK: br i1 %cmp, label %for.body, label %for.end + // CHECK: for.body: + // CHECK: %1 = load i32, ptr %i, align 4 + // CHECK: %idxprom = zext i32 %1 to i64 + // CHECK: %arrayidx = getelementptr inbounds nuw [3 x i32], ptr %arr, i64 0, i64 %idxprom + // CHECK: %2 = load i32, ptr %arrayidx, align 4 + // CHECK: call void @x(i32 {{.*}} %2) + // CHECK: br label %for.inc + // CHECK: for.inc: + // CHECK: %3 = load i32, ptr %i, align 4 + // CHECK: %inc = add i32 %3, 1 + // CHECK: store i32 %inc, ptr %i, align 4 + // CHECK: br label %for.cond + // CHECK: for.end: + // CHECK: call void @x(i32 {{.*}} 4) + // CHECK: call void @x(i32 {{.*}} 5) + // CHECK: ret void + { + int arr[] = {1, 2, 3}; + defer { + x(5); + } + for (unsigned i = 0; i < 3; ++i) + defer x(arr[i]); + x(4); + } +} + +// CHECK-LABEL: define {{.*}} void @sequencing_3() +void sequencing_3() { + // CHECK: entry: + // CHECK: %r = alloca i32, align 4 + // CHECK: store i32 0, ptr %r, align 4 + // CHECK: %0 = load i32, ptr %r, align 4 + // CHECK: %add = add nsw i32 %0, 1 + // CHECK: store i32 %add, ptr %r, align 4 + // CHECK: %1 = load i32, ptr %r, align 4 + // CHECK: %mul = mul nsw i32 %1, 2 + // CHECK: store i32 %mul, ptr %r, align 4 + // CHECK: %2 = load i32, ptr %r, align 4 + // CHECK: %add1 = add nsw i32 %2, 3 + // CHECK: store i32 %add1, ptr %r, align 4 + // CHECK: %3 = load i32, ptr %r, align 4 + // CHECK: %mul2 = mul nsw i32 %3, 4 + // CHECK: store i32 %mul2, ptr %r, align 4 + // CHECK: ret void + int r = 0; + { + defer { + defer r *= 4; + r *= 2; + defer { + r += 3; + } + } + defer r += 1; + } +} + +// CHECK-LABEL: define {{.*}} void @defer_stmt(i32 {{.*}} %q) +void defer_stmt(int q) { + // CHECK: entry: + // CHECK: %q.addr = alloca i32, align 4 + // CHECK: store i32 %q, ptr %q.addr, align 4 + // CHECK: %0 = load i32, ptr %q.addr, align 4 + // CHECK: %cmp = icmp eq i32 %0, 3 + // CHECK: br i1 %cmp, label %if.then, label %if.end + // CHECK: if.then: + // CHECK: call void @x(i32 {{.*}} 42) + // CHECK: br label %if.end + // CHECK: if.end: + // CHECK: ret void + defer if (q == 3) x(42); +} + +// CHECK-LABEL: define {{.*}} void @defer_defer() +void defer_defer() { + // CHECK: entry: + // CHECK: call void @x(i32 {{.*}} 0) + // CHECK: call void @x(i32 {{.*}} 1) + // CHECK: call void @x(i32 {{.*}} 2) + // CHECK: call void @x(i32 {{.*}} 3) + // CHECK: call void @x(i32 {{.*}} 4) + // CHECK: ret void + defer x(4); + defer defer x(3); + defer defer defer x(2); + defer defer defer defer x(1); + x(0); +} + +// CHECK-LABEL: define {{.*}} i32 @vla(ptr {{.*}} %p, i32 {{.*}} %x) +int vla(int* p, int x) { + // CHECK: entry: + // CHECK: %retval = alloca i32, align 4 + // CHECK: %p.addr = alloca ptr, align 8 + // CHECK: %x.addr = alloca i32, align 4 + // CHECK: %cleanup.dest.slot = alloca i32, align 4 + // CHECK: %saved_stack = alloca ptr, align 8 + // CHECK: %__vla_expr0 = alloca i64, align 8 + // CHECK: %saved_stack3 = alloca ptr, align 8 + // CHECK: %__vla_expr1 = alloca i64, align 8 + // CHECK: store ptr %p, ptr %p.addr, align 8 + // CHECK: store i32 %x, ptr %x.addr, align 4 + // CHECK: %0 = load i32, ptr %x.addr, align 4 + // CHECK: %cmp = icmp slt i32 %0, 5 + // CHECK: br i1 %cmp, label %if.then, label %if.end + // CHECK: if.then: + // CHECK: store i32 10, ptr %retval, align 4 + // CHECK: store i32 1, ptr %cleanup.dest.slot, align 4 + // CHECK: br label %cleanup + // CHECK: if.end: + // CHECK: store i32 7, ptr %retval, align 4 + // CHECK: store i32 1, ptr 
%cleanup.dest.slot, align 4 + // CHECK: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: %1 = load i32, ptr %x.addr, align 4 + // CHECK: %2 = zext i32 %1 to i64 + // CHECK: %3 = call ptr @llvm.stacksave.p0() + // CHECK: store ptr %3, ptr %saved_stack, align 8 + // CHECK: %vla = alloca i32, i64 %2, align 16 + // CHECK: store i64 %2, ptr %__vla_expr0, align 8 + // CHECK: %arrayidx = getelementptr inbounds i32, ptr %vla, i64 2 + // CHECK: store i32 4, ptr %arrayidx, align 8 + // CHECK: %arrayidx1 = getelementptr inbounds i32, ptr %vla, i64 2 + // CHECK: %4 = load i32, ptr %arrayidx1, align 8 + // CHECK: %5 = load ptr, ptr %p.addr, align 8 + // CHECK: store i32 %4, ptr %5, align 4 + // CHECK: %6 = load ptr, ptr %saved_stack, align 8 + // CHECK: call void @llvm.stackrestore.p0(ptr %6) + // CHECK: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 + // CHECK: br label %cleanup + // CHECK: cleanup: + // CHECK: %cleanup.dest.saved2 = load i32, ptr %cleanup.dest.slot, align 4 + // CHECK: %7 = load i32, ptr %x.addr, align 4 + // CHECK: %8 = zext i32 %7 to i64 + // CHECK: %9 = call ptr @llvm.stacksave.p0() + // CHECK: store ptr %9, ptr %saved_stack3, align 8 + // CHECK: %vla4 = alloca i32, i64 %8, align 16 + // CHECK: store i64 %8, ptr %__vla_expr1, align 8 + // CHECK: %arrayidx5 = getelementptr inbounds i32, ptr %vla4, i64 2 + // CHECK: store i32 3, ptr %arrayidx5, align 8 + // CHECK: %arrayidx6 = getelementptr inbounds i32, ptr %vla4, i64 2 + // CHECK: %10 = load i32, ptr %arrayidx6, align 8 + // CHECK: %11 = load ptr, ptr %p.addr, align 8 + // CHECK: store i32 %10, ptr %11, align 4 + // CHECK: %12 = load ptr, ptr %saved_stack3, align 8 + // CHECK: call void @llvm.stackrestore.p0(ptr %12) + // CHECK: store i32 %cleanup.dest.saved2, ptr %cleanup.dest.slot, align 4 + // CHECK: %13 = load i32, ptr %retval, align 4 + // CHECK: ret i32 %13 + defer { + int a[x]; + a[2] = 3; + *p = a[2]; + } + if (x < 5) { return 10; } + defer { + int b[x]; + b[2] = 4; + *p = b[2]; + } + return 7; +} + +[[noreturn]] void exit(); +[[noreturn]] void _Exit(); +[[noreturn]] void foobar(); + +// CHECK-LABEL: define {{.*}} i32 @call_exit() +int call_exit() { + // CHECK: entry: + // CHECK: call void @exit() + // CHECK: unreachable + defer x(1); + exit(); +} + +// CHECK-LABEL: define {{.*}} i32 @call__Exit() +int call__Exit() { + // CHECK: entry: + // CHECK: call void @_Exit() + // CHECK: unreachable + defer x(1); + _Exit(); +} + +// CHECK-LABEL: define {{.*}} i32 @call_foobar() +int call_foobar() { + // CHECK: entry: + // CHECK: call void @foobar() + // CHECK: unreachable + defer x(1); + foobar(); +} + +// CHECK-LABEL: define {{.*}} i32 @main() +int main() { + // CHECK: entry: + // CHECK: %retval = alloca i32, align 4 + // CHECK: store i32 0, ptr %retval, align 4 + // CHECK: store i32 5, ptr %retval, align 4 + // CHECK: call void @x(i32 {{.*}} 42) + // CHECK: %0 = load i32, ptr %retval, align 4 + // CHECK: ret i32 %0 + defer x(42); + return 5; +} + +// CHECK-LABEL: define {{.*}} void @t() +// CHECK: entry: +// CHECK-NEXT: %count = alloca i32, align 4 +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: store i32 0, ptr %count, align 4 +// CHECK-NEXT: br label %target +// CHECK: target: +// CHECK-NEXT: %0 = load i32, ptr %count, align 4 +// CHECK-NEXT: %inc = add nsw i32 %0, 1 +// CHECK-NEXT: store i32 %inc, ptr %count, align 4 +// CHECK-NEXT: %1 = load i32, ptr %count, align 4 +// CHECK-NEXT: %cmp = icmp sle i32 %1, 2 +// CHECK-NEXT: br i1 %cmp, label %if.then, label 
%if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @x(i32 {{.*}} 1) +// CHECK-NEXT: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %unreachable [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: i32 2, label %target +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: call void @x(i32 {{.*}} 2) +// CHECK-NEXT: ret void +// CHECK: unreachable: +// CHECK-NEXT: unreachable +void t() { + int count = 0; + + { + target: + _Defer { x(1); } + ++count; + if (count <= 2) { + goto target; + } + } + + x(2); +} + +// CHECK-LABEL: define {{.*}} void @stmt_expr() +// CHECK: entry: +// CHECK-NEXT: %tmp = alloca i32, align 4 +// CHECK-NEXT: call void @x(i32 {{.*}} 1) +// CHECK-NEXT: call void @x(i32 {{.*}} 2) +// CHECK-NEXT: call void @x(i32 {{.*}} 3) +// CHECK-NEXT: call void @x(i32 {{.*}} 4) +// CHECK-NEXT: store i32 6, ptr %tmp, align 4 +// CHECK-NEXT: call void @x(i32 {{.*}} 5) +// CHECK-NEXT: %0 = load i32, ptr %tmp, align 4 +// CHECK-NEXT: call void @x(i32 {{.*}} %0) +// CHECK-NEXT: ret void +void stmt_expr() { + ({ + _Defer x(4); + _Defer ({ + _Defer x(3); + x(2); + }); + x(1); + }); + + x(({ + _Defer x(5); + 6; + })); +} + +// CHECK-LABEL: define {{.*}} void @cleanup_no_insert_point() +// CHECK: entry: +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: br label %while.cond +// CHECK: while.cond: +// CHECK-NEXT: %call = call {{.*}} i1 @q(i32 {{.*}} 1) +// CHECK-NEXT: br i1 %call, label %while.body, label %while.end +// CHECK: while.body: +// CHECK-NEXT: %call1 = call {{.*}} i1 @q(i32 {{.*}} 2) +// CHECK-NEXT: br i1 %call1, label %if.then, label %if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: %call2 = call {{.*}} i1 @q(i32 {{.*}} 3) +// CHECK-NEXT: br i1 %call2, label %if.then3, label %if.end4 +// CHECK: if.then3: +// CHECK-NEXT: store i32 3, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end4: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: call void @noreturn() +// CHECK-NEXT: unreachable +// CHECK: 0: +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %unreachable [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: i32 2, label %while.cond +// CHECK-NEXT: i32 3, label %while.end +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: br label %while.cond +// CHECK: while.end: +// CHECK-NEXT: ret void +// CHECK: unreachable: +// CHECK-NEXT: unreachable +void cleanup_no_insert_point() { + while (q(1)) { + _Defer { + noreturn(); + }; + if (q(2)) continue; + if (q(3)) break; + } +} + +// CHECK-LABEL: define {{.*}} void @cleanup_nested() +// CHECK: entry: +// CHECK-NEXT: %cleanup.dest.slot = alloca i32, align 4 +// CHECK-NEXT: br label %while.cond +// CHECK: while.cond: +// CHECK-NEXT: %call = call {{.*}} i1 @q(i32 {{.*}} 1) +// CHECK-NEXT: 
br i1 %call, label %while.body, label %while.end19 +// CHECK: while.body: +// CHECK-NEXT: %call1 = call {{.*}} i1 @q(i32 {{.*}} 6) +// CHECK-NEXT: br i1 %call1, label %if.then, label %if.end +// CHECK: if.then: +// CHECK-NEXT: store i32 2, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end: +// CHECK-NEXT: %call2 = call {{.*}} i1 @q(i32 {{.*}} 7) +// CHECK-NEXT: br i1 %call2, label %if.then3, label %if.end4 +// CHECK: if.then3: +// CHECK-NEXT: store i32 3, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: if.end4: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup +// CHECK: cleanup: +// CHECK-NEXT: %cleanup.dest.saved = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %while.cond5 +// CHECK: while.cond5: +// CHECK-NEXT: %call6 = call {{.*}} i1 @q(i32 {{.*}} 2) +// CHECK-NEXT: br i1 %call6, label %while.body7, label %while.end +// CHECK: while.body7: +// CHECK-NEXT: %call8 = call {{.*}} i1 @q(i32 {{.*}} 4) +// CHECK-NEXT: br i1 %call8, label %if.then9, label %if.end10 +// CHECK: if.then9: +// CHECK-NEXT: store i32 4, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup14 +// CHECK: if.end10: +// CHECK-NEXT: %call11 = call {{.*}} i1 @q(i32 {{.*}} 5) +// CHECK-NEXT: br i1 %call11, label %if.then12, label %if.end13 +// CHECK: if.then12: +// CHECK-NEXT: store i32 5, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup14 +// CHECK: if.end13: +// CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: br label %cleanup14 +// CHECK: cleanup14: +// CHECK-NEXT: %cleanup.dest.saved15 = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %call16 = call {{.*}} i1 @q(i32 {{.*}} 3) +// CHECK-NEXT: store i32 %cleanup.dest.saved15, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest, label %unreachable [ +// CHECK-NEXT: i32 0, label %cleanup.cont +// CHECK-NEXT: i32 4, label %while.cond5 +// CHECK-NEXT: i32 5, label %while.end +// CHECK-NEXT: ] +// CHECK: cleanup.cont: +// CHECK-NEXT: br label %while.cond5 +// CHECK: while.end: +// CHECK-NEXT: store i32 %cleanup.dest.saved, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: %cleanup.dest17 = load i32, ptr %cleanup.dest.slot, align 4 +// CHECK-NEXT: switch i32 %cleanup.dest17, label %unreachable [ +// CHECK-NEXT: i32 0, label %cleanup.cont18 +// CHECK-NEXT: i32 2, label %while.cond +// CHECK-NEXT: i32 3, label %while.end19 +// CHECK-NEXT: ] +// CHECK: cleanup.cont18: +// CHECK-NEXT: br label %while.cond +// CHECK: while.end19: +// CHECK-NEXT: ret void +// CHECK: unreachable: +// CHECK-NEXT: unreachable +void cleanup_nested() { + while (q(1)) { + _Defer { + while (q(2)) { + _Defer { + q(3); + } + if (q(4)) continue; + if (q(5)) break; + } + }; + if (q(6)) continue; + if (q(7)) break; + } +} diff --git a/clang/test/Lexer/defer-keyword.cpp b/clang/test/Lexer/defer-keyword.cpp new file mode 100644 index 0000000000000..929f2c58f974a --- /dev/null +++ b/clang/test/Lexer/defer-keyword.cpp @@ -0,0 +1,5 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify -fdefer-ts %s + +// expected-no-diagnostics +int _Defer; diff --git a/clang/test/Parser/defer-ts.c b/clang/test/Parser/defer-ts.c new file mode 100644 index 0000000000000..118fe9ee3cc8f --- /dev/null +++ b/clang/test/Parser/defer-ts.c @@ -0,0 +1,58 @@ +// RUN: %clang_cc1 -std=c11 -fsyntax-only -fdefer-ts -verify %s +// 
RUN: %clang_cc1 -std=c23 -fsyntax-only -fdefer-ts -verify %s + +#define defer _Defer + +int g(void); +int h(int x); + +void f1(void) { + defer 1; // expected-warning {{expression result unused}} + defer 1 + 1; // expected-warning {{expression result unused}} + defer "a"; // expected-warning {{expression result unused}} + defer "a" "b" "c"; // expected-warning {{expression result unused}} + defer defer 1; // expected-warning {{expression result unused}} + defer defer defer defer 1; // expected-warning {{expression result unused}} + defer (int) 4; // expected-warning {{expression result unused}} + defer g(); + + defer {} + defer { defer {} } + defer { defer {} defer {} } + + defer if (g()) g(); + defer while (g()) g(); + defer for (int i = 0; i < 10; i++) h(i); + defer switch (g()) { case 1: g(); } + + defer; // expected-warning {{defer statement has empty body}} expected-note {{put the semicolon on a separate line}} + defer + ; + + defer a: g(); // expected-error {{substatement of defer must not be a label}} + defer b: {} // expected-error {{substatement of defer must not be a label}} + defer { c: g(); } + + if (g()) defer g(); + while (g()) defer g(); + defer ({}); + ({ defer g(); }); + + defer int x; // expected-error {{expected expression}} + defer void q() {} // expected-error {{expected expression}} +} + +void f2(void) { + [[some, attributes]] defer g(); // expected-warning 2 {{unknown attribute}} + __attribute__((some_attribute)) defer g(); // expected-warning {{unknown attribute}} + [[some, attributes]] defer { g(); } // expected-warning 2 {{unknown attribute}} + __attribute__((some_attribute)) defer { g(); } // expected-warning {{unknown attribute}} +} + +void f3(void) { + _Defer 1; // expected-warning {{expression result unused}} + _Defer {} + _Defer _Defer {} + _Defer { defer {} _Defer {} } + _Defer if (g()) g(); +} diff --git a/clang/test/Parser/defer-ts.cpp b/clang/test/Parser/defer-ts.cpp new file mode 100644 index 0000000000000..fa25cac8575f6 --- /dev/null +++ b/clang/test/Parser/defer-ts.cpp @@ -0,0 +1,5 @@ +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fdefer-ts -verify %s + +void f() { + _Defer {} // expected-error {{use of undeclared identifier '_Defer'}} +} diff --git a/clang/test/Preprocessor/defer-ts.c b/clang/test/Preprocessor/defer-ts.c new file mode 100644 index 0000000000000..e4995ac9b23ea --- /dev/null +++ b/clang/test/Preprocessor/defer-ts.c @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -fsyntax-only -fdefer-ts -verify=enabled %s +// RUN: %clang_cc1 -fsyntax-only -verify=disabled %s +// RUN: %clang_cc1 -x c++ -fsyntax-only -fdefer-ts -verify=disabled %s +// RUN: %clang_cc1 -x c++ -fsyntax-only -verify=disabled %s +// enabled-no-diagnostics +#if __STDC_DEFER_TS25755__ != 1 +// disabled-error@+1 {{Should have defined __STDC_DEFER_TS25755__ to 1}} +# error Should have defined __STDC_DEFER_TS25755__ to 1 +#endif diff --git a/clang/test/Sema/defer-ts-seh.c b/clang/test/Sema/defer-ts-seh.c new file mode 100644 index 0000000000000..4b773ed3f09a0 --- /dev/null +++ b/clang/test/Sema/defer-ts-seh.c @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -std=c23 -fdefer-ts -fms-compatibility -triple x86_64-windows-msvc -fsyntax-only -verify %s + +void f() { + __try { + _Defer { + __leave; // expected-error {{cannot __leave a defer statement}} + } + } __finally {} + + __try { + _Defer { + __try { + __leave; + } __finally {} + } + } __finally {} +} diff --git a/clang/test/Sema/defer-ts-sjlj.c b/clang/test/Sema/defer-ts-sjlj.c new file mode 100644 index 0000000000000..49230fa721e0f --- /dev/null +++ 
b/clang/test/Sema/defer-ts-sjlj.c @@ -0,0 +1,52 @@ +// RUN: %clang_cc1 -triple x86_64-windows-msvc -std=gnu23 -fdefer-ts -fsyntax-only -fblocks -verify %s + +typedef void** jmp_buf; +typedef void** sigjmp_buf; + +int setjmp(jmp_buf env); +int _setjmp(jmp_buf env); +int sigsetjmp(sigjmp_buf env, int savesigs); +int __sigsetjmp(sigjmp_buf env, int savesigs); +void longjmp(jmp_buf env, int val); +void _longjmp(jmp_buf env, int val); +void siglongjmp(sigjmp_buf env, int val); + +jmp_buf x; +sigjmp_buf y; +void f() { + _Defer { + __builtin_setjmp(x); // expected-error {{cannot use '__builtin_setjmp' inside a defer statement}} + __builtin_longjmp(x, 1); // expected-error {{cannot use '__builtin_longjmp' inside a defer statement}} + setjmp(x); // expected-error {{cannot use 'setjmp' inside a defer statement}} + _setjmp(x); // expected-error {{cannot use '_setjmp' inside a defer statement}} + sigsetjmp(y, 0); // expected-error {{cannot use 'sigsetjmp' inside a defer statement}} + __sigsetjmp(y, 0); // expected-error {{cannot use '__sigsetjmp' inside a defer statement}} + longjmp(x, 0); // expected-error {{cannot use 'longjmp' inside a defer statement}} + _longjmp(x, 0); // expected-error {{cannot use '_longjmp' inside a defer statement}} + siglongjmp(y, 0); // expected-error {{cannot use 'siglongjmp' inside a defer statement}} + + (void) ^{ + __builtin_setjmp(x); + __builtin_longjmp(x, 1); + setjmp(x); + _setjmp(x); + sigsetjmp(y, 0); + __sigsetjmp(y, 0); + longjmp(x, 0); + _longjmp(x, 0); + siglongjmp(y, 0); + + _Defer { + __builtin_setjmp(x); // expected-error {{cannot use '__builtin_setjmp' inside a defer statement}} + __builtin_longjmp(x, 1); // expected-error {{cannot use '__builtin_longjmp' inside a defer statement}} + setjmp(x); // expected-error {{cannot use 'setjmp' inside a defer statement}} + _setjmp(x); // expected-error {{cannot use '_setjmp' inside a defer statement}} + sigsetjmp(y, 0); // expected-error {{cannot use 'sigsetjmp' inside a defer statement}} + __sigsetjmp(y, 0); // expected-error {{cannot use '__sigsetjmp' inside a defer statement}} + longjmp(x, 0); // expected-error {{cannot use 'longjmp' inside a defer statement}} + _longjmp(x, 0); // expected-error {{cannot use '_longjmp' inside a defer statement}} + siglongjmp(y, 0); // expected-error {{cannot use 'siglongjmp' inside a defer statement}} + } + }; + } +} diff --git a/clang/test/Sema/defer-ts.c b/clang/test/Sema/defer-ts.c new file mode 100644 index 0000000000000..95c68fa213eaa --- /dev/null +++ b/clang/test/Sema/defer-ts.c @@ -0,0 +1,172 @@ +// RUN: %clang_cc1 -std=c23 -fdefer-ts -fsyntax-only -verify %s + +#define defer _Defer + +void a(); + +void f1() { + defer { + goto l1; + l1: + } + + defer { + l2: + goto l2; + } +} + +void f2() { + goto l1; // expected-error {{cannot jump from this goto statement to its label}} + defer { // expected-note {{jump enters a defer statement}} + l1: + } + + goto l2; // expected-error {{cannot jump from this goto statement to its label}} + defer {} // expected-note {{jump bypasses defer statement}} + l2: +} + +void f3() { + x: + defer { // expected-note {{jump exits a defer statement}} + goto x; // expected-error {{cannot jump from this goto statement to its label}} + } +} + +void f4() { + defer { // expected-note {{jump exits a defer statement}} + goto y; // expected-error {{cannot jump from this goto statement to its label}} + } + y: +} + +void f5() { + defer { // expected-note {{jump enters a defer statement}} + l2: + } + goto l2; // expected-error {{cannot jump from this goto 
statement to its label}} +} + +void f6() { + goto b; // expected-error {{cannot jump from this goto statement to its label}} + { + defer {} // expected-note {{jump bypasses defer statement}} + b: + } + + { + defer {} // expected-note {{jump bypasses defer statement}} + b2: + } + goto b2; // expected-error {{cannot jump from this goto statement to its label}} +} + +void f7() { + defer { // expected-note {{jump bypasses defer statement}} + goto cross1; // expected-error {{cannot jump from this goto statement to its label}} + cross2: + } + defer { // expected-note {{jump exits a defer statement}} expected-note {{jump enters a defer statement}} + goto cross2; // expected-error {{cannot jump from this goto statement to its label}} + cross1: + } +} + +void f8() { + defer { + return; // expected-error {{cannot return from a defer statement}} + } + + { + defer { + return; // expected-error {{cannot return from a defer statement}} + } + } + + switch (1) { + case 1: defer { + break; // expected-error {{cannot break out of a defer statement}} + } + } + + for (;;) { + defer { + break; // expected-error {{cannot break out of a defer statement}} + } + } + + for (;;) { + defer { + continue; // expected-error {{cannot continue loop outside of enclosing defer statement}} + } + } + + switch (1) { + defer {} // expected-note {{jump bypasses defer statement}} + default: // expected-error {{cannot jump from switch statement to this case label}} + defer {} + break; + } + + switch (1) { + case 1: { + defer { // expected-note {{jump enters a defer statement}} + case 2: {} // expected-error {{cannot jump from switch statement to this case label}} + } + } + } + + switch (1) { + case 1: defer { + switch (2) { case 2: break; } + } + } + + for (;;) { + defer { for (;;) break; } + } + + for (;;) { + defer { for (;;) continue; } + } +} + +void f9() { + { + defer {} + goto l1; + } + l1: + + { + goto l2; + defer {} + } + l2: + + { + { defer {} } + goto l3; + } + l3: + + { + defer {} + { goto l4; } + } + l4: +} + +void f10(int i) { + switch (i) { + defer case 12: break; // expected-error {{cannot break out of a defer statement}} \ + expected-error {{cannot jump from switch statement to this case label}} \ + expected-note {{jump enters a defer statement}} \ + expected-note {{jump bypasses defer statement}} + + defer default: break; // expected-error {{cannot break out of a defer statement}} \ + expected-error {{cannot jump from switch statement to this case label}} \ + expected-note {{jump enters a defer statement}} + } +} diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index 0a43d73063c1f..c49ca567049c7 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -224,6 +224,11 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, K = CXCursor_ReturnStmt; break; + // Not exposed for now because '_Defer' is currently just a TS. + case Stmt::DeferStmtClass: + K = CXCursor_UnexposedStmt; + break; + case Stmt::GCCAsmStmtClass: K = CXCursor_GCCAsmStmt; break; From 426cedccd38502babad106fb0a9183750fae3092 Mon Sep 17 00:00:00 2001 From: hev Date: Thu, 11 Dec 2025 13:31:03 +0800 Subject: [PATCH 17/49] [LoongArch] Add support for the ud macro instruction (#171583) This patch adds support for the `ud ui5` macro instruction. The `ui5` operand must be inthe range `0-31`. The macro expands to: `amswap.w $rd, $r1, $rj` where `ui5` specifies the register number used for `$rd` in the expanded instruction, and `$rd` is the same as `$rj`. 
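For illustration (this mirrors the encoding checks added to
llvm/test/MC/LoongArch/Basic/Integer/misc.s in this patch), `ud 0` assembles to
the same bits as the `amswap.w` form it expands to, and the disassembler prints
the `ud` form whenever `rk == 1` and `rd == rj`:

    ud 0                          # encoding: [0x00,0x04,0x60,0x38]
    amswap.w $zero, $ra, $zero    # same encoding; disassembles as 'ud 0'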
Relevant binutils patch: https://sourceware.org/pipermail/binutils/2025-December/146042.html --- .../Disassembler/LoongArchDisassembler.cpp | 23 +++++++++++++++++++ .../Target/LoongArch/LoongArchInstrFormats.td | 13 +++++++++++ .../Target/LoongArch/LoongArchInstrInfo.td | 17 ++++++++------ llvm/test/CodeGen/LoongArch/trap.ll | 2 +- llvm/test/MC/LoongArch/Basic/Integer/misc.s | 14 ++++++++++- 5 files changed, 60 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp index d4058fac4304a..584b45b4111cd 100644 --- a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp +++ b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp @@ -157,6 +157,29 @@ static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm, return MCDisassembler::Success; } +// Decode AMSWAP.W and UD, which share the same base encoding. +// If rk == 1 and rd == rj, interpret the instruction as UD; +// otherwise decode as AMSWAP.W. +static DecodeStatus DecodeAMOrUDInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + unsigned Rd = fieldFromInstruction(Insn, 0, 5); + unsigned Rj = fieldFromInstruction(Insn, 5, 5); + unsigned Rk = fieldFromInstruction(Insn, 10, 5); + + if (Rk == 1 && Rd == Rj) { + Inst.setOpcode(LoongArch::UD); + Inst.addOperand(MCOperand::createImm(Rd)); + } else { + Inst.setOpcode(LoongArch::AMSWAP_W); + Inst.addOperand(MCOperand::createReg(LoongArch::R0 + Rd)); + Inst.addOperand(MCOperand::createReg(LoongArch::R0 + Rk)); + Inst.addOperand(MCOperand::createReg(LoongArch::R0 + Rj)); + } + + return MCDisassembler::Success; +} + #include "LoongArchGenDisassemblerTables.inc" DecodeStatus LoongArchDisassembler::getInstruction(MCInst &MI, uint64_t &Size, diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td index 419e20431c59f..fa049fcbc2d21 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td @@ -401,3 +401,16 @@ class FmtLDPTE pattern = []> let Inst{9-5} = rj; let Inst{4-0} = 0b00000; } + +// FmtUD +// <0b0011100001100000000001 | I5 | I5> +class FmtUD pattern = []> + : LAInst.ret, opnstr, pattern> { + bits<5> imm5; + + let Inst{31-10} = 0b0011100001100000000001; + let Inst{9-5} = imm5; + let Inst{4-0} = imm5; + + let DecoderMethod = "DecodeAMOrUDInstruction"; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 2e6653e1a09ac..d971f8bc1986b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -800,6 +800,10 @@ class AM_3R op> let Constraints = "@earlyclobber $rd"; } +class AU_3R op> : AM_3R { + let DecoderMethod = "DecodeAMOrUDInstruction"; +} + class AMCAS_3R op> : Fmt3R { @@ -923,6 +927,9 @@ def BREAK : MISC_I15<0x002a0000>; def RDTIMEL_W : RDTIME_2R<0x00006000>; def RDTIMEH_W : RDTIME_2R<0x00006400>; +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +def UD : FmtUD<(outs), (ins uimm5:$imm5), "$imm5">; + // The CPUCFG instruction offers a reliable way to probing CPU features. 
// Although support is not guaranteed on LA32R, having compiler support // nevertheless enables applications to rely on its presence, potentially @@ -1087,7 +1094,7 @@ def STLE_D : STORE_3R<0x387f8000>; // Atomic Memory Access Instructions for 64-bits def AMSWAP_B : AM_3R<0x385c0000>; def AMSWAP_H : AM_3R<0x385c8000>; -def AMSWAP_W : AM_3R<0x38600000>; +def AMSWAP_W : AU_3R<0x38600000>; def AMSWAP_D : AM_3R<0x38608000>; def AMADD_B : AM_3R<0x385d0000>; def AMADD_H : AM_3R<0x385d8000>; @@ -1410,12 +1417,8 @@ def : Pat<(and GPR:$rj, BstrinsImm:$imm), /// Traps -// We lower `trap` to `amswap.w rd:$r0, rk:$r1, rj:$r0`, as this is guaranteed -// to trap with an INE (non-existent on LA32, explicitly documented to INE on -// LA64). And the resulting signal is different from `debugtrap` like on some -// other existing ports so programs/porters might have an easier time. -def PseudoUNIMP : Pseudo<(outs), (ins), [(trap)]>, - PseudoInstExpansion<(AMSWAP_W R0, R1, R0)>; +// We lower `trap` to `ud 0`, which is an alias for `amswap.w $r0, $r1, $r0`. +def PseudoUNIMP : Pseudo<(outs), (ins), [(trap)]>, PseudoInstExpansion<(UD 0)>; // We lower `debugtrap` to `break 0`, as this is guaranteed to exist and work, // even for LA32 Primary. Also, because so far the ISA does not provide a diff --git a/llvm/test/CodeGen/LoongArch/trap.ll b/llvm/test/CodeGen/LoongArch/trap.ll index 15a7ad82bd7a8..d433266b47e47 100644 --- a/llvm/test/CodeGen/LoongArch/trap.ll +++ b/llvm/test/CodeGen/LoongArch/trap.ll @@ -10,7 +10,7 @@ declare void @llvm.debugtrap() define void @test_trap() nounwind { ; CHECK-LABEL: test_trap: ; CHECK: # %bb.0: -; CHECK-NEXT: amswap.w $zero, $ra, $zero +; CHECK-NEXT: ud 0 ; CHECK-NEXT: ret tail call void @llvm.trap() ret void diff --git a/llvm/test/MC/LoongArch/Basic/Integer/misc.s b/llvm/test/MC/LoongArch/Basic/Integer/misc.s index 182d1da9b237e..26a9205d8e17d 100644 --- a/llvm/test/MC/LoongArch/Basic/Integer/misc.s +++ b/llvm/test/MC/LoongArch/Basic/Integer/misc.s @@ -7,7 +7,7 @@ # RUN: llvm-mc %s --triple=loongarch32 --filetype=obj | llvm-objdump -d - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # RUN: llvm-mc %s --triple=loongarch64 --filetype=obj --defsym=LA64=1 | llvm-objdump -d - \ -# RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ,CHECK64-ASM-AND-OBJ %s +# RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ,CHECK64-OBJ,CHECK64-ASM-AND-OBJ %s ############################################################# ## Instructions for both loongarch32 and loongarch64 @@ -33,6 +33,13 @@ rdtimeh.w $a7, $a1 # CHECK-ASM: encoding: [0x03,0x6d,0x00,0x00] cpucfg $sp, $a4 +# CHECK-ASM-AND-OBJ: ud 0 +# CHECK-ASM: encoding: [0x00,0x04,0x60,0x38] +ud 0 + +# CHECK-ASM-AND-OBJ: ud 31 +# CHECK-ASM: encoding: [0xff,0x07,0x60,0x38] +ud 31 ############################################################# ## Instructions only for loongarch64 @@ -40,6 +47,11 @@ cpucfg $sp, $a4 .ifdef LA64 +# CHECK64-OBJ: ud 0 +# CHECK64-ASM: amswap.w $zero, $ra, $zero +# CHECK64-ASM: encoding: [0x00,0x04,0x60,0x38] +amswap.w $zero, $ra, $zero + # CHECK64-ASM-AND-OBJ: asrtle.d $t0, $t5 # CHECK64-ASM: encoding: [0x80,0x45,0x01,0x00] asrtle.d $t0, $t5 From 4f9d5a8bc85431b722e6f90744f3683adffc17b4 Mon Sep 17 00:00:00 2001 From: Sudharsan Veeravalli Date: Thu, 11 Dec 2025 11:04:04 +0530 Subject: [PATCH 18/49] [RISCV] Generate Xqcilsm LWMI/SWMI load/store multiple instructions (#171079) This patch adds support for generating the Xqcilsm load/store multiple instructions as a part of the RISCVLoadStoreOptimizer pass. 
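As a sketch of the transformation (input and expected output taken from the new
MIR test added in this patch), two word-sized loads from consecutive
4-byte-aligned offsets off the same base register are rewritten into a single
`QC_LWMI`:

    ; before
    $x12 = LW $x10, 0 :: (load (s32), align 4)
    $x13 = LW $x10, 4 :: (load (s32), align 4)
    ; after
    $x12 = QC_LWMI $x10, 2, 0, implicit-def $x13 :: (load (s32))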
For now we only combine two load/store instructions into a load/store multiple. Support for converting more loads/stores will be added in follow-up patches. These instructions are only applicable for 32-bit loads/stores with an alignment of 4-bytes. --- .../Target/RISCV/RISCVLoadStoreOptimizer.cpp | 122 ++++++- llvm/test/CodeGen/RISCV/xqcilsm-lwmi-swmi.mir | 315 ++++++++++++++++++ 2 files changed, 427 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/xqcilsm-lwmi-swmi.mir diff --git a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp index a22ab6bfc04b8..f1827dcf174f3 100644 --- a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp @@ -70,6 +70,12 @@ struct RISCVLoadStoreOpt : public MachineFunctionPass { // Convert load/store pairs to single instructions. bool tryConvertToLdStPair(MachineBasicBlock::iterator First, MachineBasicBlock::iterator Second); + bool tryConvertToXqcilsmLdStPair(MachineFunction *MF, + MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Second); + bool tryConvertToMIPSLdStPair(MachineFunction *MF, + MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Second); // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. @@ -114,7 +120,7 @@ bool RISCVLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { ModifiedRegUnits.init(*TRI); UsedRegUnits.init(*TRI); - if (Subtarget.useMIPSLoadStorePairs()) { + if (Subtarget.useMIPSLoadStorePairs() || Subtarget.hasVendorXqcilsm()) { for (MachineBasicBlock &MBB : Fn) { LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n"); @@ -168,14 +174,93 @@ bool RISCVLoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { return false; } -// Merge two adjacent load/store instructions into a paired instruction -// (LDP/SDP/SWP/LWP) if the effective address is 8-byte aligned in case of -// SWP/LWP 16-byte aligned in case of LDP/SDP. This function selects the -// appropriate paired opcode, verifies that the memory operand is properly -// aligned, and checks that the offset is valid. If all conditions are met, it -// builds and inserts the paired instruction. -bool RISCVLoadStoreOpt::tryConvertToLdStPair( - MachineBasicBlock::iterator First, MachineBasicBlock::iterator Second) { +bool RISCVLoadStoreOpt::tryConvertToXqcilsmLdStPair( + MachineFunction *MF, MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Second) { + unsigned Opc = First->getOpcode(); + if ((Opc != RISCV::LW && Opc != RISCV::SW) || Second->getOpcode() != Opc) + return false; + + const auto &FirstOp1 = First->getOperand(1); + const auto &SecondOp1 = Second->getOperand(1); + const auto &FirstOp2 = First->getOperand(2); + const auto &SecondOp2 = Second->getOperand(2); + + // Require simple reg+imm addressing for both. 
+ if (!FirstOp1.isReg() || !SecondOp1.isReg() || !FirstOp2.isImm() || + !SecondOp2.isImm()) + return false; + + Register Base1 = FirstOp1.getReg(); + Register Base2 = SecondOp1.getReg(); + + if (Base1 != Base2) + return false; + + const MachineMemOperand *MMO = *First->memoperands_begin(); + Align MMOAlign = MMO->getAlign(); + + if (MMOAlign < Align(4)) + return false; + + auto &FirstOp0 = First->getOperand(0); + auto &SecondOp0 = Second->getOperand(0); + + int64_t Off1 = FirstOp2.getImm(); + int64_t Off2 = SecondOp2.getImm(); + + if (Off2 < Off1) { + std::swap(FirstOp0, SecondOp0); + std::swap(Off1, Off2); + } + + Register StartReg = FirstOp0.getReg(); + Register NextReg = SecondOp0.getReg(); + + if (StartReg == RISCV::X0 || NextReg == RISCV::X0) + return false; + + // If the base reg gets overwritten by one of the loads then bail out. + if (Opc == RISCV::LW && (StartReg == Base1 || NextReg == Base1)) + return false; + + if (!isShiftedUInt<5, 2>(Off1) || (Off2 - Off1 != 4)) + return false; + + if (NextReg != StartReg + 1) + return false; + + unsigned XqciOpc = (Opc == RISCV::LW) ? RISCV::QC_LWMI : RISCV::QC_SWMI; + + auto StartRegState = (Opc == RISCV::LW) ? RegState::Define + : getKillRegState(FirstOp0.isKill()); + auto NextRegState = + (Opc == RISCV::LW) + ? RegState::ImplicitDefine + : (RegState::Implicit | getKillRegState(SecondOp0.isKill())); + + DebugLoc DL = + First->getDebugLoc() ? First->getDebugLoc() : Second->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*MF, DL, TII->get(XqciOpc)); + MIB.addReg(StartReg, StartRegState) + .addReg(Base1, getKillRegState(FirstOp1.isKill() || SecondOp1.isKill())) + .addImm(2) + .addImm(Off1) + .cloneMergedMemRefs({&*First, &*Second}) + .addReg(NextReg, NextRegState); + + First->getParent()->insert(First, MIB); + First->removeFromParent(); + Second->removeFromParent(); + + return true; +} + +bool RISCVLoadStoreOpt::tryConvertToMIPSLdStPair( + MachineFunction *MF, MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Second) { + // Try converting to SWP/LWP/LDP/SDP. + // SWP/LWP requires 8-byte alignment whereas LDP/SDP needs 16-byte alignment. unsigned PairOpc; Align RequiredAlignment; switch (First->getOpcode()) { @@ -199,7 +284,6 @@ bool RISCVLoadStoreOpt::tryConvertToLdStPair( break; } - MachineFunction *MF = First->getMF(); const MachineMemOperand *MMO = *First->memoperands_begin(); Align MMOAlign = MMO->getAlign(); @@ -227,6 +311,24 @@ bool RISCVLoadStoreOpt::tryConvertToLdStPair( return true; } +// Merge two adjacent load/store instructions into a paired instruction. +// This function calls the vendor specific implementation that seelects the +// appropriate paired opcode, verifies that the memory operand is properly +// aligned, and checks that the offset is valid. If all conditions are met, it +// builds and inserts the paired instruction. +bool RISCVLoadStoreOpt::tryConvertToLdStPair( + MachineBasicBlock::iterator First, MachineBasicBlock::iterator Second) { + MachineFunction *MF = First->getMF(); + const RISCVSubtarget &STI = MF->getSubtarget(); + + // Try converting to QC_LWMI/QC_SWMI if the XQCILSM extension is enabled. + if (!STI.is64Bit() && STI.hasVendorXqcilsm()) + return tryConvertToXqcilsmLdStPair(MF, First, Second); + + // Else try to convert them into MIPS Paired Loads/Stores. 
+ return tryConvertToMIPSLdStPair(MF, First, Second); +} + static bool mayAlias(MachineInstr &MIa, SmallVectorImpl &MemInsns, AliasAnalysis *AA) { diff --git a/llvm/test/CodeGen/RISCV/xqcilsm-lwmi-swmi.mir b/llvm/test/CodeGen/RISCV/xqcilsm-lwmi-swmi.mir new file mode 100644 index 0000000000000..396f67326a7ca --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcilsm-lwmi-swmi.mir @@ -0,0 +1,315 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm -run-pass=riscv-load-store-opt %s -o - | FileCheck %s + +--- | + + define void @pair_two_lw_into_qc_lwmi() nounwind { ret void } + define void @pair_two_lw_into_qc_lwmi_reversed() nounwind { ret void } + define void @pair_two_sw_into_qc_swmi_reversed() nounwind { ret void } + define void @no_pair_if_different_base_regs() nounwind { ret void } + define void @no_pair_if_alignment_lt_4() nounwind { ret void } + define void @pair_two_sw_into_qc_swmi() nounwind { ret void } + define void @no_pair_if_misaligned() nounwind { ret void } + define void @pair_at_upper_boundary_lw() nounwind { ret void } + define void @pair_at_upper_boundary_sw() nounwind { ret void } + define void @no_pair_if_offset_out_of_range_lw() nounwind { ret void } + define void @no_pair_if_offset_out_of_range_sw() nounwind { ret void } + define void @no_pair_if_non_consecutive_regs() nounwind { ret void } + define void @no_pair_if_rd_is_x0() nounwind { ret void } + define void @no_pair_if_lw_rd_equals_base() nounwind { ret void } + define void @pair_if_not_adjacent() nounwind { ret void } + define void @pair_if_not_adjacent_use() nounwind { ret void } + define void @no_pair_if_not_adjacent_use() nounwind { ret void } +--- +name: pair_two_lw_into_qc_lwmi +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: pair_two_lw_into_qc_lwmi + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = QC_LWMI $x10, 2, 0, implicit-def $x13 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 0 :: (load (s32), align 4) + $x13 = LW $x10, 4 :: (load (s32), align 4) + PseudoRET + +... +--- +# FIXME: Kill flags are not propagated correctly for the base register +name: pair_two_lw_into_qc_lwmi_reversed +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: pair_two_lw_into_qc_lwmi_reversed + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = QC_LWMI $x10, 2, 0, implicit-def $x13 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x13 = LW $x10, 4 :: (load (s32)) + $x12 = LW killed $x10, 0 :: (load (s32)) + PseudoRET + +... +--- +name: pair_two_sw_into_qc_swmi_reversed +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x12, $x13 + ; CHECK-LABEL: name: pair_two_sw_into_qc_swmi_reversed + ; CHECK: liveins: $x10, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: QC_SWMI killed $x12, $x10, 2, 0, implicit killed $x13 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW killed $x13, $x10, 4 :: (store (s32)) + SW killed $x12, $x10, 0 :: (store (s32)) + PseudoRET + +... +--- +name: no_pair_if_different_base_regs +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x11 + ; CHECK-LABEL: name: no_pair_if_different_base_regs + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = LW $x10, 0 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x11, 4 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 0 :: (load (s32)) + $x13 = LW $x11, 4 :: (load (s32)) + PseudoRET + +... 
+--- +name: no_pair_if_alignment_lt_4 +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_alignment_lt_4 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = LW $x10, 0 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x10, 3 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 0 :: (load (s32)) + $x13 = LW $x10, 3 :: (load (s32)) + PseudoRET + +... +--- +name: pair_two_sw_into_qc_swmi +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x12, $x13 + ; CHECK-LABEL: name: pair_two_sw_into_qc_swmi + ; CHECK: liveins: $x10, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: QC_SWMI killed $x12, $x10, 2, 0, implicit killed $x13 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW killed $x12, $x10, 0 :: (store (s32), align 4) + SW killed $x13, $x10, 4 :: (store (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_misaligned +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_misaligned + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = LW $x10, 2 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x10, 6 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 2 :: (load (s32), align 4) + $x13 = LW $x10, 6 :: (load (s32), align 4) + PseudoRET + +... +--- +# FIXME: Kill flags are not propagated correctly for the base register +name: pair_at_upper_boundary_lw +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: pair_at_upper_boundary_lw + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = QC_LWMI $x10, 2, 124, implicit-def $x13 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 124 :: (load (s32), align 4) + $x13 = LW killed $x10, 128 :: (load (s32), align 4) + PseudoRET + +... +--- +# FIXME: Kill flags are not propagated correctly for the base register +name: pair_at_upper_boundary_sw +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x12, $x13 + ; CHECK-LABEL: name: pair_at_upper_boundary_sw + ; CHECK: liveins: $x10, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: QC_SWMI $x12, $x10, 2, 124, implicit $x13 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW $x12, $x10, 124 :: (store (s32), align 4) + SW $x13, killed $x10, 128 :: (store (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_offset_out_of_range_lw +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_offset_out_of_range_lw + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x12 = LW $x10, 128 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x10, 132 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x12 = LW $x10, 128 :: (load (s32), align 4) + $x13 = LW $x10, 132 :: (load (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_offset_out_of_range_sw +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x12, $x13 + ; CHECK-LABEL: name: no_pair_if_offset_out_of_range_sw + ; CHECK: liveins: $x10, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SW $x12, $x10, 128 :: (store (s32)) + ; CHECK-NEXT: SW $x13, $x10, 132 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW $x12, $x10, 128 :: (store (s32), align 4) + SW $x13, $x10, 132 :: (store (s32), align 4) + PseudoRET + +... 
+--- +name: no_pair_if_non_consecutive_regs +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_non_consecutive_regs + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x11 = LW $x10, 0 :: (load (s32)) + ; CHECK-NEXT: $x13 = LW $x10, 4 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x11 = LW $x10, 0 :: (load (s32), align 4) + $x13 = LW $x10, 4 :: (load (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_rd_is_x0 +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_rd_is_x0 + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x0 = LW $x10, 0 :: (load (s32)) + ; CHECK-NEXT: $x1 = LW $x10, 4 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x0 = LW $x10, 0 :: (load (s32), align 4) + $x1 = LW $x10, 4 :: (load (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_lw_rd_equals_base +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: no_pair_if_lw_rd_equals_base + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x10 = LW $x10, 20 :: (load (s32)) + ; CHECK-NEXT: $x11 = LW $x10, 24 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x10 = LW $x10, 20 :: (load (s32), align 4) + $x11 = LW $x10, 24 :: (load (s32), align 4) + PseudoRET + +... +--- +# FIXME: Kill flags are not propagated correctly for the base register +name: pair_if_not_adjacent +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10 + ; CHECK-LABEL: name: pair_if_not_adjacent + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x1 = QC_LWMI $x10, 2, 20, implicit-def $x2 :: (load (s32)) + ; CHECK-NEXT: $x3 = ADDI $x1, 10 + ; CHECK-NEXT: PseudoRET + $x1 = LW $x10, 20 :: (load (s32), align 4) + $x3 = ADDI $x1, 10 + $x2 = LW killed $x10, 24 :: (load (s32), align 4) + PseudoRET + +... +--- +name: pair_if_not_adjacent_use +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x1, $x2 + ; CHECK-LABEL: name: pair_if_not_adjacent_use + ; CHECK: liveins: $x10, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x2 = ADDI $x2, 10 + ; CHECK-NEXT: QC_SWMI $x1, $x10, 2, 20, implicit $x2 :: (store (s32)) + ; CHECK-NEXT: PseudoRET + SW $x1, $x10, 20 :: (store (s32), align 4) + $x2 = ADDI $x2, 10 + SW $x2, $x10, 24 :: (store (s32), align 4) + PseudoRET + +... +--- +name: no_pair_if_not_adjacent_use +tracksRegLiveness: false +body: | + bb.0: + liveins: $x10, $x2 + ; CHECK-LABEL: name: no_pair_if_not_adjacent_use + ; CHECK: liveins: $x10, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x1 = LW $x10, 20 :: (load (s32)) + ; CHECK-NEXT: $x1 = ADDI $x1, 10 + ; CHECK-NEXT: SW $x2, $x10, 40 :: (store (s32)) + ; CHECK-NEXT: $x2 = LW $x10, 24 :: (load (s32)) + ; CHECK-NEXT: PseudoRET + $x1 = LW $x10, 20 :: (load (s32), align 4) + $x1 = ADDI $x1, 10 + SW $x2, $x10, 40 :: (store (s32), align 4) + $x2 = LW $x10, 24 :: (load (s32), align 4) + PseudoRET + +... 
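The MIR tests above pin down when riscv-load-store-opt is expected to pair two word accesses into a single Xqcilsm QC_LWMI/QC_SWMI: a shared base register, 4-byte aligned accesses at consecutive offsets, consecutive destination registers, and a leading offset no larger than 124. As a rough sketch only (the struct, function name, and lowering are assumptions for illustration, not taken from the patch), the hypothetical C snippet below shows a source pattern whose riscv32 lowering can produce two adjacent word loads at offsets 0 and 4 from one base pointer, the shape modeled by pair_two_lw_into_qc_lwmi; whether they are actually paired still depends on register allocation handing out consecutive destination registers.

/* Hypothetical illustration, assuming a riscv32 target built with
 * -mattr=+experimental-xqcilsm. Two 4-byte aligned loads from
 * consecutive offsets of the same base pointer are candidates for
 * pairing into QC_LWMI, mirroring pair_two_lw_into_qc_lwmi above. */
struct pair {
  int a; /* offset 0 */
  int b; /* offset 4 */
};

int sum_fields(const struct pair *p) {
  /* Typically lowers to an LW from p+0 and an LW from p+4 before the
   * pairing pass runs. */
  return p->a + p->b;
}

The no_pair_* tests then cover the cases where the rewrite must not fire, such as different base registers, under-aligned accesses, non-consecutive destination registers, a first offset beyond 124, or a destination register that aliases the base.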
From cb4b6ad8171e9f3dd991b0d7fa1771bcafc3c0a6 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Thu, 11 Dec 2025 06:53:50 +0000 Subject: [PATCH 19/49] [CIR] Add the ability to detect if SwitchOp covers all the cases (#171246) --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 17 +++++-- clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 2 + clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 38 ---------------- clang/test/CIR/CodeGen/atomic.c | 6 +-- clang/test/CIR/CodeGen/switch.cpp | 48 ++++++++++++++------ clang/test/CIR/CodeGen/switch_flat_op.cpp | 2 +- clang/test/CIR/IR/switch.cir | 32 ++++++++++++- clang/test/CIR/Transforms/switch-fold.cir | 10 ++-- 8 files changed, 90 insertions(+), 65 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index c4ad6a0c0732c..74e0860762ec6 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -1080,6 +1080,12 @@ def CIR_SwitchOp : CIR_Op<"switch", [ conditionally executing multiple regions of code. The operand to an switch is an integral condition value. + Besides taking an integer condition and CIR regions, it also accepts an + `all_enum_cases_covered` attribute indicating whether all enum cases are + handled by the operation. Note that the presence of a default CaseOp does + not imply `all_enum_cases_covered`. The original AST switch must explicitly list + every enum case. + The set of `cir.case` operations and their enclosing `cir.switch` represent the semantics of a C/C++ switch statement. Users can use `collectCases(llvm::SmallVector &cases)` to collect the `cir.case` @@ -1206,7 +1212,10 @@ def CIR_SwitchOp : CIR_Op<"switch", [ ``` }]; - let arguments = (ins CIR_IntType:$condition); + let arguments = (ins + CIR_IntType:$condition, + UnitAttr:$allEnumCasesCovered + ); let regions = (region AnyRegion:$body); @@ -1217,9 +1226,9 @@ def CIR_SwitchOp : CIR_Op<"switch", [ ]; let assemblyFormat = [{ - custom( - $body, $condition, type($condition) - ) + `(` $condition `:` qualified(type($condition)) `)` + (`allEnumCasesCovered` $allEnumCasesCovered^)? 
+ $body attr-dict }]; diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index f13e7cb32c71e..b7bd405bf4df4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -1105,6 +1105,8 @@ mlir::LogicalResult CIRGenFunction::emitSwitchStmt(const clang::SwitchStmt &s) { terminateBody(builder, caseOp.getCaseRegion(), caseOp.getLoc()); terminateBody(builder, swop.getBody(), swop.getLoc()); + swop.setAllEnumCasesCovered(s.isAllEnumCasesCovered()); + return res; } diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 8077dc6597047..d888fdcf081e7 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -1359,44 +1359,6 @@ void cir::CaseOp::build(OpBuilder &builder, OperationState &result, // SwitchOp //===----------------------------------------------------------------------===// -static ParseResult parseSwitchOp(OpAsmParser &parser, mlir::Region ®ions, - mlir::OpAsmParser::UnresolvedOperand &cond, - mlir::Type &condType) { - cir::IntType intCondType; - - if (parser.parseLParen()) - return mlir::failure(); - - if (parser.parseOperand(cond)) - return mlir::failure(); - if (parser.parseColon()) - return mlir::failure(); - if (parser.parseCustomTypeWithFallback(intCondType)) - return mlir::failure(); - condType = intCondType; - - if (parser.parseRParen()) - return mlir::failure(); - if (parser.parseRegion(regions, /*arguments=*/{}, /*argTypes=*/{})) - return failure(); - - return mlir::success(); -} - -static void printSwitchOp(OpAsmPrinter &p, cir::SwitchOp op, - mlir::Region &bodyRegion, mlir::Value condition, - mlir::Type condType) { - p << "("; - p << condition; - p << " : "; - p.printStrippedAttrOrType(condType); - p << ")"; - - p << ' '; - p.printRegion(bodyRegion, /*printEntryBlockArgs=*/false, - /*printBlockTerminators=*/true); -} - void cir::SwitchOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl ®ion) { if (!point.isParent()) { diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c index 64e0961fe20d9..5fbbee0e88a15 100644 --- a/clang/test/CIR/CodeGen/atomic.c +++ b/clang/test/CIR/CodeGen/atomic.c @@ -1143,7 +1143,7 @@ int atomic_load_dynamic_order(int *ptr, int order) { // CIR: %[[PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr // CIR-NEXT: %[[ORDER:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i - // CIR-NEXT: cir.switch (%[[ORDER]] : !s32i) { + // CIR-NEXT: cir.switch(%[[ORDER]] : !s32i) { // CIR-NEXT: cir.case(default, []) { // CIR-NEXT: %[[RES:.+]] = cir.load align(4) syncscope(system) atomic(relaxed) %[[PTR]] : !cir.ptr, !s32i // CIR-NEXT: cir.store align(4) %[[RES]], %[[RES_SLOT:.+]] : !s32i, !cir.ptr @@ -1219,7 +1219,7 @@ void atomic_store_dynamic_order(int *ptr, int order) { // CIR: %[[PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr // CIR-NEXT: %[[ORDER:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i - // CIR: cir.switch (%[[ORDER]] : !s32i) { + // CIR: cir.switch(%[[ORDER]] : !s32i) { // CIR-NEXT: cir.case(default, []) { // CIR-NEXT: %[[VALUE:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i // CIR-NEXT: cir.store align(4) atomic(relaxed) %[[VALUE]], %[[PTR]] : !s32i, !cir.ptr @@ -1288,7 +1288,7 @@ int atomic_load_and_store_dynamic_order(int *ptr, int order) { // CIR: %[[PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr // CIR-NEXT: %[[ORDER:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i - // CIR: cir.switch (%[[ORDER]] : !s32i) { 
+ // CIR: cir.switch(%[[ORDER]] : !s32i) { // CIR-NEXT: cir.case(default, []) { // CIR-NEXT: %[[LIT:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i // CIR-NEXT: %[[RES:.+]] = cir.atomic.xchg relaxed %[[PTR]], %[[LIT]] : (!cir.ptr, !s32i) -> !s32i diff --git a/clang/test/CIR/CodeGen/switch.cpp b/clang/test/CIR/CodeGen/switch.cpp index 3824be0d08c2f..b7bd2da5e39b8 100644 --- a/clang/test/CIR/CodeGen/switch.cpp +++ b/clang/test/CIR/CodeGen/switch.cpp @@ -20,7 +20,7 @@ void sw1(int a) { } // CIR: cir.func{{.*}} @_Z3sw1i -// CIR: cir.switch (%[[COND:.*]] : !s32i) { +// CIR: cir.switch(%[[COND:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR: cir.break // CIR: cir.case(equal, [#cir.int<1> : !s32i]) { @@ -101,7 +101,7 @@ void sw2(int a) { // CIR: cir.scope { // CIR-NEXT: %[[YOLO:.*]] = cir.alloca !s32i, !cir.ptr, ["yolo", init] // CIR-NEXT: %[[FOMO:.*]] = cir.alloca !s32i, !cir.ptr, ["fomo", init] -// CIR: cir.switch (%[[COND:.*]] : !s32i) { +// CIR: cir.switch(%[[COND:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i // CIR-NEXT: cir.store{{.*}} %[[ZERO]], %[[FOMO]] : !s32i, !cir.ptr @@ -154,7 +154,7 @@ void sw3(int a) { // CIR: cir.func{{.*}} @_Z3sw3i // CIR: cir.scope { // CIR-NEXT: %[[COND:.*]] = cir.load{{.*}} %[[A:.*]] : !cir.ptr, !s32i -// CIR-NEXT: cir.switch (%[[COND]] : !s32i) { +// CIR-NEXT: cir.switch(%[[COND]] : !s32i) { // CIR-NEXT: cir.case(default, []) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -196,7 +196,7 @@ int sw4(int a) { } // CIR: cir.func{{.*}} @_Z3sw4i -// CIR: cir.switch (%[[COND:.*]] : !s32i) { +// CIR: cir.switch(%[[COND:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<42> : !s32i]) { // CIR-NEXT: cir.scope { // CIR-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i @@ -264,7 +264,7 @@ void sw5(int a) { } // CIR: cir.func{{.*}} @_Z3sw5i -// CIR: cir.switch (%[[A:.*]] : !s32i) { +// CIR: cir.switch(%[[A:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) { // CIR-NEXT: cir.yield // CIR-NEXT: } @@ -314,7 +314,7 @@ void sw6(int a) { } // CIR: cir.func{{.*}} @_Z3sw6i -// CIR: cir.switch (%[[A:.*]] : !s32i) { +// CIR: cir.switch(%[[A:.*]] : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR-NEXT: cir.yield // CIR-NEXT: } @@ -406,7 +406,7 @@ void sw7(int a) { // CIR: cir.func{{.*}} @_Z3sw7i // CIR: %[[X:.*]] = cir.alloca !s32i, !cir.ptr, ["x"] -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR-NEXT: cir.yield // CIR-NEXT: } @@ -499,7 +499,7 @@ void sw8(int a) { } // CIR: cir.func{{.*}} @_Z3sw8i -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -557,7 +557,7 @@ void sw9(int a) { } // CIR: cir.func{{.*}} @_Z3sw9i -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -616,7 +616,7 @@ void sw10(int a) { } // CIR: cir.func{{.*}} @_Z4sw10i -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : !s32i]) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -688,7 +688,7 @@ void sw11(int a) { } // CIR: cir.func{{.*}} @_Z4sw11i -// CIR: cir.switch (%[[A:.*]] : !s32i) +// CIR: cir.switch(%[[A:.*]] : !s32i) // CIR-NEXT: cir.case(equal, [#cir.int<3> : 
!s32i]) { // CIR-NEXT: cir.break // CIR-NEXT: } @@ -1063,7 +1063,7 @@ int nested_switch(int a) { return 0; } -// CIR: cir.switch (%[[COND:.*]] : !s32i) { +// CIR: cir.switch(%[[COND:.*]] : !s32i) { // CIR: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR: cir.yield // CIR: } @@ -1198,7 +1198,7 @@ int sw_return_multi_cases(int x) { } // CIR-LABEL: cir.func{{.*}} @_Z21sw_return_multi_casesi -// CIR: cir.switch (%{{.*}} : !s32i) { +// CIR: cir.switch(%{{.*}} : !s32i) { // CIR-NEXT: cir.case(equal, [#cir.int<0> : !s32i]) { // CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i // CIR: cir.store{{.*}} %[[ZERO]], %{{.*}} : !s32i, !cir.ptr @@ -1270,3 +1270,25 @@ int sw_return_multi_cases(int x) { // OGCG: [[RETURN]]: // OGCG: %[[RETVAL_LOAD:.*]] = load i32, ptr %[[RETVAL]], align 4 // OGCG: ret i32 %[[RETVAL_LOAD]] + +enum M { + Six, + Seven +}; + +void testSwitchCoverAllCase(M m) { + switch (m) { + case Six:case Seven: + break; + } +} +// CIR: cir.switch(%[[ARG:.*]] : !s32i) allEnumCasesCovered { + +void testSwitchNotCoverAllCase(M m) { + switch (m) { + case Six: + default: + break; + } +} +// CIR: cir.switch(%[[ARG:.*]] : !s32i) { diff --git a/clang/test/CIR/CodeGen/switch_flat_op.cpp b/clang/test/CIR/CodeGen/switch_flat_op.cpp index a3ea7e7a15547..ba0a82da52c70 100644 --- a/clang/test/CIR/CodeGen/switch_flat_op.cpp +++ b/clang/test/CIR/CodeGen/switch_flat_op.cpp @@ -21,7 +21,7 @@ void swf(int a) { // BEFORE: cir.func{{.*}} @_Z3swfi // BEFORE: %[[VAR_B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} // BEFORE: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i -// BEFORE: cir.switch (%[[COND:.*]] : !s32i) { +// BEFORE: cir.switch(%[[COND:.*]] : !s32i) { // BEFORE: cir.case(equal, [#cir.int<3> : !s32i]) { // BEFORE: %[[LOAD_B_EQ:.*]] = cir.load{{.*}} %[[VAR_B]] : !cir.ptr, !s32i // BEFORE: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i diff --git a/clang/test/CIR/IR/switch.cir b/clang/test/CIR/IR/switch.cir index 87d45bf1f5219..89614480e43cd 100644 --- a/clang/test/CIR/IR/switch.cir +++ b/clang/test/CIR/IR/switch.cir @@ -21,7 +21,7 @@ cir.func @s0() { cir.return } -// CHECK: cir.switch (%0 : !s32i) { +// CHECK: cir.switch(%0 : !s32i) { // CHECK-NEXT: cir.case(default, []) { // CHECK-NEXT: cir.return // CHECK-NEXT: } @@ -36,3 +36,33 @@ cir.func @s0() { // CHECK-NEXT: } // CHECK-NEXT: cir.yield // CHECK-NEXT: } + + +// Pretends that this is lowered from a C file and was tagged with allEnumCasesCovered = true +cir.func @s1(%1 : !s32i) { + cir.switch (%1 : !s32i) allEnumCasesCovered { + cir.case (default, []) { + cir.return + } + cir.case (equal, [#cir.int<1> : !s32i]) { + cir.yield + } + cir.case (equal, [#cir.int<2> : !s32i]) { + cir.yield + } + cir.yield + } { } + cir.return +} +// CHECK: cir.switch(%[[ARG:.*]] : !s32i) allEnumCasesCovered { +// CHECK-NEXT: cir.case(default, []) { +// CHECK-NEXT: cir.return +// CHECK-NEXT: } +// CHECK-NEXT: cir.case(equal, [#cir.int<1> : !s32i]) { +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: cir.case(equal, [#cir.int<2> : !s32i]) { +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } diff --git a/clang/test/CIR/Transforms/switch-fold.cir b/clang/test/CIR/Transforms/switch-fold.cir index 62a94f4fde2c3..c348a05128671 100644 --- a/clang/test/CIR/Transforms/switch-fold.cir +++ b/clang/test/CIR/Transforms/switch-fold.cir @@ -27,7 +27,7 @@ module { cir.return } //CHECK: cir.func @foldCascade - //CHECK: cir.switch (%[[COND:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND:.*]] : !s32i) { //CHECK-NEXT: 
cir.case(anyof, [#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i]) { //CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i //CHECK-NEXT: cir.store %[[TWO]], %[[ARG0:.*]] : !s32i, !cir.ptr @@ -66,7 +66,7 @@ module { cir.return } //CHECK: @foldCascade2 - //CHECK: cir.switch (%[[COND2:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND2:.*]] : !s32i) { //CHECK: cir.case(anyof, [#cir.int<0> : !s32i, #cir.int<2> : !s32i, #cir.int<4> : !s32i]) { //CHECK: cir.break //cehck: } @@ -106,7 +106,7 @@ module { cir.return } //CHECK: cir.func @foldCascade3 - //CHECK: cir.switch (%[[COND3:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND3:.*]] : !s32i) { //CHECK: cir.case(anyof, [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i]) { //CHECK: cir.break //CHECK: } @@ -142,7 +142,7 @@ module { cir.return } //CHECK: cir.func @foldCascadeWithDefault - //CHECK: cir.switch (%[[COND:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND:.*]] : !s32i) { //CHECK: cir.case(equal, [#cir.int<3> : !s32i]) { //CHECK: cir.break //CHECK: } @@ -187,7 +187,7 @@ module { cir.return } //CHECK: cir.func @foldAllCascade - //CHECK: cir.switch (%[[COND:.*]] : !s32i) { + //CHECK: cir.switch(%[[COND:.*]] : !s32i) { //CHECK: cir.case(anyof, [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i]) { //CHECK: cir.yield //CHECK: } From 488202935ed9857ad2ad44cb25e99c4365eea2e0 Mon Sep 17 00:00:00 2001 From: Adar Dagan <101581112+Adar-Dagan@users.noreply.github.com> Date: Thu, 11 Dec 2025 09:13:23 +0200 Subject: [PATCH 20/49] [ValueTracking] Enhance overflow computation for unsigned mul (#171568) Changed the range computation in computeOverflowForUnsignedMul to use computeConstantRange as well. This expands the patterns that InstCombine manages to narrow a mul that has values that come from zext, for example if a value comes from a div operation then the known bits doesn't give the narrowest possible range for that value. --------- Co-authored-by: Adar Dagan --- llvm/lib/Analysis/ValueTracking.cpp | 10 ++++----- llvm/test/Transforms/InstCombine/mul.ll | 28 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 9cb6f19b9340c..92577cd7517e6 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7286,15 +7286,15 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS, const Value *RHS, const SimplifyQuery &SQ, bool IsNSW) { - KnownBits LHSKnown = computeKnownBits(LHS, SQ); - KnownBits RHSKnown = computeKnownBits(RHS, SQ); + ConstantRange LHSRange = + computeConstantRangeIncludingKnownBits(LHS, /*ForSigned=*/false, SQ); + ConstantRange RHSRange = + computeConstantRangeIncludingKnownBits(RHS, /*ForSigned=*/false, SQ); // mul nsw of two non-negative numbers is also nuw. 
- if (IsNSW && LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) + if (IsNSW && LHSRange.isAllNonNegative() && RHSRange.isAllNonNegative()) return OverflowResult::NeverOverflows; - ConstantRange LHSRange = ConstantRange::fromKnownBits(LHSKnown, false); - ConstantRange RHSRange = ConstantRange::fromKnownBits(RHSKnown, false); return mapOverflowResult(LHSRange.unsignedMulMayOverflow(RHSRange)); } diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll index 0f3137cdd0be3..615f905b7e58a 100644 --- a/llvm/test/Transforms/InstCombine/mul.ll +++ b/llvm/test/Transforms/InstCombine/mul.ll @@ -2202,3 +2202,31 @@ define i8 @mul_not_nsw_nonneg(i8 %x, i8 %y) { %mul = mul i8 %x, %y ret i8 %mul } + +define i16 @mul_udiv_zext(i8 %x) { +; CHECK-LABEL: @mul_udiv_zext( +; CHECK-NEXT: [[X_FR:%.*]] = freeze i8 [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = urem i8 [[X_FR]], 15 +; CHECK-NEXT: [[NARROW:%.*]] = sub nuw i8 [[X_FR]], [[TMP1]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[NARROW]] to i16 +; CHECK-NEXT: ret i16 [[ZEXT]] +; + %div = udiv i8 %x, 15 + %zext = zext i8 %div to i16 + %mul = mul i16 %zext, 15 + ret i16 %mul +} + +define i16 @mul_udiv_zext_uneq(i8 %x) { +; CHECK-LABEL: @mul_udiv_zext_uneq( +; CHECK-NEXT: [[DIV:%.*]] = udiv i8 [[X:%.*]], 20 +; CHECK-NEXT: [[NARROW:%.*]] = mul nuw i8 [[DIV]], 15 +; CHECK-NEXT: [[MUL:%.*]] = zext i8 [[NARROW]] to i16 +; CHECK-NEXT: ret i16 [[MUL]] +; + %div = udiv i8 %x, 20 + %zext = zext i8 %div to i16 + %mul = mul i16 %zext, 15 + ret i16 %mul +} + From c9648d7acd2686142a4f77a37ff225caed620ca8 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 11 Dec 2025 08:13:48 +0100 Subject: [PATCH 21/49] [Verifier] Make sure all constexprs in instructions are visited (#171643) Previously this only happened for constants of some types and missed incorrect ptrtoaddr. --- llvm/lib/IR/Verifier.cpp | 13 +++++------- .../Assembler/ptrtoaddr-invalid-constexpr.ll | 20 ++++++++++++++++--- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 040e6efe4c6e3..5a191615daa0b 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2676,6 +2676,9 @@ void Verifier::verifyFunctionMetadata( } void Verifier::visitConstantExprsRecursively(const Constant *EntryC) { + if (EntryC->getNumOperands() == 0) + return; + if (!ConstantExprVisited.insert(EntryC).second) return; @@ -5610,14 +5613,8 @@ void Verifier::visitInstruction(Instruction &I) { } else if (isa(I.getOperand(i))) { Check(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i), "Cannot take the address of an inline asm!", &I); - } else if (auto *CPA = dyn_cast(I.getOperand(i))) { - visitConstantExprsRecursively(CPA); - } else if (ConstantExpr *CE = dyn_cast(I.getOperand(i))) { - if (CE->getType()->isPtrOrPtrVectorTy()) { - // If we have a ConstantExpr pointer, we need to see if it came from an - // illegal bitcast. 
- visitConstantExprsRecursively(CE); - } + } else if (auto *C = dyn_cast(I.getOperand(i))) { + visitConstantExprsRecursively(C); } } diff --git a/llvm/test/Assembler/ptrtoaddr-invalid-constexpr.ll b/llvm/test/Assembler/ptrtoaddr-invalid-constexpr.ll index 665deff4cd04b..2857f77ff695b 100644 --- a/llvm/test/Assembler/ptrtoaddr-invalid-constexpr.ll +++ b/llvm/test/Assembler/ptrtoaddr-invalid-constexpr.ll @@ -51,6 +51,20 @@ @g = global i32 ptrtoaddr (ptr @g to i32) ; DST_NOT_ADDR_SIZE-NEXT: PtrToAddr result must be address width ; DST_NOT_ADDR_SIZE-NEXT: i32 ptrtoaddr (ptr @g to i32) -@g_vec = global <4 x i32> ptrtoaddr (<4 x ptr> to <4 x i32>) -; TODO: Verifier.cpp does not visit ConstantVector/ConstantStruct values -; TODO-DST_NOT_ADDR_SIZE: PtrToAddr result must be address width +@g_vec = global <4 x i32> ptrtoaddr (<4 x ptr> to <4 x i32>) +; DST_NOT_ADDR_SIZE-NEXT: PtrToAddr result must be address width +; DST_NOT_ADDR_SIZE-NEXT: i32 ptrtoaddr (ptr @g_vec to i32) + +;--- dst_not_addr_size_in_inst.ll +; RUN: not llvm-as %t/dst_not_addr_size_in_inst.ll -o /dev/null 2>&1 | FileCheck -check-prefix=DST_NOT_ADDR_SIZE_IN_INST %s --implicit-check-not="error:" +; DST_NOT_ADDR_SIZE_IN_INST: PtrToAddr result must be address width +; DST_NOT_ADDR_SIZE_IN_INST-NEXT: i32 ptrtoaddr (ptr @fn to i32) +define i32 @fn() { + ret i32 ptrtoaddr (ptr @fn to i32) +} + +; DST_NOT_ADDR_SIZE_IN_INST: PtrToAddr result must be address width +; DST_NOT_ADDR_SIZE_IN_INST-NEXT: i32 ptrtoaddr (ptr @fn2 to i32) +define <2 x i32> @fn2() { + ret <2 x i32> +} From 6a25e454d6443a518e1460a9125ed60d0470fe83 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 11 Dec 2025 08:14:44 +0100 Subject: [PATCH 22/49] [ConstantFolding] Support ptrtoaddr in ConstantFoldCompareInstOperands (#162653) This folds `icmp (ptrtoaddr x, ptrtoaddr y)` to `icmp (x, y)`, matching the existing ptrtoint fold. Restrict both folds to only the case where the result type matches the address type. I think that all folds this can do in practice end up actually being valid for ptrtoint to a type large than the address size as well, but I don't really see a way to justify this generically without making assumptions about what kind of folding the recursive calls may do. This is based on the icmp semantics specified in https://github.com/llvm/llvm-project/pull/163936. --- llvm/lib/Analysis/ConstantFolding.cpp | 22 ++--- .../test/Transforms/InstSimplify/ptrtoaddr.ll | 82 +++++++++++++++++++ 2 files changed, 94 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index b39b32042dd2f..a9b51065a1d99 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1223,11 +1223,12 @@ Constant *llvm::ConstantFoldCompareInstOperands( } } - // Only do this transformation if the int is intptrty in size, otherwise - // there is a truncation or extension that we aren't modeling. - if (CE0->getOpcode() == Instruction::PtrToInt) { - Type *IntPtrTy = DL.getIntPtrType(CE0->getOperand(0)->getType()); - if (CE0->getType() == IntPtrTy) { + // icmp only compares the address part of the pointer, so only do this + // transform if the integer size matches the address size. 
+ if (CE0->getOpcode() == Instruction::PtrToInt || + CE0->getOpcode() == Instruction::PtrToAddr) { + Type *AddrTy = DL.getAddressType(CE0->getOperand(0)->getType()); + if (CE0->getType() == AddrTy) { Constant *C = CE0->getOperand(0); Constant *Null = Constant::getNullValue(C->getType()); return ConstantFoldCompareInstOperands(Predicate, C, Null, DL, TLI); @@ -1250,11 +1251,12 @@ Constant *llvm::ConstantFoldCompareInstOperands( return ConstantFoldCompareInstOperands(Predicate, C0, C1, DL, TLI); } - // Only do this transformation if the int is intptrty in size, otherwise - // there is a truncation or extension that we aren't modeling. - if (CE0->getOpcode() == Instruction::PtrToInt) { - Type *IntPtrTy = DL.getIntPtrType(CE0->getOperand(0)->getType()); - if (CE0->getType() == IntPtrTy && + // icmp only compares the address part of the pointer, so only do this + // transform if the integer size matches the address size. + if (CE0->getOpcode() == Instruction::PtrToInt || + CE0->getOpcode() == Instruction::PtrToAddr) { + Type *AddrTy = DL.getAddressType(CE0->getOperand(0)->getType()); + if (CE0->getType() == AddrTy && CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()) { return ConstantFoldCompareInstOperands( Predicate, CE0->getOperand(0), CE1->getOperand(0), DL, TLI); diff --git a/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll index d06b520931b92..eaccf15cd80f6 100644 --- a/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll +++ b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll @@ -316,3 +316,85 @@ define ptr @gep_gep_inv_ptrtoaddr(ptr %p) { %gep2 = getelementptr i8, ptr %gep1, i64 %p.addr.inv ret ptr %gep2 } + +define i1 @icmp_ptrtoaddr_0() { +; CHECK-LABEL: define i1 @icmp_ptrtoaddr_0() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ne i64 ptrtoaddr (ptr @g to i64), 0 + ret i1 %cmp +} + +; This fails to fold because we currently don't assume that globals are located +; at a non-null address for non-default address spaces. +define i1 @icmp_ptrtoaddr_0_addrsize() { +; CHECK-LABEL: define i1 @icmp_ptrtoaddr_0_addrsize() { +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp ne i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), 0 + ret i1 %cmp +} + +define i1 @icmp_ptrtoint_0_addrsize() { +; CHECK-LABEL: define i1 @icmp_ptrtoint_0_addrsize() { +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp ne i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), 0 + ret i1 %cmp +} + +define i1 @icmp_ptrtoaddr_ptrtoaddr() { +; CHECK-LABEL: define i1 @icmp_ptrtoaddr_ptrtoaddr() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ne i64 ptrtoaddr (ptr @g to i64), ptrtoaddr (ptr @g2 to i64) + ret i1 %cmp +} + +define i1 @icmp_ptrtoaddr_ptrtoaddr_addrsize() { +; CHECK-LABEL: define i1 @icmp_ptrtoaddr_ptrtoaddr_addrsize() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ne i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), ptrtoaddr (ptr addrspace(1) @g2.as1 to i32) + ret i1 %cmp +} + +; This could still be folded because the address being non-equal also implies +; that all pointer bits together are non-equal. 
+define i1 @icmp_ptrtoint_ptrtoint_addrsize() { +; CHECK-LABEL: define i1 @icmp_ptrtoint_ptrtoint_addrsize() { +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), ptrtoint (ptr addrspace(1) @g2.as1 to i64) +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp ne i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), ptrtoint (ptr addrspace(1) @g2.as1 to i64) + ret i1 %cmp +} + +define i1 @icmp_relational_ptrtoaddr_ptrtoaddr() { +; CHECK-LABEL: define i1 @icmp_relational_ptrtoaddr_ptrtoaddr() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ult i64 ptrtoaddr (ptr @g to i64), ptrtoaddr (ptr getelementptr inbounds (i8, ptr @g, i64 1) to i64) + ret i1 %cmp +} + +define i1 @icmp_relational_ptrtoaddr_ptrtoaddr_addrsize() { +; CHECK-LABEL: define i1 @icmp_relational_ptrtoaddr_ptrtoaddr_addrsize() { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ult i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), ptrtoaddr (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g.as1, i32 1) to i32) + ret i1 %cmp +} + +; This could still be folded because we know that the non-address bits must be +; the same, as GEP does not modify them. +define i1 @icmp_relational_ptrtoint_ptrtoint_addrsize() { +; CHECK-LABEL: define i1 @icmp_relational_ptrtoint_ptrtoint_addrsize() { +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g.as1, i64 1) to i64) +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp ult i64 ptrtoint (ptr addrspace(1) @g.as1 to i64), ptrtoint (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g.as1, i64 1) to i64) + ret i1 %cmp +} From 39a723edbb66e881ae009fdf1c630c3909242a03 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Thu, 11 Dec 2025 13:09:14 +0530 Subject: [PATCH 23/49] [Linalg] Add *Conv2D* matchers (#168362) -- This commit is the fourth in the series of adding matchers for linalg.*conv*/*pool*. Refer: https://github.com/llvm/llvm-project/pull/163724 -- In this commit all variants of Conv2D convolution ops have been added. -- It also refactors the way these matchers work to make adding more matchers concise. 
Signed-off-by: Abhishek Varma --------- Signed-off-by: Abhishek Varma Signed-off-by: hanhanW Co-authored-by: hanhanW --- .../Dialect/Linalg/Transforms/Specialize.cpp | 15 + mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 569 +++++++++++++++++- .../convolution/roundtrip-convolution.mlir | 222 ++++++- 3 files changed, 791 insertions(+), 15 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp index c2485a08932dd..bbfbd2e9736a1 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp @@ -279,6 +279,17 @@ static FailureOr specializeLinalgConvolutions(RewriterBase &rewriter, CONV_OP_SPECIALIZER(linalg::Conv1DNwcWcfOp); CONV_OP_SPECIALIZER(linalg::Conv1DNcwFcwOp); CONV_OP_SPECIALIZER(linalg::Conv2DOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwcHwcfOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwcHwcfQOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwcFhwcOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwcFhwcQOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNchwFchwOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNchwFchwQOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNgchwFgchwOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNgchwGfchwOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNgchwGfchwQOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwgcGfhwcOp); + CONV_OP_SPECIALIZER(linalg::Conv2DNhwgcGfhwcQOp); CONV_OP_SPECIALIZER(linalg::Conv3DOp); // ----------------------------- // Depthwise Convolution ops. @@ -287,6 +298,10 @@ static FailureOr specializeLinalgConvolutions(RewriterBase &rewriter, CONV_OP_SPECIALIZER(linalg::DepthwiseConv1DNwcWcOp); CONV_OP_SPECIALIZER(linalg::DepthwiseConv1DNwcWcmOp); CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNchwChwOp); + CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNhwcHwcOp); + CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNhwcHwcQOp); + CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNhwcHwcmOp); + CONV_OP_SPECIALIZER(linalg::DepthwiseConv2DNhwcHwcmQOp); CONV_OP_SPECIALIZER(linalg::DepthwiseConv3DNdhwcDhwcmOp); // ----------------------------- // Pooling ops. diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 01e6e1e248658..1244be90390e2 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -240,8 +240,8 @@ bool isReductionIterator(utils::IteratorType iteratorType) { //===----------------------------------------------------------------------===// /// Returns the BlockArgument that leads to `val`, if any. Traverses optional -/// ext* ops. -static BlockArgument getBlockArgumentWithOptionalExtOps(Value val) { +/// ext*/sitofp ops. +static BlockArgument getBlockArgumentWithOptionalCastOps(Value val) { BlockArgument blockArg = dyn_cast(val); if ((blockArg)) return blockArg; @@ -249,18 +249,82 @@ static BlockArgument getBlockArgumentWithOptionalExtOps(Value val) { Operation *defOp = val.getDefiningOp(); if (!dyn_cast_if_present(defOp) && !dyn_cast_if_present(defOp) && - !dyn_cast_if_present(defOp)) { + !dyn_cast_if_present(defOp) && + !dyn_cast_if_present(defOp)) { return nullptr; } return dyn_cast(defOp->getOperand(0)); } +/// Utility function to match the zero point offset body of quantized +/// convolution ops. 
+/// +/// Quantized convolutions have a body of the form: +/// %out + ((%input - %inputZp) * (%filter - %filterZp)) +/// where: +/// - %input is the input tensor element (block arg 0) +/// - %filter is the filter tensor element (block arg 1) +/// - %inputZp is the input zero-point scalar (block arg 2) +/// - %filterZp is the filter zero-point scalar (block arg 3) +/// - %out is the output accumulator (block arg 4) +/// +/// This function verifies that the multiplication operands are subtraction +/// operations matching this pattern. +static bool bodyMatcherForZeroPointOffsets(Operation *addOp, Operation *mulOp, + Block *body) { + // The multiplication should have two subtraction operands: + // one for (input - inputZp) and one for (filter - filterZp). + Operation *inputSubOp = mulOp->getOperand(0).getDefiningOp(); + if (!isa_and_present(inputSubOp)) + return false; + + Operation *filterSubOp = mulOp->getOperand(1).getDefiningOp(); + if (!isa_and_present(filterSubOp)) + return false; + + // Extract block arguments from subtraction operands. + BlockArgument inputBlockArg = + getBlockArgumentWithOptionalCastOps(inputSubOp->getOperand(0)); + BlockArgument inputZpBlockArg = + getBlockArgumentWithOptionalCastOps(inputSubOp->getOperand(1)); + BlockArgument filterBlockArg = + getBlockArgumentWithOptionalCastOps(filterSubOp->getOperand(0)); + BlockArgument filterZpBlockArg = + getBlockArgumentWithOptionalCastOps(filterSubOp->getOperand(1)); + BlockArgument outBlockArg = + getBlockArgumentWithOptionalCastOps(addOp->getOperand(0)); + + // Verify all block arguments are valid. + if (!inputBlockArg || !inputZpBlockArg || !filterBlockArg || + !filterZpBlockArg || !outBlockArg) + return false; + + // Verify all block arguments belong to the convolution body. + if (inputBlockArg.getOwner() != body || inputZpBlockArg.getOwner() != body || + filterBlockArg.getOwner() != body || + filterZpBlockArg.getOwner() != body || outBlockArg.getOwner() != body) + return false; + + // Verify block arguments have expected indices: + // arg0: input, arg1: filter, arg2: inputZp, arg3: filterZp, arg4: output + if (inputBlockArg.getArgNumber() != 0 || filterBlockArg.getArgNumber() != 1 || + inputZpBlockArg.getArgNumber() != 2 || + filterZpBlockArg.getArgNumber() != 3 || outBlockArg.getArgNumber() != 4) + return false; + + return true; +} + /// Utility to match block body for convolution ops. /// The body is thus expected to yield :- /// %out + (%lhs * %rhs) /// where: %lhs, %rhs and %out are block arguments and /// %lhs and %rhs can have optional upcast operation. -static bool bodyMatcherForConvolutionOps(Value yieldVal, Block *body) { +/// NOTE: In case of zero point offset convolution ops %lhs and %rhs would be :- +/// %input - %input_scalar +/// where, %input_scalar can have optional upcast operation. 
+static bool bodyMatcherForConvolutionOps(Value yieldVal, Block *body, + bool containsZeroPointOffset = false) { Operation *addOp = yieldVal.getDefiningOp(); if (!isa_and_present(addOp)) return false; @@ -269,12 +333,15 @@ static bool bodyMatcherForConvolutionOps(Value yieldVal, Block *body) { if (!isa_and_present(mulOp)) return false; + if (containsZeroPointOffset) { + return bodyMatcherForZeroPointOffsets(addOp, mulOp, body); + } BlockArgument lhsBlockArg = - getBlockArgumentWithOptionalExtOps(mulOp->getOperand(0)); + getBlockArgumentWithOptionalCastOps(mulOp->getOperand(0)); BlockArgument rhsBlockArg = - getBlockArgumentWithOptionalExtOps(mulOp->getOperand(1)); + getBlockArgumentWithOptionalCastOps(mulOp->getOperand(1)); BlockArgument outBlockArg = - getBlockArgumentWithOptionalExtOps(addOp->getOperand(0)); + getBlockArgumentWithOptionalCastOps(addOp->getOperand(0)); if (!lhsBlockArg || !rhsBlockArg || !outBlockArg || lhsBlockArg.getOwner() != body || rhsBlockArg.getOwner() != body || outBlockArg.getOwner() != body || lhsBlockArg.getArgNumber() != 0 || @@ -291,9 +358,9 @@ static bool bodyMatcherForPoolOps(Value yieldVal, Block *body) { return false; BlockArgument lhsArg = - getBlockArgumentWithOptionalExtOps(defOp->getOperand(0)); + getBlockArgumentWithOptionalCastOps(defOp->getOperand(0)); BlockArgument rhsArg = - getBlockArgumentWithOptionalExtOps(defOp->getOperand(1)); + getBlockArgumentWithOptionalCastOps(defOp->getOperand(1)); if (!lhsArg || !rhsArg || lhsArg.getOwner() != body || rhsArg.getOwner() != body || lhsArg.getArgNumber() != 2 || rhsArg.getArgNumber() != 0) @@ -502,14 +569,15 @@ class ConvMatcherBuilder { } /// Match body pattern. This should be called last. - bool matchBody() { + bool matchBody(bool zeroPointOffset = false) { if (!matched) return false; Block *body = op.getBlock(); auto yieldOp = cast(body->getTerminator()); switch (poolingType) { case PoolingType::None: - return bodyMatcherForConvolutionOps(yieldOp.getOperand(0), body); + return bodyMatcherForConvolutionOps(yieldOp.getOperand(0), body, + zeroPointOffset); case PoolingType::MaxSigned: return bodyMatcherForMaxSignedPoolOps(yieldOp.getOperand(0), body); case PoolingType::MaxUnsigned: @@ -634,6 +702,361 @@ bool isaConvolutionOpOfType(LinalgOp op, .matchBody(); } +// #inputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H + h, W + w, c)> +// #filterMap = affine_map<(N, H, W, F, h, w, c) -> (h, w, c, F)> +// #outputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H, W, F)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr F = m.dim(3); + AffineExpr h = m.dim(4); + AffineExpr w = m.dim(5); + AffineExpr c = m.dim(6); + + return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), c}, + /*filterMap=*/{h, w, c, F}, + /*outputMap=*/{N, H, W, F}}) + .matchBody(); +} + +// #inputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H + h, W + w, c)> +// #filterMap = affine_map<(N, H, W, F, h, w, c) -> (h, w, c, F)> +// #scalarMap = affine_map<(N, H, W, F, h, w, c) -> ()> +// #outputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H, W, F)> +template <> 
+bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr F = m.dim(3); + AffineExpr h = m.dim(4); + AffineExpr w = m.dim(5); + AffineExpr c = m.dim(6); + + return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), c}, + /*filterMap=*/{h, w, c, F}, + /*scalarMap=*/{}, + /*scalarMap=*/{}, + /*outputMap=*/{N, H, W, F}}) + .matchBody(/*zeroPointOffset=*/true); +} + +// #inputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H + h, W + w, c)> +// #filterMap = affine_map<(N, H, W, F, h, w, c) -> (F, h, w, c)> +// #outputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H, W, F)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr F = m.dim(3); + AffineExpr h = m.dim(4); + AffineExpr w = m.dim(5); + AffineExpr c = m.dim(6); + + return m.matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), c}, + /*filterMap=*/{F, h, w, c}, + /*outputMap=*/{N, H, W, F}}) + .matchBody(); +} + +// #inputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H + h, W + w, c)> +// #filterMap = affine_map<(N, H, W, F, h, w, c) -> (F, h, w, c)> +// #scalarMap = affine_map<(N, H, W, F, h, w, c) -> ()> +// #outputMap = affine_map<(N, H, W, F, h, w, c) -> (N, H, W, F)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr F = m.dim(3); + AffineExpr h = m.dim(4); + AffineExpr w = m.dim(5); + AffineExpr c = m.dim(6); + + return m.matchStride(/*iDim=*/1, /*fDim=*/1, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), c}, + /*filterMap=*/{F, h, w, c}, + /*scalarMap=*/{}, + /*scalarMap=*/{}, + /*outputMap=*/{N, H, W, F}}) + .matchBody(/*zeroPointOffset=*/true); +} + +// #inputMap = affine_map<(N, F, H, W, c, h, w) -> (N, c, H + h, W + w)> +// #filterMap = affine_map<(N, F, H, W, c, h, w) -> (F, c, h, w)> +// #outputMap = affine_map<(N, F, H, W, c, h, w) -> (N, F, H, W)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr F = m.dim(1); + AffineExpr H = m.dim(2); + AffineExpr W = 
m.dim(3); + AffineExpr c = m.dim(4); + AffineExpr h = m.dim(5); + AffineExpr w = m.dim(6); + + return m.matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/0) + .matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, c, m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{F, c, h, w}, + /*outputMap=*/{N, F, H, W}}) + .matchBody(); +} + +// #inputMap = affine_map<(N, F, H, W, c, h, w) -> (N, c, H + h, W + w)> +// #filterMap = affine_map<(N, F, H, W, c, h, w) -> (F, c, h, w)> +// #scalarMap = affine_map<(N, F, H, W, c, h, w) -> ()> +// #outputMap = affine_map<(N, F, H, W, c, h, w) -> (N, F, H, W)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr F = m.dim(1); + AffineExpr H = m.dim(2); + AffineExpr W = m.dim(3); + AffineExpr c = m.dim(4); + AffineExpr h = m.dim(5); + AffineExpr w = m.dim(6); + + return m.matchStride(/*iDim=*/2, /*fDim=*/2, /*oDim=*/2, /*idx=*/0) + .matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, c, m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{F, c, h, w}, + /*scalarMap=*/{}, + /*scalarMap=*/{}, + /*outputMap=*/{N, F, H, W}}) + .matchBody(/*zeroPointOffset=*/true); +} + +// #inputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, c, H + h, W + w)> +// #filterMap = affine_map<(N, G, F, H, W, c, h, w) -> (F, G, c, h, w)> +// #outputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, F, H, W)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr G = m.dim(1); + AffineExpr F = m.dim(2); + AffineExpr H = m.dim(3); + AffineExpr W = m.dim(4); + AffineExpr c = m.dim(5); + AffineExpr h = m.dim(6); + AffineExpr w = m.dim(7); + + return m.matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/0) + .matchStride(/*iDim=*/4, /*fDim=*/4, /*oDim=*/4, /*idx=*/1) + .matchMaps( + {/*inputMap=*/{N, G, c, m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{F, G, c, h, w}, + /*outputMap=*/{N, G, F, H, W}}) + .matchBody(); +} + +// #inputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, c, H + h, W + w)> +// #filterMap = affine_map<(N, G, F, H, W, c, h, w) -> (G, F, c, h, w)> +// #outputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, F, H, W)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr G = m.dim(1); + AffineExpr F = m.dim(2); + AffineExpr H = m.dim(3); + AffineExpr W = m.dim(4); + AffineExpr c = m.dim(5); + AffineExpr h = m.dim(6); + AffineExpr w = m.dim(7); + + return m.matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/0) + .matchStride(/*iDim=*/4, /*fDim=*/4, /*oDim=*/4, /*idx=*/1) + .matchMaps( + {/*inputMap=*/{N, G, c, m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{G, F, c, h, w}, + 
/*outputMap=*/{N, G, F, H, W}}) + .matchBody(); +} + +// #inputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, c, H + h, W + w)> +// #filterMap = affine_map<(N, G, F, H, W, c, h, w) -> (G, F, c, h, w)> +// #scalarMap = affine_map<(N, G, F, H, W, c, h, w) -> ()> +// #outputMap = affine_map<(N, G, F, H, W, c, h, w) -> (N, G, F, H, W)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr G = m.dim(1); + AffineExpr F = m.dim(2); + AffineExpr H = m.dim(3); + AffineExpr W = m.dim(4); + AffineExpr c = m.dim(5); + AffineExpr h = m.dim(6); + AffineExpr w = m.dim(7); + + return m.matchStride(/*iDim=*/3, /*fDim=*/3, /*oDim=*/3, /*idx=*/0) + .matchStride(/*iDim=*/4, /*fDim=*/4, /*oDim=*/4, /*idx=*/1) + .matchMaps( + {/*inputMap=*/{N, G, c, m.strided(H, h, 0), m.strided(W, w, 1)}, + /*filterMap=*/{G, F, c, h, w}, + /*scalarMap=*/{}, + /*scalarMap=*/{}, + /*outputMap=*/{N, G, F, H, W}}) + .matchBody(/*zeroPointOffset=*/true); +} + +// #inputMap = affine_map<(N, H, W, G, F, h, w, c) -> (N, H + h, W + w, G, c)> +// #filterMap = affine_map<(N, H, W, G, F, h, w, c) -> (G, F, h, w, c)> +// #outputMap = affine_map<(N, H, W, G, F, h, w, c) -> (N, H, W, G, F)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr G = m.dim(3); + AffineExpr F = m.dim(4); + AffineExpr h = m.dim(5); + AffineExpr w = m.dim(6); + AffineExpr c = m.dim(7); + + return m.matchStride(/*iDim=*/1, /*fDim=*/2, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/3, /*oDim=*/2, /*idx=*/1) + .matchMaps( + {/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), G, c}, + /*filterMap=*/{G, F, h, w, c}, + /*outputMap=*/{N, H, W, G, F}}) + .matchBody(); +} + +// #inputMap = affine_map<(N, H, W, G, F, h, w, c) -> (N, H + h, W + w, G, c)> +// #filterMap = affine_map<(N, H, W, G, F, h, w, c) -> (G, F, h, w, c)> +// #scalarMap = affine_map<(N, H, W, G, F, h, w, c) -> ()> +// #outputMap = affine_map<(N, H, W, G, F, h, w, c) -> (N, H, W, G, F)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr G = m.dim(3); + AffineExpr F = m.dim(4); + AffineExpr h = m.dim(5); + AffineExpr w = m.dim(6); + AffineExpr c = m.dim(7); + + return m.matchStride(/*iDim=*/1, /*fDim=*/2, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/3, /*oDim=*/2, /*idx=*/1) + .matchMaps( + {/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), G, c}, + /*filterMap=*/{G, F, h, w, c}, + /*scalarMap=*/{}, + /*scalarMap=*/{}, + /*outputMap=*/{N, H, W, G, F}}) + .matchBody(/*zeroPointOffset=*/true); +} + // #inputMap = affine_map<(D, H, W, d, h, w) -> (D + d, H + h, W + w)> // #filterMap = affine_map<(D, 
H, W, d, h, w) -> (d, h, w)> // #outputMap = affine_map<(D, H, W, d, h, w) -> (D, H, W)> @@ -773,6 +1196,130 @@ bool isaConvolutionOpOfType( .matchBody(); } +// #inputMap = affine_map<(N, H, W, C, h, w) -> (N, H + h, W + w, C)> +// #filterMap = affine_map<(N, H, W, C, h, w) -> (h, w, C)> +// #outputMap = affine_map<(N, H, W, C, h, w) -> (N, H, W, C)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr C = m.dim(3); + AffineExpr h = m.dim(4); + AffineExpr w = m.dim(5); + + return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w, C}, + /*outputMap=*/{N, H, W, C}}) + .matchBody(); +} + +// #inputMap = affine_map<(N, H, W, C, h, w) -> (N, H + h, W + w, C)> +// #filterMap = affine_map<(N, H, W, C, h, w) -> (h, w, C)> +// #scalarMap = affine_map<(N, H, W, C, h, w) -> ()> +// #outputMap = affine_map<(N, H, W, C, h, w) -> (N, H, W, C)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr C = m.dim(3); + AffineExpr h = m.dim(4); + AffineExpr w = m.dim(5); + + return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w, C}, + /*scalarMap=*/{}, + /*scalarMap=*/{}, + /*outputMap=*/{N, H, W, C}}) + .matchBody(/*zeroPointOffset=*/true); +} + +// #inputMap = affine_map<(N, H, W, C, CM, h, w) -> (N, H + h, W + w, C)> +// #filterMap = affine_map<(N, H, W, C, CM, h, w) -> (h, w, C, CM)> +// #outputMap = affine_map<(N, H, W, C, CM, h, w) -> (N, H, W, C, CM)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector *strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr C = m.dim(3); + AffineExpr CM = m.dim(4); + AffineExpr h = m.dim(5); + AffineExpr w = m.dim(6); + + return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w, C, CM}, + /*outputMap=*/{N, H, W, C, CM}}) + .matchBody(); +} + +// #inputMap = affine_map<(N, H, W, C, CM, h, w) -> (N, H + h, W + w, C)> +// #filterMap = affine_map<(N, H, W, C, CM, h, w) -> (h, w, C, CM)> +// #scalarMap = affine_map<(N, H, W, C, CM, h, w) -> ()> +// #outputMap = affine_map<(N, H, W, C, CM, h, w) -> (N, H, W, C, CM)> +template <> +bool isaConvolutionOpOfType( + LinalgOp op, SmallVector *dilations, + SmallVector 
*strides) { + if (isa(op)) + return true; + + assert(isaConvolutionOpInterface(op) && + "expected op to implement ConvolutionOpInterface"); + + ConvMatcherBuilder m(op, /*spatialRank=*/2, dilations, strides); + AffineExpr N = m.dim(0); + AffineExpr H = m.dim(1); + AffineExpr W = m.dim(2); + AffineExpr C = m.dim(3); + AffineExpr CM = m.dim(4); + AffineExpr h = m.dim(5); + AffineExpr w = m.dim(6); + + return m.matchStride(/*iDim=*/1, /*fDim=*/0, /*oDim=*/1, /*idx=*/0) + .matchStride(/*iDim=*/2, /*fDim=*/1, /*oDim=*/2, /*idx=*/1) + .matchMaps({/*inputMap=*/{N, m.strided(H, h, 0), m.strided(W, w, 1), C}, + /*filterMap=*/{h, w, C, CM}, + /*scalarMap=*/{}, + /*scalarMap=*/{}, + /*outputMap=*/{N, H, W, C, CM}}) + .matchBody(/*zeroPointOffset=*/true); +} + // #inputMap = affine_map<(N, D, H, W, CM, d, h, w, C) // -> (N, D + d, H + h, W + w, C)> // #filterMap = affine_map<(N, D, H, W, CM, d, h, w, C) diff --git a/mlir/test/Dialect/Linalg/convolution/roundtrip-convolution.mlir b/mlir/test/Dialect/Linalg/convolution/roundtrip-convolution.mlir index 4b2d42a3ae4e0..432fdd12f540d 100644 --- a/mlir/test/Dialect/Linalg/convolution/roundtrip-convolution.mlir +++ b/mlir/test/Dialect/Linalg/convolution/roundtrip-convolution.mlir @@ -5,8 +5,9 @@ // RUN: mlir-opt %s -linalg-generalize-named-ops | mlir-opt --linalg-specialize-generic-ops | FileCheck %s --implicit-check-not=linalg.generic // ----------------------------- -// Convolution ops. +// Convolution ops - 1D. // ----------------------------- + func.func @conv_1d(%in : tensor, %filter : tensor, %out : tensor) -> tensor { %0 = linalg.conv_1d ins(%in, %filter : tensor, tensor) @@ -44,6 +45,10 @@ func.func @conv_1d_ncw_fcw(%input: tensor, %filter: tensor // ----- +// ----------------------------- +// Convolution ops - 2D. 
+// ----------------------------- + func.func @conv_2d(%in : tensor, %filter : tensor, %out : tensor) -> tensor { %0 = linalg.conv_2d ins(%in, %filter : tensor, tensor) @@ -55,6 +60,153 @@ func.func @conv_2d(%in : tensor, %filter : tensor, %out : tens // ----- +func.func @conv_2d_nhwc_hwcf(%input: tensor, %filter: tensor, %output: tensor) -> tensor { + %0 = linalg.conv_2d_nhwc_hwcf + {dilations = dense<2> : tensor<2xi64>, strides = dense<3> : tensor<2xi64>} + ins (%input, %filter: tensor, tensor) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_nhwc_hwcf +// CHECK: linalg.conv_2d_nhwc_hwcf +// CHECK-SAME: dilations = dense<2> : tensor<2xi64>, strides = dense<3> : tensor<2xi64> + +// ----- + +func.func @conv_2d_nhwc_hwcf_q(%input: tensor, %filter: tensor, %output: tensor, %zp_input: i32, %zp_filter: i32) -> tensor { + %0 = linalg.conv_2d_nhwc_hwcf_q + {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins (%input, %filter, %zp_input, %zp_filter : tensor, tensor, i32, i32) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_nhwc_hwcf_q +// CHECK: linalg.conv_2d_nhwc_hwcf_q +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> + +// ----- + +func.func @conv_2d_nhwc_fhwc(%input: tensor, %filter: tensor, %output: tensor) -> tensor { + %0 = linalg.conv_2d_nhwc_fhwc + {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} + ins (%input, %filter: tensor, tensor) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_nhwc_fhwc +// CHECK: linalg.conv_2d_nhwc_fhwc +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64> + +// ----- + +func.func @conv_2d_nhwc_fhwc_q(%input: tensor, %filter: tensor, %output: tensor, %zp_input: i32, %zp_filter: i32) -> tensor { + %0 = linalg.conv_2d_nhwc_fhwc_q + {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins (%input, %filter, %zp_input, %zp_filter : tensor, tensor, i32, i32) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_nhwc_fhwc_q +// CHECK: linalg.conv_2d_nhwc_fhwc_q +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> + +// ----- + +func.func @conv_2d_nchw_fchw(%input: tensor, %filter: tensor, %output: tensor) -> tensor { + %0 = linalg.conv_2d_nchw_fchw + {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[3, 4]> : tensor<2xi64>} + ins (%input, %filter: tensor, tensor) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_nchw_fchw +// CHECK: linalg.conv_2d_nchw_fchw +// CHECK-SAME: dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[3, 4]> : tensor<2xi64> + +// ----- + +func.func @conv_2d_nchw_fchw_q(%input: tensor, %filter: tensor, %output: tensor, %zp_input: i32, %zp_filter: i32) -> tensor { + %0 = linalg.conv_2d_nchw_fchw_q + {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins (%input, %filter, %zp_input, %zp_filter : tensor, tensor, i32, i32) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_nchw_fchw_q +// CHECK: linalg.conv_2d_nchw_fchw_q +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> + +// ----- + +func.func @conv_2d_ngchw_fgchw(%input: tensor, %filter: tensor, %output: tensor) -> tensor { + %0 = linalg.conv_2d_ngchw_fgchw + {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins (%input, %filter: 
tensor, tensor) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_ngchw_fgchw +// CHECK: linalg.conv_2d_ngchw_fgchw +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> + +// ----- + +func.func @conv_2d_ngchw_gfchw(%input: tensor, %filter: tensor, %output: tensor) -> tensor { + %0 = linalg.conv_2d_ngchw_gfchw + {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins (%input, %filter: tensor, tensor) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_ngchw_gfchw +// CHECK: linalg.conv_2d_ngchw_gfchw +// CHECK-SAME: dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> + +// ----- + +func.func @conv_2d_ngchw_gfchw_q(%input: tensor, %filter: tensor, %output: tensor, %zp_input: i32, %zp_filter: i32) -> tensor { + %0 = linalg.conv_2d_ngchw_gfchw_q + {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins (%input, %filter, %zp_input, %zp_filter : tensor, tensor, i32, i32) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_ngchw_gfchw_q +// CHECK: linalg.conv_2d_ngchw_gfchw_q +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> + +// ----- + +func.func @conv_2d_nhwgc_gfhwc(%input: tensor, %filter: tensor, %output: tensor) -> tensor { + %0 = linalg.conv_2d_nhwgc_gfhwc + {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} + ins (%input, %filter: tensor, tensor) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_nhwgc_gfhwc +// CHECK: linalg.conv_2d_nhwgc_gfhwc +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64> + +// ----- + +func.func @conv_2d_nhwgc_gfhwc_q(%input: tensor, %filter: tensor, %output: tensor, %zp_input: i32, %zp_filter: i32) -> tensor { + %0 = linalg.conv_2d_nhwgc_gfhwc_q + {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins (%input, %filter, %zp_input, %zp_filter : tensor, tensor, i32, i32) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @conv_2d_nhwgc_gfhwc_q +// CHECK: linalg.conv_2d_nhwgc_gfhwc_q +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> + +// ----- + +// ----------------------------- +// Convolution ops - 3D. +// ----------------------------- + func.func @conv_3d(%in : tensor, %filter : tensor, %out : tensor) -> tensor { %0 = linalg.conv_3d ins(%in, %filter : tensor, tensor) @@ -66,9 +218,10 @@ func.func @conv_3d(%in : tensor, %filter : tensor, %out : // ----- -// ----------------------------- -// Depthwise Convolution ops. -// ----------------------------- +// ------------------------------- +// Depthwise Convolution ops - 1D. +// ------------------------------- + func.func @depthwise_conv_1d_ncw_cw(%input: tensor, %filter: tensor, %output: tensor) -> tensor { %0 = linalg.depthwise_conv_1d_ncw_cw {dilations = dense<3> : tensor<1xi64>, strides = dense<2> : tensor<1xi64>} @@ -108,6 +261,10 @@ func.func @depthwise_conv_1d_nwc_wcm(%input: tensor, %filter: tensor< // ----- +// ------------------------------- +// Depthwise Convolution ops - 2D. 
+// ------------------------------- + func.func @depthwise_conv_2d_nchw_chw(%input: tensor, %filter: tensor, %output: tensor) -> tensor { %0 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<[2,3]> : vector<2xi64>, strides = dense<[4,5]> : vector<2xi64>} @@ -121,6 +278,62 @@ func.func @depthwise_conv_2d_nchw_chw(%input: tensor, %filter: tens // ----- +func.func @depthwise_conv_2d_nhwc_hwc(%input: tensor, %filter: tensor, %output: tensor) -> tensor { + %0 = linalg.depthwise_conv_2d_nhwc_hwc + {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} + ins (%input, %filter: tensor, tensor) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @depthwise_conv_2d_nhwc_hwc +// CHECK: linalg.depthwise_conv_2d_nhwc_hwc +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64> + +// ----- + +func.func @depthwise_conv_2d_nhwc_hwc_q(%input: tensor, %filter: tensor, %output: tensor, %zp_input: i32, %zp_filter: i32) -> tensor { + %0 = linalg.depthwise_conv_2d_nhwc_hwc_q + {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins (%input, %filter, %zp_input, %zp_filter : tensor, tensor, i32, i32) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @depthwise_conv_2d_nhwc_hwc_q +// CHECK: linalg.depthwise_conv_2d_nhwc_hwc_q +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> + +// ----- + +func.func @depthwise_conv_2d_nhwc_hwcm(%input: tensor, %filter: tensor, %output: tensor) -> tensor { + %0 = linalg.depthwise_conv_2d_nhwc_hwcm + {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[3, 1]> : tensor<2xi64>} + ins (%input, %filter: tensor, tensor) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @depthwise_conv_2d_nhwc_hwcm +// CHECK: linalg.depthwise_conv_2d_nhwc_hwcm +// CHECK-SAME: dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[3, 1]> : tensor<2xi64> + +// ----- + +func.func @depthwise_conv_2d_nhwc_hwcm_q(%input: tensor, %filter: tensor, %output: tensor, %zp_input: i32, %zp_filter: i32) -> tensor { + %0 = linalg.depthwise_conv_2d_nhwc_hwcm_q + {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins (%input, %filter, %zp_input, %zp_filter : tensor, tensor, i32, i32) + outs (%output: tensor) -> tensor + return %0 : tensor +} +// CHECK: @depthwise_conv_2d_nhwc_hwcm_q +// CHECK: linalg.depthwise_conv_2d_nhwc_hwcm_q +// CHECK-SAME: dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> + +// ----- + +// ------------------------------- +// Depthwise Convolution ops - 3D. +// ------------------------------- + func.func @depthwise_conv_3d_ndhwc_dhwcm(%input: tensor, %filter: tensor, %output: tensor) -> tensor { %0 = linalg.depthwise_conv_3d_ndhwc_dhwcm {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} @@ -137,6 +350,7 @@ func.func @depthwise_conv_3d_ndhwc_dhwcm(%input: tensor, %filter: // ----------------------------- // Pooling ops. // ----------------------------- + func.func @pooling_nhwc_max(%input: tensor, %filter: tensor, %output: tensor) -> tensor { %0 = linalg.pooling_nhwc_max {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} From 2ce17ba3470bfc819ff6a2e5f98f5f7ed9bb38a2 Mon Sep 17 00:00:00 2001 From: Tirthankar Mazumder Date: Thu, 11 Dec 2025 13:10:04 +0530 Subject: [PATCH 24/49] [InstCombine][CmpInstAnalysis] Use consistent spelling and function names. NFC. 
(#171645) Both `decomposeBitTestICmp` and `decomposeBitTest` have a parameter called `lookThroughTrunc`. This was spelled in full (i.e. `lookThroughTrunc`) in the header. However, in the implementation, it's written as `lookThruTrunc`. I opted to convert all instances of `lookThruTrunc` into `lookThroughTrunc` to reduce surprise while reading the code and for conformity. --- The other change in this PR is the renaming of the wrapper around `decomposeBitTest()`. Even though it was a wrapper around `CmpInstAnalysis.h`'s `decomposeBitTest`, the function was called `decomposeBitTestICmp`. This is quite confusing because such a function _also_ exists in `CmpInstAnalysis.h`, but it is _not_ the one actually being used in `InstCombineAndOrXor.cpp`. --- llvm/lib/Analysis/CmpInstAnalysis.cpp | 8 ++++---- .../Transforms/InstCombine/InstCombineAndOrXor.cpp | 12 ++++++------ .../Transforms/InstCombine/InstCombineCompares.cpp | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Analysis/CmpInstAnalysis.cpp b/llvm/lib/Analysis/CmpInstAnalysis.cpp index a1a79e5685f80..a6d0d3ff4fcd4 100644 --- a/llvm/lib/Analysis/CmpInstAnalysis.cpp +++ b/llvm/lib/Analysis/CmpInstAnalysis.cpp @@ -75,7 +75,7 @@ Constant *llvm::getPredForFCmpCode(unsigned Code, Type *OpTy, std::optional llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, - bool LookThruTrunc, bool AllowNonZeroC, + bool LookThroughTrunc, bool AllowNonZeroC, bool DecomposeAnd) { using namespace PatternMatch; @@ -173,7 +173,7 @@ llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, Result.Pred = ICmpInst::getInversePredicate(Result.Pred); Value *X; - if (LookThruTrunc && match(LHS, m_Trunc(m_Value(X)))) { + if (LookThroughTrunc && match(LHS, m_Trunc(m_Value(X)))) { Result.X = X; Result.Mask = Result.Mask.zext(X->getType()->getScalarSizeInBits()); Result.C = Result.C.zext(X->getType()->getScalarSizeInBits()); @@ -185,7 +185,7 @@ llvm::decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, } std::optional llvm::decomposeBitTest(Value *Cond, - bool LookThruTrunc, + bool LookThroughTrunc, bool AllowNonZeroC, bool DecomposeAnd) { using namespace PatternMatch; @@ -194,7 +194,7 @@ std::optional llvm::decomposeBitTest(Value *Cond, if (!ICmp->getOperand(0)->getType()->isIntOrIntVectorTy()) return std::nullopt; return decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1), - ICmp->getPredicate(), LookThruTrunc, + ICmp->getPredicate(), LookThroughTrunc, AllowNonZeroC, DecomposeAnd); } Value *X; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ba5568b00441b..9cf382f8020fa 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -186,9 +186,9 @@ static unsigned conjugateICmpMask(unsigned Mask) { return NewMask; } -// Adapts the external decomposeBitTestICmp for local use. -static bool decomposeBitTestICmp(Value *Cond, CmpInst::Predicate &Pred, - Value *&X, Value *&Y, Value *&Z) { +// Adapts the external decomposeBitTest for local use. +static bool decomposeBitTest(Value *Cond, CmpInst::Predicate &Pred, Value *&X, + Value *&Y, Value *&Z) { auto Res = llvm::decomposeBitTest(Cond, /*LookThroughTrunc=*/true, /*AllowNonZeroC=*/true); if (!Res) @@ -220,7 +220,7 @@ getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C, Value *&D, Value *&E, // Check whether the icmp can be decomposed into a bit test. 
Value *L1, *L11, *L12, *L2, *L21, *L22; - if (decomposeBitTestICmp(LHS, PredL, L11, L12, L2)) { + if (decomposeBitTest(LHS, PredL, L11, L12, L2)) { L21 = L22 = L1 = nullptr; } else { auto *LHSCMP = dyn_cast(LHS); @@ -253,7 +253,7 @@ getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C, Value *&D, Value *&E, return std::nullopt; Value *R11, *R12, *R2; - if (decomposeBitTestICmp(RHS, PredR, R11, R12, R2)) { + if (decomposeBitTest(RHS, PredR, R11, R12, R2)) { if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { A = R11; D = R12; @@ -3890,7 +3890,7 @@ static std::optional matchBitmaskMul(Value *V) { // Decompose ((A & N) ? 0 : N * C) into BitMaskMul if (match(Op, m_Select(m_Value(Cond), m_APInt(EqZero), m_APInt(NeZero)))) { auto ICmpDecompose = - decomposeBitTest(Cond, /*LookThruTrunc=*/true, + decomposeBitTest(Cond, /*LookThroughTrunc=*/true, /*AllowNonZeroC=*/false, /*DecomposeBitMask=*/true); if (!ICmpDecompose.has_value()) return std::nullopt; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index abf4381ebd794..1859dad4ec00b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -6290,7 +6290,7 @@ Instruction *InstCombinerImpl::foldICmpWithTrunc(ICmpInst &ICmp) { // This matches patterns corresponding to tests of the signbit as well as: // (trunc X) pred C2 --> (X & Mask) == C - if (auto Res = decomposeBitTestICmp(Op0, Op1, Pred, /*WithTrunc=*/true, + if (auto Res = decomposeBitTestICmp(Op0, Op1, Pred, /*LookThroughTrunc=*/true, /*AllowNonZeroC=*/true)) { Value *And = Builder.CreateAnd(Res->X, Res->Mask); Constant *C = ConstantInt::get(Res->X->getType(), Res->C); From aa31efcee107112c84af7993e9500849e7584f33 Mon Sep 17 00:00:00 2001 From: Romaric Jodin Date: Thu, 11 Dec 2025 08:41:55 +0100 Subject: [PATCH 25/49] [libclc] use clc functions in clspv/shared/vstore_half.cl (#171770) --- libclc/opencl/lib/clspv/shared/vstore_half.cl | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/libclc/opencl/lib/clspv/shared/vstore_half.cl b/libclc/opencl/lib/clspv/shared/vstore_half.cl index 341ec3e251719..cfcbf55caeae7 100644 --- a/libclc/opencl/lib/clspv/shared/vstore_half.cl +++ b/libclc/opencl/lib/clspv/shared/vstore_half.cl @@ -8,7 +8,13 @@ #include #include +#include +#include +#include #include +#include +#include +#include #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable @@ -48,32 +54,32 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) { /* Handle nan corner case */ - if (isnan(x)) + if (__clc_isnan(x)) return x; /* RTZ does not produce Inf for large numbers */ - if (fabs(x) > 65504.0f && !isinf(x)) - return copysign(65504.0f, x); + if (__clc_fabs(x) > 65504.0f && !__clc_isinf(x)) + return __clc_copysign(65504.0f, x); const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127; /* Manage range rounded to +- zero explicitely */ if (exp < -24) - return copysign(0.0f, x); + return __clc_copysign(0.0f, x); /* Remove lower 13 bits to make sure the number is rounded down */ int mask = 0xffffe000; /* Denormals cannot be flushed, and they use different bit for rounding */ if (exp < -14) - mask <<= min(-(exp + 14), 10); + mask <<= __clc_min(-(exp + 14), 10); return __clc_as_float(__clc_as_uint(x) & mask); } _CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) { /* Handle nan corner case */ - if (isnan(x)) + if (__clc_isnan(x)) return x; - const float inf = copysign(INFINITY, x); + const float inf = 
__clc_copysign(INFINITY, x); uint ux = __clc_as_uint(x); /* Manage +- infinity explicitely */ @@ -82,23 +88,23 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) { } /* Manage +- zero explicitely */ if ((ux & 0x7fffffff) == 0) { - return copysign(0.0f, x); + return __clc_copysign(0.0f, x); } const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127; /* Manage range rounded to smallest half denormal explicitely */ if (exp < -24) { - return copysign(0x1.0p-24f, x); + return __clc_copysign(0x1.0p-24f, x); } /* Set lower 13 bits */ int mask = (1 << 13) - 1; /* Denormals cannot be flushed, and they use different bit for rounding */ if (exp < -14) { - mask = (1 << (13 + min(-(exp + 14), 10))) - 1; + mask = (1 << (13 + __clc_min(-(exp + 14), 10))) - 1; } - const float next = nextafter(__clc_as_float(ux | mask), inf); + const float next = __clc_nextafter(__clc_as_float(ux | mask), inf); return ((ux & mask) == 0) ? __clc_as_float(ux) : next; } _CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) { @@ -116,7 +122,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) { /* The default assumes lower 13 bits are rounded, * but it might be more for denormals. * Shifting beyond last == 0b, and qr == 00b is not necessary */ - shift += min(-(exp + 14), 15); + shift += __clc_min(-(exp + 14), 15); } int mask = (1 << shift) - 1; const uint grs = mantissa & mask; From b7c0452a9a3d895b14ec7c5735e4e4ddc311edb3 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 11 Dec 2025 08:57:26 +0100 Subject: [PATCH 26/49] [PowerPC][AIX] Specify correct ABI alignment for double (#144673) Add `f64:32:64` to the data layout for AIX, to indicate that doubles have a 32-bit ABI alignment and 64-bit preferred alignment. Clang was already taking this into account, but it was not reflected in LLVM's data layout. A notable effect of this change is that `double` loads/stores with 4 byte alignment are no longer considered "unaligned" and avoid the corresponding unaligned access legalization. I assume that this is correct/desired for AIX. (The codegen previously already relied on this in some places related to the call ABI simply by dint of assuming certain stack locations were 8 byte aligned, even though they were only actually 4 byte aligned.) Fixes https://github.com/llvm/llvm-project/issues/133599. 
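
As a rough sketch of what the new component means (illustration only, not code
from this patch; the layout string below is an abbreviated form of the 32-bit
AIX layout and the helper name is made up), the ABI and preferred alignments
for double now diverge when queried through the DataLayout API:

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  #include "llvm/Support/Alignment.h"
  #include <cassert>

  static void checkAixDoubleAlign() {
    llvm::LLVMContext Ctx;
    // Abbreviated 32-bit AIX layout string carrying the new f64:32:64 entry.
    llvm::DataLayout DL("E-m:a-Fi64-i64:64-n32-f64:32:64");
    llvm::Type *DoubleTy = llvm::Type::getDoubleTy(Ctx);
    // ABI alignment drops to 4 bytes; preferred alignment stays at 8 bytes.
    assert(DL.getABITypeAlign(DoubleTy) == llvm::Align(4));
    assert(DL.getPrefTypeAlign(DoubleTy) == llvm::Align(8));
  }

In IR terms, a "load double, ptr %p, align 4" on AIX is therefore ABI-aligned
and no longer goes through unaligned-access legalization.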
--- clang/lib/CodeGen/CodeGenModule.cpp | 14 +- .../test/CodeGen/2007-05-07-PaddingElements.c | 7 +- llvm/lib/IR/AutoUpgrade.cpp | 8 +- llvm/lib/TargetParser/TargetDataLayout.cpp | 4 + llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll | 152 +++++++-------- llvm/test/CodeGen/PowerPC/aix-cc-abi.ll | 174 ++++++++---------- .../PowerPC/aix-emit-tracebacktable.ll | 2 +- llvm/test/CodeGen/PowerPC/aix-xcoff-data.ll | 4 +- .../CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll | 61 +++--- .../CodeGen/PowerPC/aix32-cc-abi-vaarg.ll | 57 +++--- .../CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll | 6 +- .../Bitcode/DataLayoutUpgradeTest.cpp | 12 +- 12 files changed, 235 insertions(+), 266 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 1dcf94fc35e07..c061575af411c 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -378,15 +378,11 @@ static void checkDataLayoutConsistency(const TargetInfo &Target, Check("bfloat", llvm::Type::getBFloatTy(Context), Target.BFloat16Align); Check("float", llvm::Type::getFloatingPointTy(Context, *Target.FloatFormat), Target.FloatAlign); - // FIXME: AIX specifies wrong double alignment in DataLayout - if (!Triple.isOSAIX()) { - Check("double", - llvm::Type::getFloatingPointTy(Context, *Target.DoubleFormat), - Target.DoubleAlign); - Check("long double", - llvm::Type::getFloatingPointTy(Context, *Target.LongDoubleFormat), - Target.LongDoubleAlign); - } + Check("double", llvm::Type::getFloatingPointTy(Context, *Target.DoubleFormat), + Target.DoubleAlign); + Check("long double", + llvm::Type::getFloatingPointTy(Context, *Target.LongDoubleFormat), + Target.LongDoubleAlign); if (Target.hasFloat128Type()) Check("__float128", llvm::Type::getFP128Ty(Context), Target.Float128Align); if (Target.hasIbm128Type()) diff --git a/clang/test/CodeGen/2007-05-07-PaddingElements.c b/clang/test/CodeGen/2007-05-07-PaddingElements.c index f8ec2483a8d61..28d24800abbe6 100644 --- a/clang/test/CodeGen/2007-05-07-PaddingElements.c +++ b/clang/test/CodeGen/2007-05-07-PaddingElements.c @@ -1,6 +1,9 @@ // PR 1278 -// RUN: %clang_cc1 %s -emit-llvm -o - | grep struct.s | not grep "4 x i8] zeroinitializer" -// RUN: %clang_cc1 %s -emit-llvm -o - | not grep "i32 0, i32 2" +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix %s -emit-llvm -o - | FileCheck %s --check-prefix=AIX + +// CHECK: %struct.s = type { double, i32 } +// AIX: %struct.s = type { double, i32, [4 x i8] } struct s { double d1; int s1; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 2202b08e3cf0d..5efede4f87680 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -6468,7 +6468,13 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { if (Pos != size_t(-1)) Res.insert(Pos + I64.size(), I128); } - return Res; + } + + if (T.isPPC() && T.isOSAIX() && !DL.contains("f64:32:64") && !DL.empty()) { + size_t Pos = Res.find("-S128"); + if (Pos == StringRef::npos) + Pos = Res.size(); + Res.insert(Pos, "-f64:32:64"); } if (!T.isX86()) diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp index cbcbb5e40fdfa..981c5561211db 100644 --- a/llvm/lib/TargetParser/TargetDataLayout.cpp +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -247,6 +247,10 @@ static std::string computePowerDataLayout(const Triple &T, StringRef ABIName) { else Ret += "-n32"; + // The ABI alignment for doubles on 
AIX is 4 bytes. + if (T.isOSAIX()) + Ret += "-f64:32:64"; + // Specify the vector alignment explicitly. For v256i1 and v512i1, the // calculated alignment would be 256*alignment(i1) and 512*alignment(i1), // which is 256 and 512 bytes - way over aligned. diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll index 258ddf60088c1..02994811dc8af 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll @@ -636,7 +636,7 @@ define i32 @test_mix(float %f, i32 signext %i, double %d, i8 signext %c) { ; 32BIT-NEXT: renamable $f0 = nofpexcept FADDS killed renamable $f0, killed renamable $f1, implicit $rm ; 32BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 32BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 ; ; 64BIT-LABEL: name: test_mix @@ -655,7 +655,7 @@ define i32 @test_mix(float %f, i32 signext %i, double %d, i8 signext %c) { ; 64BIT-NEXT: renamable $f0 = nofpexcept FADDS killed renamable $f0, killed renamable $f1, implicit $rm ; 64BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 64BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 entry: %conv = fpext float %f to double @@ -956,11 +956,7 @@ define void @call_test_stackarg_float() { ; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f) ; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r4 :: (dereferenceable load (s64) from @d) ; 32BIT-NEXT: ADJCALLSTACKDOWN 68, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT-NEXT: STFD renamable $f2, 0, %stack.0 :: (store (s64) into %stack.0) - ; 32BIT-NEXT: STFS renamable $f1, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) - ; 32BIT-NEXT: STW killed renamable $r3, 64, $r1 :: (store (s32) into stack + 64, align 16) - ; 32BIT-NEXT: renamable $r11 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8) + ; 32BIT-NEXT: STFD renamable $f2, 60, $r1 :: (store (s64) into stack + 60, align 4, basealign 16) ; 32BIT-NEXT: $r3 = LI 1 ; 32BIT-NEXT: $r4 = LI 2 ; 32BIT-NEXT: $r5 = LI 3 @@ -969,8 +965,8 @@ define void @call_test_stackarg_float() { ; 32BIT-NEXT: $r8 = LI 6 ; 32BIT-NEXT: $r9 = LI 7 ; 32BIT-NEXT: $r10 = LI 8 - ; 32BIT-NEXT: STW killed renamable $r11, 60, $r1 :: (store (s32) into stack + 60, basealign 16) - ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $f1, implicit $f2, implicit $r2, implicit-def $r1 + ; 32BIT-NEXT: STFS renamable $f1, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16) + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $f1, 
implicit $f2, implicit $r2, implicit-def $r1 ; 32BIT-NEXT: ADJCALLSTACKUP 68, 0, implicit-def dead $r1, implicit $r1 ; 32BIT-NEXT: BLR implicit $lr, implicit $rm ; @@ -1057,11 +1053,7 @@ define void @call_test_stackarg_float3() { ; 32BIT-NEXT: renamable $r10 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8) ; 32BIT-NEXT: renamable $f2 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f) ; 32BIT-NEXT: ADJCALLSTACKDOWN 64, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT-NEXT: STFD renamable $f1, 0, %stack.1 :: (store (s64) into %stack.1) ; 32BIT-NEXT: STFS renamable $f2, 60, $r1 :: (store (s32) into stack + 60, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.1 :: (load (s32) from %stack.1 + 4) - ; 32BIT-NEXT: STW killed renamable $r3, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16) - ; 32BIT-NEXT: renamable $r11 = LWZ 0, %stack.1 :: (load (s32) from %stack.1, align 8) ; 32BIT-NEXT: $r3 = LI 1 ; 32BIT-NEXT: $r4 = LI 2 ; 32BIT-NEXT: $r5 = LI 3 @@ -1069,8 +1061,8 @@ define void @call_test_stackarg_float3() { ; 32BIT-NEXT: $r7 = LI 5 ; 32BIT-NEXT: $r8 = LI 6 ; 32BIT-NEXT: $r9 = LI 7 - ; 32BIT-NEXT: STW killed renamable $r11, 52, $r1 :: (store (s32) into stack + 52, basealign 16) - ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $f1, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1 + ; 32BIT-NEXT: STFD renamable $f1, 52, $r1 :: (store (s64) into stack + 52, align 4, basealign 16) + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $f1, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1 ; 32BIT-NEXT: ADJCALLSTACKUP 64, 0, implicit-def dead $r1, implicit $r1 ; 32BIT-NEXT: BLR implicit $lr, implicit $rm ; @@ -1372,7 +1364,7 @@ define double @test_fpr_stack(double %d1, double %d2, double %d3, double %d4, do ; 32BIT: bb.0.entry: ; 32BIT-NEXT: liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13 ; 32BIT-NEXT: {{ $}} - ; 32BIT-NEXT: renamable $f0 = LFD 0, %fixed-stack.1 :: (load (s64) from %fixed-stack.1) + ; 32BIT-NEXT: renamable $f0 = LFD 0, %fixed-stack.1 :: (load (s64) from %fixed-stack.1, align 4) ; 32BIT-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm ; 32BIT-NEXT: renamable $f2 = LFS 0, %fixed-stack.2 :: (load (s32) from %fixed-stack.2, align 16) ; 32BIT-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm @@ -1449,92 +1441,88 @@ define void @caller_fpr_stack() { ; 32BIT-NEXT: renamable $r3 = LWZtoc @d15, $r2 :: (load (s32) from got) ; 32BIT-NEXT: renamable $r4 = LWZtoc @f14, $r2 :: (load (s32) from got) ; 32BIT-NEXT: renamable $f0 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d15) - ; 32BIT-NEXT: renamable $r3 = LWZtoc @f16, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $r4 = LWZ 0, killed renamable $r4 :: (dereferenceable load (s32) from @f14) - ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r3 :: (dereferenceable load (s32) from @f16) + ; 32BIT-NEXT: renamable $r5 = LWZtoc @f16, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r4 :: (dereferenceable load (s32) from @f14) + ; 32BIT-NEXT: renamable $r4 = LWZ 0, killed renamable $r5 :: 
(dereferenceable load (s32) from @f16) ; 32BIT-NEXT: ADJCALLSTACKDOWN 144, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) ; 32BIT-NEXT: renamable $r5 = LI 0 ; 32BIT-NEXT: renamable $r6 = LIS 16352 - ; 32BIT-NEXT: STW killed renamable $r3, 140, $r1 :: (store (s32) into stack + 140, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LIS 13107 - ; 32BIT-NEXT: STW killed renamable $r4, 128, $r1 :: (store (s32) into stack + 128, align 16) - ; 32BIT-NEXT: renamable $r4 = LIS 16355 ; 32BIT-NEXT: STW killed renamable $r5, 60, $r1 :: (store (s32) into stack + 60, basealign 16) - ; 32BIT-NEXT: renamable $r5 = LIS 26214 + ; 32BIT-NEXT: renamable $r5 = LIS 13107 ; 32BIT-NEXT: STW killed renamable $r6, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LIS 16355 + ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 13107 + ; 32BIT-NEXT: STW killed renamable $r5, 68, $r1 :: (store (s32) into stack + 68, basealign 16) + ; 32BIT-NEXT: renamable $r5 = LIS 26214 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 13107 + ; 32BIT-NEXT: STW killed renamable $r6, 64, $r1 :: (store (s32) into stack + 64, align 16) ; 32BIT-NEXT: renamable $r6 = LIS 16358 - ; 32BIT-NEXT: renamable $r3 = ORI killed renamable $r3, 13107 - ; 32BIT-NEXT: STW killed renamable $r3, 68, $r1 :: (store (s32) into stack + 68, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LIS 39321 - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r4, 13107 - ; 32BIT-NEXT: STW killed renamable $r4, 64, $r1 :: (store (s32) into stack + 64, align 16) - ; 32BIT-NEXT: renamable $r4 = LIS 16361 ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 26214 ; 32BIT-NEXT: STW killed renamable $r5, 76, $r1 :: (store (s32) into stack + 76, basealign 16) - ; 32BIT-NEXT: renamable $r5 = LIS 52428 + ; 32BIT-NEXT: renamable $r5 = LIS 39321 ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 26214 ; 32BIT-NEXT: STW killed renamable $r6, 72, $r1 :: (store (s32) into stack + 72, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LIS 16361 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 39321 + ; 32BIT-NEXT: STW killed renamable $r6, 80, $r1 :: (store (s32) into stack + 80, align 16) + ; 32BIT-NEXT: renamable $r6 = LIS 52428 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 52429 + ; 32BIT-NEXT: STW killed renamable $r6, 92, $r1 :: (store (s32) into stack + 92, basealign 16) ; 32BIT-NEXT: renamable $r6 = LIS 16364 - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r4, 39321 - ; 32BIT-NEXT: STW killed renamable $r4, 80, $r1 :: (store (s32) into stack + 80, align 16) - ; 32BIT-NEXT: renamable $r4 = LIS 16313 - ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 52429 - ; 32BIT-NEXT: STW killed renamable $r5, 92, $r1 :: (store (s32) into stack + 92, basealign 16) - ; 32BIT-NEXT: renamable $r5 = LIS 49807 - ; 32BIT-NEXT: renamable $r3 = ORI killed renamable $r3, 39322 - ; 32BIT-NEXT: STW renamable $r3, 84, $r1 :: (store (s32) into stack + 84, basealign 16) + ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 39322 + ; 32BIT-NEXT: STW renamable $r5, 84, $r1 :: (store (s32) into stack + 84, basealign 16) ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 52428 ; 32BIT-NEXT: STW killed renamable $r6, 88, $r1 :: (store (s32) into stack + 88, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LIS 16313 + ; 32BIT-NEXT: STW killed renamable $r5, 100, $r1 :: (store (s32) into stack + 100, basealign 16) + ; 
32BIT-NEXT: renamable $r5 = LIS 49807 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 39321 + ; 32BIT-NEXT: STW killed renamable $r6, 96, $r1 :: (store (s32) into stack + 96, align 16) ; 32BIT-NEXT: renamable $r6 = LIS 16316 - ; 32BIT-NEXT: STW killed renamable $r3, 100, $r1 :: (store (s32) into stack + 100, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LIS 60293 - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r4, 39321 - ; 32BIT-NEXT: STW killed renamable $r4, 96, $r1 :: (store (s32) into stack + 96, align 16) - ; 32BIT-NEXT: renamable $r4 = LIS 16318 ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 23593 ; 32BIT-NEXT: STW killed renamable $r5, 108, $r1 :: (store (s32) into stack + 108, basealign 16) - ; 32BIT-NEXT: renamable $r5 = LIS 2621 + ; 32BIT-NEXT: renamable $r5 = LIS 60293 ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 10485 ; 32BIT-NEXT: STW killed renamable $r6, 104, $r1 :: (store (s32) into stack + 104, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LIS 16318 + ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 7864 + ; 32BIT-NEXT: STW killed renamable $r5, 116, $r1 :: (store (s32) into stack + 116, basealign 16) + ; 32BIT-NEXT: renamable $r5 = LIS 2621 + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 47185 + ; 32BIT-NEXT: STW killed renamable $r6, 112, $r1 :: (store (s32) into stack + 112, align 16) ; 32BIT-NEXT: renamable $r6 = LIS 16320 - ; 32BIT-NEXT: renamable $r3 = ORI killed renamable $r3, 7864 - ; 32BIT-NEXT: STW killed renamable $r3, 116, $r1 :: (store (s32) into stack + 116, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LWZtoc %const.0, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r4, 47185 - ; 32BIT-NEXT: STW killed renamable $r4, 112, $r1 :: (store (s32) into stack + 112, align 16) - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r5, 28836 - ; 32BIT-NEXT: STW killed renamable $r4, 124, $r1 :: (store (s32) into stack + 124, basealign 16) - ; 32BIT-NEXT: renamable $r4 = ORI killed renamable $r6, 41943 - ; 32BIT-NEXT: STW killed renamable $r4, 120, $r1 :: (store (s32) into stack + 120, align 8, basealign 16) - ; 32BIT-NEXT: renamable $r4 = LWZtoc %const.1, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $r5 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) - ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.2, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r3 = LWZtoc %const.3, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f3 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r4 = LWZtoc %const.4, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f4 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = ORI killed renamable $r5, 28836 + ; 32BIT-NEXT: STW killed renamable $r5, 124, $r1 :: (store (s32) into stack + 124, basealign 16) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.0, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r6 = ORI killed renamable $r6, 41943 + ; 32BIT-NEXT: STW killed renamable $r6, 120, $r1 :: (store (s32) into stack + 120, align 8, basealign 16) + ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.1, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.2, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f3 = LFD 0, killed 
renamable $r6 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.3, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f4 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.4, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f6 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.5, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f6 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r3 = LWZtoc %const.6, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f7 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r4 = LWZtoc %const.7, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f7 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.6, $r2 :: (load (s32) from got) ; 32BIT-NEXT: renamable $f8 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.8, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f9 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r3 = LWZtoc %const.9, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f1 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $r4 = LWZtoc %const.10, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f11 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.7, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f9 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.8, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f1 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.9, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f11 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $r5 = LWZtoc %const.10, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f12 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool) ; 32BIT-NEXT: renamable $r6 = LWZtoc %const.11, $r2 :: (load (s32) from got) - ; 32BIT-NEXT: renamable $f12 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool) - ; 32BIT-NEXT: renamable $f13 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool) + ; 32BIT-NEXT: renamable $f13 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool) ; 32BIT-NEXT: renamable $f5 = LFS 0, killed renamable $r6 :: (load (s32) from constant-pool) - ; 32BIT-NEXT: STW killed renamable $r5, 136, $r1 :: (store (s32) into stack + 136, align 8, basealign 16) - ; 32BIT-NEXT: renamable $r3 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8) + ; 32BIT-NEXT: STW killed renamable $r4, 140, $r1 :: (store (s32) into stack + 140, basealign 16) + ; 32BIT-NEXT: STFD killed renamable $f0, 132, $r1 :: (store (s64) into stack + 132, align 4, basealign 16) ; 32BIT-NEXT: $f10 = COPY renamable $f1 - ; 32BIT-NEXT: STW killed renamable $r3, 132, $r1 :: (store (s32) into stack + 132, basealign 16) + ; 32BIT-NEXT: STW killed renamable $r3, 128, $r1 :: (store (s32) into stack + 128, align 16) ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, 
implicit $f7, implicit $f8, implicit $f9, implicit killed $f10, implicit $f11, implicit $f12, implicit $f13, implicit $r2, implicit-def $r1, implicit-def dead $f1 ; 32BIT-NEXT: ADJCALLSTACKUP 144, 0, implicit-def dead $r1, implicit $r1 ; 32BIT-NEXT: BLR implicit $lr, implicit $rm @@ -1647,7 +1635,7 @@ define i32 @mix_callee(double %d1, double %d2, double %d3, double %d4, i8 zeroex ; 32BIT-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; 32BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 32BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 ; ; 64BIT-LABEL: name: mix_callee @@ -1671,7 +1659,7 @@ define i32 @mix_callee(double %d1, double %d2, double %d3, double %d4, i8 zeroex ; 64BIT-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; 64BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 64BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 entry: %add = fadd double %d1, %d2 @@ -1791,7 +1779,7 @@ define void @caller_mix() { ; 32BIT-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f30, implicit $rm ; 32BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 32BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 ; ; 64BIT-LABEL: name: mix_floats @@ -1826,7 +1814,7 @@ define void @caller_mix() { ; 64BIT-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm ; 64BIT-NEXT: renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm ; 64BIT-NEXT: STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0) - ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8) + ; 64BIT-NEXT: renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4) ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 entry: %add = add nsw i32 %i1, %i2 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll index 03770d22d9f4f..5ed0dfb258f73 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll @@ -1012,22 +1012,18 @@ define void @call_test_stackarg_float() { ; ASM32PWR4-NEXT: lwz 3, L..C8(2) # @f ; ASM32PWR4-NEXT: stw 0, 88(1) ; ASM32PWR4-NEXT: li 4, 2 +; ASM32PWR4-NEXT: li 5, 3 ; ASM32PWR4-NEXT: li 6, 4 ; ASM32PWR4-NEXT: li 7, 5 -; ASM32PWR4-NEXT: li 8, 6 ; ASM32PWR4-NEXT: lfs 1, 0(3) ; ASM32PWR4-NEXT: lwz 3, L..C9(2) # @d +; ASM32PWR4-NEXT: li 8, 6 ; ASM32PWR4-NEXT: li 9, 7 -; ASM32PWR4-NEXT: li 10, 8 ; ASM32PWR4-NEXT: lfd 2, 0(3) ; ASM32PWR4-NEXT: li 3, 1 -; ASM32PWR4-NEXT: stfd 2, 72(1) -; ASM32PWR4-NEXT: lwz 5, 76(1) -; ASM32PWR4-NEXT: lwz 11, 
72(1) -; ASM32PWR4-NEXT: stw 5, 64(1) -; ASM32PWR4-NEXT: li 5, 3 +; ASM32PWR4-NEXT: li 10, 8 +; ASM32PWR4-NEXT: stfd 2, 60(1) ; ASM32PWR4-NEXT: stfs 1, 56(1) -; ASM32PWR4-NEXT: stw 11, 60(1) ; ASM32PWR4-NEXT: bl .test_stackarg_float[PR] ; ASM32PWR4-NEXT: nop ; ASM32PWR4-NEXT: addi 1, 1, 80 @@ -1130,24 +1126,20 @@ define void @call_test_stackarg_float3() { ; ASM32PWR4-NEXT: stwu 1, -80(1) ; ASM32PWR4-NEXT: lwz 3, L..C9(2) # @d ; ASM32PWR4-NEXT: stw 0, 88(1) +; ASM32PWR4-NEXT: li 4, 2 ; ASM32PWR4-NEXT: li 5, 3 ; ASM32PWR4-NEXT: li 6, 4 ; ASM32PWR4-NEXT: li 7, 5 -; ASM32PWR4-NEXT: li 8, 6 ; ASM32PWR4-NEXT: lfd 1, 0(3) ; ASM32PWR4-NEXT: lwz 3, L..C8(2) # @f +; ASM32PWR4-NEXT: li 8, 6 ; ASM32PWR4-NEXT: li 9, 7 ; ASM32PWR4-NEXT: stfd 1, 72(1) +; ASM32PWR4-NEXT: lwz 10, 72(1) ; ASM32PWR4-NEXT: lfs 2, 0(3) ; ASM32PWR4-NEXT: li 3, 1 -; ASM32PWR4-NEXT: stfd 1, 64(1) -; ASM32PWR4-NEXT: lwz 4, 68(1) -; ASM32PWR4-NEXT: lwz 10, 72(1) -; ASM32PWR4-NEXT: lwz 11, 64(1) -; ASM32PWR4-NEXT: stw 4, 56(1) -; ASM32PWR4-NEXT: li 4, 2 ; ASM32PWR4-NEXT: stfs 2, 60(1) -; ASM32PWR4-NEXT: stw 11, 52(1) +; ASM32PWR4-NEXT: stfd 1, 52(1) ; ASM32PWR4-NEXT: bl .test_stackarg_float3[PR] ; ASM32PWR4-NEXT: nop ; ASM32PWR4-NEXT: addi 1, 1, 80 @@ -1570,99 +1562,95 @@ define void @caller_fpr_stack() { ; ASM32PWR4-LABEL: caller_fpr_stack: ; ASM32PWR4: # %bb.0: # %entry ; ASM32PWR4-NEXT: mflr 0 -; ASM32PWR4-NEXT: stwu 1, -160(1) +; ASM32PWR4-NEXT: stwu 1, -144(1) ; ASM32PWR4-NEXT: lwz 3, L..C19(2) # @d15 -; ASM32PWR4-NEXT: stw 0, 168(1) -; ASM32PWR4-NEXT: lwz 5, L..C20(2) # %const.1 -; ASM32PWR4-NEXT: lwz 4, L..C21(2) # @f14 +; ASM32PWR4-NEXT: lwz 4, L..C20(2) # @f14 +; ASM32PWR4-NEXT: lwz 5, L..C21(2) # @f16 +; ASM32PWR4-NEXT: stw 0, 152(1) +; ASM32PWR4-NEXT: lis 6, 16361 +; ASM32PWR4-NEXT: ori 6, 6, 39321 ; ASM32PWR4-NEXT: lfd 0, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C22(2) # @f16 -; ASM32PWR4-NEXT: lwz 3, 0(3) -; ASM32PWR4-NEXT: stw 3, 140(1) -; ASM32PWR4-NEXT: li 3, 0 -; ASM32PWR4-NEXT: stw 3, 60(1) -; ASM32PWR4-NEXT: lis 3, 16352 -; ASM32PWR4-NEXT: stw 3, 56(1) -; ASM32PWR4-NEXT: lis 3, 13107 -; ASM32PWR4-NEXT: ori 3, 3, 13107 -; ASM32PWR4-NEXT: stw 3, 68(1) -; ASM32PWR4-NEXT: lis 3, 16355 -; ASM32PWR4-NEXT: ori 3, 3, 13107 -; ASM32PWR4-NEXT: stw 3, 64(1) -; ASM32PWR4-NEXT: lis 3, 26214 -; ASM32PWR4-NEXT: ori 3, 3, 26214 -; ASM32PWR4-NEXT: stw 3, 76(1) -; ASM32PWR4-NEXT: lis 3, 16358 -; ASM32PWR4-NEXT: ori 3, 3, 26214 -; ASM32PWR4-NEXT: stw 3, 72(1) -; ASM32PWR4-NEXT: lis 3, -26215 -; ASM32PWR4-NEXT: ori 3, 3, 39322 -; ASM32PWR4-NEXT: stw 3, 84(1) -; ASM32PWR4-NEXT: stw 3, 100(1) -; ASM32PWR4-NEXT: lis 3, 16313 -; ASM32PWR4-NEXT: ori 3, 3, 39321 -; ASM32PWR4-NEXT: stw 3, 96(1) -; ASM32PWR4-NEXT: lis 3, -15729 -; ASM32PWR4-NEXT: ori 3, 3, 23593 -; ASM32PWR4-NEXT: stw 3, 108(1) -; ASM32PWR4-NEXT: lis 3, 16316 -; ASM32PWR4-NEXT: ori 3, 3, 10485 -; ASM32PWR4-NEXT: stw 3, 104(1) -; ASM32PWR4-NEXT: lis 3, -5243 -; ASM32PWR4-NEXT: ori 3, 3, 7864 -; ASM32PWR4-NEXT: stw 3, 116(1) -; ASM32PWR4-NEXT: lis 3, 16318 -; ASM32PWR4-NEXT: ori 3, 3, 47185 -; ASM32PWR4-NEXT: stw 3, 112(1) -; ASM32PWR4-NEXT: lis 3, 2621 -; ASM32PWR4-NEXT: ori 3, 3, 28836 -; ASM32PWR4-NEXT: stw 3, 124(1) -; ASM32PWR4-NEXT: lis 3, 16320 -; ASM32PWR4-NEXT: ori 3, 3, 41943 -; ASM32PWR4-NEXT: stw 3, 120(1) -; ASM32PWR4-NEXT: lwz 3, L..C23(2) # %const.0 -; ASM32PWR4-NEXT: lfd 2, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C24(2) # %const.2 +; ASM32PWR4-NEXT: lwz 3, 0(4) +; ASM32PWR4-NEXT: lwz 4, 0(5) +; ASM32PWR4-NEXT: li 5, 0 +; ASM32PWR4-NEXT: stw 5, 60(1) +; ASM32PWR4-NEXT: lis 
5, 16352 +; ASM32PWR4-NEXT: stw 5, 56(1) +; ASM32PWR4-NEXT: lis 5, 13107 +; ASM32PWR4-NEXT: ori 5, 5, 13107 +; ASM32PWR4-NEXT: stw 5, 68(1) +; ASM32PWR4-NEXT: lis 5, 16355 +; ASM32PWR4-NEXT: ori 5, 5, 13107 +; ASM32PWR4-NEXT: stw 5, 64(1) +; ASM32PWR4-NEXT: lis 5, 26214 +; ASM32PWR4-NEXT: ori 5, 5, 26214 +; ASM32PWR4-NEXT: stw 5, 76(1) +; ASM32PWR4-NEXT: lis 5, 16358 +; ASM32PWR4-NEXT: ori 5, 5, 26214 +; ASM32PWR4-NEXT: stw 5, 72(1) +; ASM32PWR4-NEXT: lis 5, -26215 +; ASM32PWR4-NEXT: ori 5, 5, 39322 +; ASM32PWR4-NEXT: stw 5, 84(1) +; ASM32PWR4-NEXT: stw 5, 100(1) +; ASM32PWR4-NEXT: lis 5, 16313 +; ASM32PWR4-NEXT: ori 5, 5, 39321 +; ASM32PWR4-NEXT: stw 5, 96(1) +; ASM32PWR4-NEXT: lis 5, -15729 +; ASM32PWR4-NEXT: ori 5, 5, 23593 +; ASM32PWR4-NEXT: stw 5, 108(1) +; ASM32PWR4-NEXT: lis 5, 16316 +; ASM32PWR4-NEXT: ori 5, 5, 10485 +; ASM32PWR4-NEXT: stw 5, 104(1) +; ASM32PWR4-NEXT: lis 5, -5243 +; ASM32PWR4-NEXT: ori 5, 5, 7864 +; ASM32PWR4-NEXT: stw 5, 116(1) +; ASM32PWR4-NEXT: lis 5, 16318 +; ASM32PWR4-NEXT: ori 5, 5, 47185 +; ASM32PWR4-NEXT: stw 6, 80(1) +; ASM32PWR4-NEXT: lis 6, -13108 +; ASM32PWR4-NEXT: ori 6, 6, 52429 +; ASM32PWR4-NEXT: stw 5, 112(1) +; ASM32PWR4-NEXT: lis 5, 2621 +; ASM32PWR4-NEXT: ori 5, 5, 28836 +; ASM32PWR4-NEXT: stw 6, 92(1) +; ASM32PWR4-NEXT: lis 6, 16364 +; ASM32PWR4-NEXT: ori 6, 6, 52428 +; ASM32PWR4-NEXT: stw 5, 124(1) +; ASM32PWR4-NEXT: lis 5, 16320 +; ASM32PWR4-NEXT: ori 5, 5, 41943 +; ASM32PWR4-NEXT: stw 6, 88(1) +; ASM32PWR4-NEXT: lwz 6, L..C22(2) # %const.0 +; ASM32PWR4-NEXT: stw 5, 120(1) +; ASM32PWR4-NEXT: lwz 5, L..C23(2) # %const.1 +; ASM32PWR4-NEXT: lfd 2, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C24(2) # %const.2 ; ASM32PWR4-NEXT: lfd 3, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C25(2) # %const.3 -; ASM32PWR4-NEXT: lfd 4, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C26(2) # %const.4 +; ASM32PWR4-NEXT: lfd 4, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C26(2) # %const.4 ; ASM32PWR4-NEXT: lfd 6, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C27(2) # %const.5 -; ASM32PWR4-NEXT: lwz 4, 0(4) -; ASM32PWR4-NEXT: lfd 7, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C28(2) # %const.6 +; ASM32PWR4-NEXT: lfd 7, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C28(2) # %const.6 ; ASM32PWR4-NEXT: lfd 8, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C29(2) # %const.7 -; ASM32PWR4-NEXT: stw 4, 128(1) -; ASM32PWR4-NEXT: lis 4, 16361 -; ASM32PWR4-NEXT: ori 4, 4, 39321 -; ASM32PWR4-NEXT: lfd 9, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C30(2) # %const.8 +; ASM32PWR4-NEXT: lfd 9, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C30(2) # %const.8 ; ASM32PWR4-NEXT: lfd 1, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C31(2) # %const.9 -; ASM32PWR4-NEXT: stw 4, 80(1) -; ASM32PWR4-NEXT: lis 4, -13108 +; ASM32PWR4-NEXT: lfd 11, 0(6) +; ASM32PWR4-NEXT: lwz 6, L..C32(2) # %const.10 ; ASM32PWR4-NEXT: fmr 10, 1 -; ASM32PWR4-NEXT: ori 4, 4, 52429 -; ASM32PWR4-NEXT: lfd 11, 0(3) -; ASM32PWR4-NEXT: lwz 3, L..C32(2) # %const.10 ; ASM32PWR4-NEXT: lfd 12, 0(5) ; ASM32PWR4-NEXT: lwz 5, L..C33(2) # %const.11 -; ASM32PWR4-NEXT: stw 4, 92(1) -; ASM32PWR4-NEXT: lis 4, 16364 -; ASM32PWR4-NEXT: ori 4, 4, 52428 -; ASM32PWR4-NEXT: stfd 0, 152(1) -; ASM32PWR4-NEXT: stw 4, 88(1) -; ASM32PWR4-NEXT: lwz 4, 156(1) -; ASM32PWR4-NEXT: lfd 13, 0(3) +; ASM32PWR4-NEXT: lfd 13, 0(6) ; ASM32PWR4-NEXT: lfs 5, 0(5) -; ASM32PWR4-NEXT: lwz 3, 152(1) -; ASM32PWR4-NEXT: stw 4, 136(1) -; ASM32PWR4-NEXT: stw 3, 132(1) +; ASM32PWR4-NEXT: stfd 0, 132(1) +; ASM32PWR4-NEXT: stw 4, 140(1) +; ASM32PWR4-NEXT: stw 3, 128(1) ; ASM32PWR4-NEXT: bl .test_fpr_stack ; ASM32PWR4-NEXT: nop -; ASM32PWR4-NEXT: addi 1, 1, 160 +; ASM32PWR4-NEXT: addi 1, 1, 
144 ; ASM32PWR4-NEXT: lwz 0, 8(1) ; ASM32PWR4-NEXT: mtlr 0 ; ASM32PWR4-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll index 2827155dc1845..8c4f81b65144e 100644 --- a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll +++ b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll @@ -160,7 +160,7 @@ entry: ; CHECK-ASM-LABEL: .main:{{[[:space:]] *}}# %bb.0: ; CHECK-FUNC-LABEL: .csect .main[PR],5{{[[:space:]] *}}# %bb.0 ; COMMON-NEXT: mflr 0 -; COMMON: stw 0, 168(1) +; COMMON: stw 0, 152(1) ; COMMON: mtlr 0 ; COMMON-NEXT: blr ; COMMON-NEXT: L..main0: diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-data.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-data.ll index 10b04b570fa32..c9890a679b4d2 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-data.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-data.ll @@ -29,8 +29,8 @@ %struct.anon = type <{ i32, double }> @astruct = global [1 x %struct.anon] [%struct.anon <{ i32 1, double 7.000000e+00 }>], align 1 -%struct.anon2 = type { double, i32 } -@bstruct = global [1 x %struct.anon2] [%struct.anon2 { double 7.000000e+00 , i32 1}], align 8 +%struct.anon2 = type { double, i32, [4 x i8] } +@bstruct = global [1 x %struct.anon2] [%struct.anon2 { double 7.000000e+00 , i32 1, [4 x i8] undef }], align 8 @a = common global i32 0, align 4 @b = common global i64 0, align 8 diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll index 682c2b7afe34d..7218c814b30b8 100644 --- a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll @@ -114,24 +114,18 @@ define double @double_va_arg(double %a, ...) local_unnamed_addr { ; CHECK: bb.0.entry: ; CHECK-NEXT: liveins: $f1, $r5, $r6, $r7, $r8, $r9, $r10 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $r3 = ADDI %fixed-stack.0, 0 + ; CHECK-NEXT: STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16) + ; CHECK-NEXT: STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4) ; CHECK-NEXT: STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8) - ; CHECK-NEXT: STW renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16) - ; CHECK-NEXT: STW renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4) ; CHECK-NEXT: STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12) + ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64) from %ir.argp.cur2, align 16) ; CHECK-NEXT: STW killed renamable $r9, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 16) ; CHECK-NEXT: STW killed renamable $r10, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20) - ; CHECK-NEXT: STW renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1) - ; CHECK-NEXT: STW killed renamable $r3, 0, %stack.1.arg2 :: (store (s32) into %ir.arg2) - ; CHECK-NEXT: STW renamable $r5, 0, %stack.2 :: (store (s32) into %stack.2, align 8) - ; CHECK-NEXT: STW renamable $r6, 4, %stack.2 :: (store (s32) into %stack.2 + 4) - ; CHECK-NEXT: renamable $f0 = LFD 0, %stack.2 :: (load (s64) from %stack.2) - ; CHECK-NEXT: STW killed renamable $r5, 0, %stack.3 :: (store (s32) into %stack.3, align 8) - ; CHECK-NEXT: STW killed renamable $r6, 4, %stack.3 :: (store (s32) into %stack.3 + 4) - ; CHECK-NEXT: renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3) - ; CHECK-NEXT: renamable $f0 = 
nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm - ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm - ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD renamable $f0, killed renamable $f1, implicit $rm + ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, renamable $f0, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm + ; CHECK-NEXT: renamable $r3 = ADDI %fixed-stack.0, 0 + ; CHECK-NEXT: STW killed renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1) ; CHECK-NEXT: BLR implicit $lr, implicit $rm, implicit $f1 entry: %arg1 = alloca ptr, align 4 @@ -163,31 +157,24 @@ define double @double_stack_va_arg(double %one, double %two, double %three, doub ; CHECK: bb.0.entry: ; CHECK-NEXT: liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64) from %ir.argp.cur142, align 16) + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f5, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f6, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f7, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f8, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f9, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f10, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f11, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f12, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f13, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, renamable $f0, implicit $rm + ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, renamable $f0, implicit $rm + ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm ; CHECK-NEXT: renamable $r3 = ADDI %fixed-stack.0, 0 ; CHECK-NEXT: STW killed renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1) - ; CHECK-NEXT: renamable $r3 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142, align 16) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm - ; CHECK-NEXT: STW renamable $r3, 0, %stack.2 :: (store (s32) into %stack.2, align 8) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f4, implicit $rm - ; CHECK-NEXT: renamable $r4 = LWZ 4, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142 + 4) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f5, 
implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f6, implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f7, implicit $rm - ; CHECK-NEXT: STW renamable $r4, 4, %stack.2 :: (store (s32) into %stack.2 + 4) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f8, implicit $rm - ; CHECK-NEXT: renamable $f1 = LFD 0, %stack.2 :: (load (s64) from %stack.2) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f9, implicit $rm - ; CHECK-NEXT: STW killed renamable $r3, 0, %stack.3 :: (store (s32) into %stack.3, align 8) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f10, implicit $rm - ; CHECK-NEXT: STW killed renamable $r4, 4, %stack.3 :: (store (s32) into %stack.3 + 4) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f11, implicit $rm - ; CHECK-NEXT: renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3) - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f12, implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f13, implicit $rm - ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm - ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm - ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; CHECK-NEXT: BLR implicit $lr, implicit $rm, implicit $f1 entry: %arg1 = alloca ptr, align 4 diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll index 9cf1e45607042..30727b8d4fe94 100644 --- a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll +++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll @@ -108,24 +108,18 @@ entry: define double @double_va_arg(double %a, ...) local_unnamed_addr { ; CHECK-LABEL: double_va_arg: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 5, -16(1) -; CHECK-NEXT: addi 3, 1, 32 -; CHECK-NEXT: stw 6, -12(1) -; CHECK-NEXT: lfd 0, -16(1) -; CHECK-NEXT: stw 5, -24(1) -; CHECK-NEXT: fadd 0, 0, 1 -; CHECK-NEXT: stw 6, -20(1) -; CHECK-NEXT: lfd 1, -24(1) -; CHECK-NEXT: fadd 1, 1, 1 -; CHECK-NEXT: stw 7, 40(1) -; CHECK-NEXT: fadd 1, 0, 1 ; CHECK-NEXT: stw 5, 32(1) +; CHECK-NEXT: addi 3, 1, 32 ; CHECK-NEXT: stw 6, 36(1) +; CHECK-NEXT: lfd 0, 32(1) +; CHECK-NEXT: fadd 1, 0, 1 +; CHECK-NEXT: fadd 0, 0, 0 +; CHECK-NEXT: stw 7, 40(1) ; CHECK-NEXT: stw 8, 44(1) +; CHECK-NEXT: fadd 1, 1, 0 ; CHECK-NEXT: stw 9, 48(1) ; CHECK-NEXT: stw 10, 52(1) ; CHECK-NEXT: stw 3, -4(1) -; CHECK-NEXT: stw 3, -8(1) ; CHECK-NEXT: blr entry: %arg1 = alloca ptr, align 4 @@ -155,31 +149,24 @@ entry: define double @double_stack_va_arg(double %one, double %two, double %three, double %four, double %five, double %six, double %seven, double %eight, double %nine, double %ten, double %eleven, double %twelve, double %thirteen, ...) 
local_unnamed_addr { ; CHECK-LABEL: double_stack_va_arg: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fadd 0, 1, 2 +; CHECK-NEXT: fadd 1, 1, 2 +; CHECK-NEXT: lfd 0, 128(1) ; CHECK-NEXT: addi 3, 1, 128 -; CHECK-NEXT: lwz 4, 132(1) -; CHECK-NEXT: fadd 0, 0, 3 +; CHECK-NEXT: fadd 1, 1, 3 ; CHECK-NEXT: stw 3, -4(1) -; CHECK-NEXT: fadd 0, 0, 4 -; CHECK-NEXT: lwz 3, 128(1) -; CHECK-NEXT: fadd 0, 0, 5 -; CHECK-NEXT: stw 3, -16(1) -; CHECK-NEXT: fadd 0, 0, 6 -; CHECK-NEXT: stw 4, -12(1) -; CHECK-NEXT: fadd 0, 0, 7 -; CHECK-NEXT: lfd 1, -16(1) -; CHECK-NEXT: fadd 0, 0, 8 -; CHECK-NEXT: stw 3, -24(1) -; CHECK-NEXT: fadd 0, 0, 9 -; CHECK-NEXT: stw 4, -20(1) -; CHECK-NEXT: fadd 0, 0, 10 -; CHECK-NEXT: fadd 0, 0, 11 -; CHECK-NEXT: fadd 0, 0, 12 -; CHECK-NEXT: fadd 0, 0, 13 -; CHECK-NEXT: fadd 0, 0, 1 -; CHECK-NEXT: lfd 1, -24(1) -; CHECK-NEXT: fadd 1, 1, 1 -; CHECK-NEXT: fadd 1, 0, 1 +; CHECK-NEXT: fadd 1, 1, 4 +; CHECK-NEXT: fadd 1, 1, 5 +; CHECK-NEXT: fadd 1, 1, 6 +; CHECK-NEXT: fadd 1, 1, 7 +; CHECK-NEXT: fadd 1, 1, 8 +; CHECK-NEXT: fadd 1, 1, 9 +; CHECK-NEXT: fadd 1, 1, 10 +; CHECK-NEXT: fadd 1, 1, 11 +; CHECK-NEXT: fadd 1, 1, 12 +; CHECK-NEXT: fadd 1, 1, 13 +; CHECK-NEXT: fadd 1, 1, 0 +; CHECK-NEXT: fadd 0, 0, 0 +; CHECK-NEXT: fadd 1, 1, 0 ; CHECK-NEXT: blr entry: %arg1 = alloca ptr, align 4 diff --git a/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll index dc62e18378e72..af13552ed5949 100644 --- a/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll @@ -113,10 +113,10 @@ define double @double_va_arg(double %a, ...) local_unnamed_addr { ; CHECK-NEXT: renamable $x5 = ADDI8 %fixed-stack.0, 8 ; CHECK-NEXT: STD killed renamable $x3, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1) ; CHECK-NEXT: STD killed renamable $x5, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1) - ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64)) + ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64) from %fixed-stack.0) ; CHECK-NEXT: renamable $x3 = ADDI8 renamable $x4, 8 ; CHECK-NEXT: STD killed renamable $x3, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2) - ; CHECK-NEXT: renamable $f2 = LFD 0, killed renamable $x4 :: (load (s64)) + ; CHECK-NEXT: renamable $f2 = LFD 0, killed renamable $x4 :: (load (s64), align 4) ; CHECK-NEXT: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm @@ -145,7 +145,7 @@ define double @double_stack_va_arg(double %one, double %two, double %three, doub ; CHECK: bb.0.entry: ; CHECK-NEXT: liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64)) + ; CHECK-NEXT: renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64) from %fixed-stack.0) ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm ; CHECK-NEXT: renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp index 57e15a48c0bff..898141cbcf978 100644 --- 
a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp +++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp @@ -105,7 +105,7 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) { "E-m:e-Fn32-i64:64-i128:128-n32:64"); EXPECT_EQ( UpgradeDataLayoutString("E-m:a-Fi64-i64:64-n32:64", "powerpc64-ibm-aix"), - "E-m:a-Fi64-i64:64-i128:128-n32:64"); + "E-m:a-Fi64-i64:64-i128:128-n32:64-f64:32:64"); // Check that WebAssembly targets add -i128:128. EXPECT_EQ( @@ -189,6 +189,16 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) { "E-m:e-Fn32-i64:64-n32"); EXPECT_EQ(UpgradeDataLayoutString("E-m:a-Fi64-i64:64-n32", "powerpc-aix"), "E-m:a-Fi64-i64:64-n32"); + + EXPECT_EQ(UpgradeDataLayoutString("E-m:a-p:32:32-Fi32-i64:64-n32", + "powerpc-unknown-aix"), + "E-m:a-p:32:32-Fi32-i64:64-n32-f64:32:64"); + EXPECT_EQ( + UpgradeDataLayoutString( + "E-m:a-Fi64-i64:64-i128:128-n32:64-S128-v256:256:256-v512:512:512", + "powerpc64-unknown-aix"), + "E-m:a-Fi64-i64:64-i128:128-n32:64-f64:32:64-S128-v256:256:256-v512:512:" + "512"); } TEST(DataLayoutUpgradeTest, EmptyDataLayout) { From 6ad0c7c6f1802abdde55f2c94e98a5fb445e0d3d Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Thu, 11 Dec 2025 15:58:06 +0800 Subject: [PATCH 27/49] [NFC][RISCV] Unify all zvfbfa vl patterns and sd node patterns (#171072) This patch try to move all vl patterns and sd node patterns to RISCVInstrInfoVVLPatterns.td and RISCVInstrInfoVSDPatterns.td respectively. It removes redefinition of pattern classes for zvfbfa and make it easier to maintain and change. Note: this does not include intrinsic patterns, if we want to also unify intrinsic patterns we need to also move pseudo instruction definitions of zvfbfa to RISCVInstrInfoVPseudos.td. --- llvm/lib/Target/RISCV/RISCVFeatures.td | 1 + llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 3 + .../Target/RISCV/RISCVInstrInfoVPseudos.td | 44 ++-- .../Target/RISCV/RISCVInstrInfoVSDPatterns.td | 106 ++++++--- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 178 +++++++++++--- llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td | 223 ------------------ 6 files changed, 245 insertions(+), 310 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 1a5bb837a4318..39228a11e1309 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -908,6 +908,7 @@ def HasVInstructionsF16Minimal : Predicate<"Subtarget->hasVInstructionsF16Minima def HasVInstructionsBF16Minimal : Predicate<"Subtarget->hasVInstructionsBF16Minimal()">; def HasVInstructionsF16 : Predicate<"Subtarget->hasVInstructionsF16()">; +def HasVInstructionsBF16 : Predicate<"Subtarget->hasVInstructionsBF16()">; def HasVInstructionsF64 : Predicate<"Subtarget->hasVInstructionsF64()">; def HasVInstructionsFullMultiply : Predicate<"Subtarget->hasVInstructionsFullMultiply()">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 594a75a4746d4..9354b63bced53 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -1840,3 +1840,6 @@ let Predicates = [HasVInstructionsI64, IsRV64] in { include "RISCVInstrInfoVPseudos.td" include "RISCVInstrInfoZvfbf.td" +// Include the non-intrinsic ISel patterns +include "RISCVInstrInfoVVLPatterns.td" +include "RISCVInstrInfoVSDPatterns.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index e36204c536c0d..cdbeb0c1046d2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -473,17 +473,27 @@ defset list AllWidenableIntVectors = { def : VTypeInfoToWide; } -defset list AllWidenableFloatVectors = { - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; +defset list AllWidenableFloatAndBF16Vectors = { + defset list AllWidenableFloatVectors = { + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + } + + defset list AllWidenableBF16ToFloatVectors = { + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + def : VTypeInfoToWide; + } } defset list AllFractionableVF2IntVectors = { @@ -543,14 +553,6 @@ defset list AllWidenableIntToFloatVectors = { def : VTypeInfoToWide; } -defset list AllWidenableBF16ToFloatVectors = { - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; - def : VTypeInfoToWide; -} - // This class holds the record of the RISCVVPseudoTable below. // This represents the information we need in codegen for each pseudo. // The definition should be consistent with `struct PseudoInfo` in @@ -780,7 +782,7 @@ class GetVRegNoV0 { class GetVTypePredicates { list Predicates = !cond(!eq(vti.Scalar, f16) : [HasVInstructionsF16], - !eq(vti.Scalar, bf16) : [HasVInstructionsBF16Minimal], + !eq(vti.Scalar, bf16) : [HasVInstructionsBF16], !eq(vti.Scalar, f32) : [HasVInstructionsAnyF], !eq(vti.Scalar, f64) : [HasVInstructionsF64], !eq(vti.SEW, 64) : [HasVInstructionsI64], @@ -7326,7 +7328,3 @@ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16", // 16.5. 
Vector Compress Instruction //===----------------------------------------------------------------------===// defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllVectors>; - -// Include the non-intrinsic ISel patterns -include "RISCVInstrInfoVVLPatterns.td" -include "RISCVInstrInfoVSDPatterns.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index a67112b9981b8..14ad7ca0eb35a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -215,13 +215,17 @@ multiclass VPatBinaryFPSDNode_VV_VF { - foreach vti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in { + list vtilist = AllFloatVectors, + bit isSEWAware = 0> { + foreach vti = vtilist in { let Predicates = GetVTypePredicates.Predicates in { - def : VPatBinarySDNode_VV_RM; - def : VPatBinarySDNode_VF_RM; @@ -246,14 +250,17 @@ multiclass VPatBinaryFPSDNode_R_VF { - foreach fvti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in + list vtilist = AllFloatVectors, + bit isSEWAware = 0> { + foreach fvti = vtilist in let Predicates = GetVTypePredicates.Predicates in def : Pat<(fvti.Vector (vop (fvti.Vector (SplatFPOp fvti.Scalar:$rs2)), (fvti.Vector fvti.RegClass:$rs1))), (!cast( !if(isSEWAware, - instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW, + instruction_name# + !if(!eq(fvti.Scalar, bf16), "_ALT", "")# + "_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW, instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)) (fvti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, @@ -664,11 +671,10 @@ multiclass VPatWidenFPMulAccSDNode_VV_VF_RM.Predicates, - GetVTypePredicates.Predicates, + let Predicates = !listconcat(GetVTypePredicates.Predicates, !if(!eq(vti.Scalar, bf16), [HasStdExtZvfbfwma], - [])) in { + GetVTypePredicates.Predicates)) in { def : Pat<(fma (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), (XLenVT srcvalue))), @@ -676,7 +682,9 @@ multiclass VPatWidenFPMulAccSDNode_VV_VF_RM(instruction_name#"_VV_"#suffix) + (!cast(instruction_name# + !if(!eq(vti.Scalar, bf16), "BF16", "")# + "_VV_"#suffix) wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR @@ -688,7 +696,9 @@ multiclass VPatWidenFPMulAccSDNode_VV_VF_RM(instruction_name#"_V"#vti.ScalarSuffix#"_"#suffix) + (!cast(instruction_name# + !if(!eq(vti.Scalar, bf16), "BF16", "")# + "_V"#vti.ScalarSuffix#"_"#suffix) wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR @@ -1201,16 +1211,20 @@ foreach mti = AllMasks in { // 13. Vector Floating-Point Instructions // 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions -defm : VPatBinaryFPSDNode_VV_VF_RM; -defm : VPatBinaryFPSDNode_VV_VF_RM; -defm : VPatBinaryFPSDNode_R_VF_RM; +defm : VPatBinaryFPSDNode_VV_VF_RM; +defm : VPatBinaryFPSDNode_VV_VF_RM; +defm : VPatBinaryFPSDNode_R_VF_RM; // 13.3. Vector Widening Floating-Point Add/Subtract Instructions defm : VPatWidenBinaryFPSDNode_VV_VF_WV_WF_RM; defm : VPatWidenBinaryFPSDNode_VV_VF_WV_WF_RM; // 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions -defm : VPatBinaryFPSDNode_VV_VF_RM; +defm : VPatBinaryFPSDNode_VV_VF_RM; defm : VPatBinaryFPSDNode_VV_VF_RM; defm : VPatBinaryFPSDNode_R_VF_RM; @@ -1314,14 +1328,15 @@ foreach fvti = AllFloatVectors in { // 13.7. 
Vector Widening Floating-Point Fused Multiply-Add Instructions defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACC", - AllWidenableFloatVectors>; + AllWidenableFloatAndBF16Vectors>; defm : VPatWidenFPNegMulAccSDNode_VV_VF_RM<"PseudoVFWNMACC">; defm : VPatWidenFPMulSacSDNode_VV_VF_RM<"PseudoVFWMSAC">; defm : VPatWidenFPNegMulSacSDNode_VV_VF_RM<"PseudoVFWNMSAC">; -foreach vti = AllFloatVectors in { +foreach vti = AllFloatAndBF16Vectors in { let Predicates = GetVTypePredicates.Predicates in { // 13.8. Vector Floating-Point Square-Root Instruction + if !ne(vti.Scalar, bf16) then def : Pat<(any_fsqrt (vti.Vector vti.RegClass:$rs2)), (!cast("PseudoVFSQRT_V_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), @@ -1333,34 +1348,46 @@ foreach vti = AllFloatVectors in { // 13.12. Vector Floating-Point Sign-Injection Instructions def : Pat<(fabs (vti.Vector vti.RegClass:$rs)), - (!cast("PseudoVFSGNJX_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJX"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; // Handle fneg with VFSGNJN using the same input for both operands. def : Pat<(fneg (vti.Vector vti.RegClass:$rs)), - (!cast("PseudoVFSGNJN_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2))), - (!cast("PseudoVFSGNJ_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJ"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs2)))), - (!cast("PseudoVFSGNJ_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJ"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), (vti.Vector (fneg vti.RegClass:$rs2)))), - (!cast("PseudoVFSGNJN_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), (vti.Vector (fneg (SplatFPOp vti.ScalarRegClass:$rs2))))), - (!cast("PseudoVFSGNJN_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; } @@ -1446,13 +1473,28 @@ defm : VPatNConvertFP2ISDNode_W; defm : VPatNConvertFP2ISDNode_W; defm : VPatNConvertI2FPSDNode_W_RM; defm : VPatNConvertI2FPSDNode_W_RM; -foreach fvtiToFWti = AllWidenableFloatVectors in { +foreach fvtiToFWti = AllWidenableFloatAndBF16Vectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; - let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, - GetVTypeMinimalPredicates.Predicates) in + let Predicates = 
!listconcat(GetVTypeMinimalPredicates.Predicates, + !if(!eq(fvti.Scalar, bf16), + [HasStdExtZvfbfmin], + GetVTypeMinimalPredicates.Predicates)) in + def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), + (!cast("PseudoVFNCVT"# + !if(!eq(fvti.Scalar, bf16), "BF16", "")# + "_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW) + (fvti.Vector (IMPLICIT_DEF)), + fwti.RegClass:$rs1, + // Value to indicate no rounding mode change in + // RISCVInsertReadWriteCSR + FRM_DYN, + fvti.AVL, fvti.Log2SEW, TA_MA)>; + // Define vfncvt.f.f.w for bf16 when Zvfbfa is enabled. + if !eq(fvti.Scalar, bf16) then + let Predicates = [HasVInstructionsBF16] in def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), - (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW) + (!cast("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW) (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, // Value to indicate no rounding mode change in @@ -1464,10 +1506,10 @@ foreach fvtiToFWti = AllWidenableFloatVectors in { //===----------------------------------------------------------------------===// // Vector Element Extracts //===----------------------------------------------------------------------===// -foreach vti = NoGroupFloatVectors in { - defvar vfmv_f_s_inst = !cast(!strconcat("PseudoVFMV_", - vti.ScalarSuffix, - "_S")); +foreach vti = !listconcat(NoGroupFloatVectors, NoGroupBF16Vectors) in { + defvar vfmv_f_s_inst = + !cast(!strconcat("PseudoVFMV_", vti.ScalarSuffix, + "_S", !if(!eq(vti.Scalar, bf16), "_ALT", ""))); // Only pattern-match extract-element operations where the index is 0. Any // other index will have been custom-lowered to slide the vector correctly // into place. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 38edab5400291..9273ce094eb0a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -1058,14 +1058,18 @@ multiclass VPatBinaryFPVL_VV_VF { - foreach vti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in { + list vtilist = AllFloatVectors, + bit isSEWAware = 0> { + foreach vti = vtilist in { let Predicates = GetVTypePredicates.Predicates in { - def : VPatBinaryVL_V_RM; - def : VPatBinaryVL_VF_RM; @@ -1093,8 +1097,9 @@ multiclass VPatBinaryFPVL_R_VF { - foreach fvti = !if(isBF16, AllBF16Vectors, AllFloatVectors) in { + list vtilist = AllFloatVectors, + bit isSEWAware = 0> { + foreach fvti = vtilist in { let Predicates = GetVTypePredicates.Predicates in def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2), fvti.RegClass:$rs1, @@ -1103,7 +1108,9 @@ multiclass VPatBinaryFPVL_R_VF_RM( !if(isSEWAware, - instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK", + instruction_name# + !if(!eq(fvti.Scalar, bf16), "_ALT", "")# + "_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK", instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK")) fvti.RegClass:$passthru, fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2, @@ -1832,16 +1839,17 @@ multiclass VPatWidenFPMulAccVL_VV_VF_RM.Predicates, - GetVTypePredicates.Predicates, + let Predicates = !listconcat(GetVTypePredicates.Predicates, !if(!eq(vti.Scalar, bf16), [HasStdExtZvfbfwma], - [])) in { + GetVTypePredicates.Predicates)) in { def : Pat<(vop (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2), (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm), VLOpFrag), - (!cast(instruction_name#"_VV_"#suffix#"_MASK") + (!cast(instruction_name# + !if(!eq(vti.Scalar, 
bf16), "BF16", "")# + "_VV_"#suffix#"_MASK") wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in @@ -1852,7 +1860,9 @@ multiclass VPatWidenFPMulAccVL_VV_VF_RM(instruction_name#"_V"#vti.ScalarSuffix#"_"#suffix#"_MASK") + (!cast(instruction_name# + !if(!eq(vti.Scalar, bf16), "BF16", "")# + "_V"#vti.ScalarSuffix#"_"#suffix#"_MASK") wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in @@ -2296,9 +2306,12 @@ foreach vtiTowti = AllWidenableIntVectors in { // 13. Vector Floating-Point Instructions // 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions -defm : VPatBinaryFPVL_VV_VF_RM; -defm : VPatBinaryFPVL_VV_VF_RM; -defm : VPatBinaryFPVL_R_VF_RM; +defm : VPatBinaryFPVL_VV_VF_RM; +defm : VPatBinaryFPVL_VV_VF_RM; +defm : VPatBinaryFPVL_R_VF_RM; // 13.3. Vector Widening Floating-Point Add/Subtract Instructions defm : VPatBinaryFPWVL_VV_VF_WV_WF_RM; // 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions -defm : VPatBinaryFPVL_VV_VF_RM; +defm : VPatBinaryFPVL_VV_VF_RM; defm : VPatBinaryFPVL_VV_VF_RM; defm : VPatBinaryFPVL_R_VF_RM; @@ -2321,7 +2335,8 @@ defm : VPatFPMulAddVL_VV_VF_RM; defm : VPatFPMulAddVL_VV_VF_RM; // 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions -defm : VPatWidenFPMulAccVL_VV_VF_RM; +defm : VPatWidenFPMulAccVL_VV_VF_RM; defm : VPatWidenFPMulAccVL_VV_VF_RM; defm : VPatWidenFPMulAccVL_VV_VF_RM; defm : VPatWidenFPMulAccVL_VV_VF_RM; @@ -2423,6 +2438,66 @@ foreach vti = AllFloatVectors in { } } +foreach vti = AllBF16Vectors in { + let Predicates = GetVTypePredicates.Predicates in { + // 13.12. Vector Floating-Point Sign-Injection Instructions + def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast("PseudoVFSGNJX"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX #"_E"#vti.SEW#"_MASK") + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TA_MA)>; + // Handle fneg with VFSGNJN using the same input for both operands. 
+ def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW #"_MASK") + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TA_MA)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (vti.Vector vti.RegClass:$rs2), + vti.RegClass:$passthru, + (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast("PseudoVFSGNJ"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") + vti.RegClass:$passthru, vti.RegClass:$rs1, + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TAIL_AGNOSTIC)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (riscv_fneg_vl vti.RegClass:$rs2, + (vti.Mask true_mask), + VLOpFrag), + srcvalue, + (vti.Mask true_mask), + VLOpFrag), + (!cast("PseudoVFSGNJN"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TA_MA)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (SplatFPOp vti.ScalarRegClass:$rs2), + vti.RegClass:$passthru, + (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast("PseudoVFSGNJ"# + !if(!eq(vti.Scalar, bf16), "_ALT", "")# + "_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") + vti.RegClass:$passthru, vti.RegClass:$rs1, + vti.ScalarRegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TAIL_AGNOSTIC)>; + } +} + // Floating-point vselects: // 11.15. Vector Integer Merge Instructions // 13.15. Vector Floating-Point Merge Instruction @@ -2476,7 +2551,7 @@ foreach fvti = AllFloatVectors in { } } -foreach fvti = AllFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { // 13.16. Vector Floating-Point Move Instruction @@ -2492,11 +2567,13 @@ foreach fvti = AllFloatVectors in { } } -foreach fvti = AllFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), - (!cast("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" # + (!cast("PseudoVFMV_V" # + !if(!eq(fvti.Scalar, bf16), "_ALT_", "_") # + fvti.ScalarSuffix # "_" # fvti.LMul.MX) $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), GPR:$vl, fvti.Log2SEW, TU_MU)>; @@ -2526,20 +2603,37 @@ defm : VPatWConvertFP2IVL_V; defm : VPatWConvertI2FPVL_V; -foreach fvtiToFWti = AllWidenableFloatVectors in { +foreach fvtiToFWti = AllWidenableFloatAndBF16Vectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; - // Define vfwcvt.f.f.v for f16 when Zvfhmin is enable. - let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, - GetVTypeMinimalPredicates.Predicates) in + // Define vfwcvt.f.f.v for f16 when Zvfhmin is enabled. + // Define vfwcvtbf16.f.f.v for bf16 when Zvfbfmin is enabled. 
+ let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, + !if(!eq(fvti.Scalar, bf16), + [HasStdExtZvfbfmin], + GetVTypeMinimalPredicates.Predicates)) in { def : Pat<(fwti.Vector (any_riscv_fpextend_vl (fvti.Vector fvti.RegClass:$rs1), (fvti.Mask VMV0:$vm), VLOpFrag)), - (!cast("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (!cast("PseudoVFWCVT"# + !if(!eq(fvti.Scalar, bf16), "BF16", "")# + "_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>; + + // Define vfwcvt.f.f.v for bf16 when Zvfbfa is enabled. + if !eq(fvti.Scalar, bf16) then + let Predicates = [HasVInstructionsBF16] in + def : Pat<(fwti.Vector (any_riscv_fpextend_vl + (fvti.Vector fvti.RegClass:$rs1), + (fvti.Mask VMV0:$vm), + VLOpFrag)), + (!cast("PseudoVFWCVT_F_F_ALT_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, + (fvti.Mask VMV0:$vm), + GPR:$vl, fvti.Log2SEW, TA_MA)>; } // 13.19 Narrowing Floating-Point/Integer Type-Convert Instructions @@ -2555,16 +2649,21 @@ defm : VPatNConvertI2FPVL_W_RM; defm : VPatNConvertI2FP_RM_VL_W; defm : VPatNConvertI2FP_RM_VL_W; -foreach fvtiToFWti = AllWidenableFloatVectors in { +foreach fvtiToFWti = AllWidenableFloatAndBF16Vectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; - // Define vfncvt.f.f.w for f16 when Zvfhmin is enable. - let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, - GetVTypeMinimalPredicates.Predicates) in { + // Define vfncvt.f.f.w for f16 when Zvfhmin is enabled. + // Define vfncvtbf16.f.f.w for bf16 when Zvfbfmin is enabled. + let Predicates = !listconcat(GetVTypeMinimalPredicates.Predicates, + !if(!eq(fvti.Scalar, bf16), + [HasStdExtZvfbfmin], + GetVTypeMinimalPredicates.Predicates)) in def : Pat<(fvti.Vector (any_riscv_fpround_vl (fwti.Vector fwti.RegClass:$rs1), (fwti.Mask VMV0:$vm), VLOpFrag)), - (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (!cast("PseudoVFNCVT"# + !if(!eq(fvti.Scalar, bf16), "BF16", "")# + "_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, (fwti.Mask VMV0:$vm), // Value to indicate no rounding mode change in @@ -2581,6 +2680,20 @@ foreach fvtiToFWti = AllWidenableFloatVectors in { (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, (fwti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>; } + + // Define vfncvt.f.f.w for bf16 when Zvfbfa is enabled. + if !eq(fvti.Scalar, bf16) then + let Predicates = [HasVInstructionsBF16] in + def : Pat<(fvti.Vector (any_riscv_fpround_vl + (fwti.Vector fwti.RegClass:$rs1), + (fwti.Mask VMV0:$vm), VLOpFrag)), + (!cast("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, + (fwti.Mask VMV0:$vm), + // Value to indicate no rounding mode change in + // RISCVInsertReadWriteCSR + FRM_DYN, + GPR:$vl, fvti.Log2SEW, TA_MA)>; } // 14. Vector Reduction Operations @@ -2751,7 +2864,7 @@ foreach vti = AllIntegerVectors in { } // 16.2. 
Floating-Point Scalar Move Instructions -foreach vti = NoGroupFloatVectors in { +foreach vti = !listconcat(NoGroupFloatVectors, NoGroupBF16Vectors) in { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), (vti.Scalar (fpimm0)), @@ -2764,7 +2877,8 @@ foreach vti = NoGroupFloatVectors in { def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), vti.ScalarRegClass:$rs1, VLOpFrag)), - (!cast("PseudoVFMV_S_"#vti.ScalarSuffix) + (!cast("PseudoVFMV_S_"#vti.ScalarSuffix# + !if(!eq(vti.Scalar, bf16), "_ALT", "")) vti.RegClass:$passthru, (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index e24e4a33288f7..866e831fdcd94 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -406,47 +406,11 @@ let Predicates = [HasStdExtZvfbfmin] in { "PseudoVFWCVTBF16_F_F", isSEWAware=1>; defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w", "PseudoVFNCVTBF16_F_F", isSEWAware=1>; - - foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { - defvar fvti = fvtiToFWti.Vti; - defvar fwti = fvtiToFWti.Wti; - def : Pat<(fwti.Vector (any_riscv_fpextend_vl - (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask VMV0:$vm), - VLOpFrag)), - (!cast("PseudoVFWCVTBF16_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") - (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW, TA_MA)>; - - def : Pat<(fvti.Vector (any_riscv_fpround_vl - (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask VMV0:$vm), VLOpFrag)), - (!cast("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") - (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask VMV0:$vm), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, fvti.Log2SEW, TA_MA)>; - def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), - (!cast("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW) - (fvti.Vector (IMPLICIT_DEF)), - fwti.RegClass:$rs1, - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - fvti.AVL, fvti.Log2SEW, TA_MA)>; - } } let Predicates = [HasStdExtZvfbfwma] in { defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmaccbf16", "PseudoVFWMACCBF16", AllWidenableBF16ToFloatVectors, isSEWAware=1>; - defm : VPatWidenFPMulAccVL_VV_VF_RM; - defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACCBF16", - AllWidenableBF16ToFloatVectors>; } multiclass VPatConversionVI_VF_BF16 { @@ -614,191 +578,4 @@ defm : VPatConversionVF_WF_BF16<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_ isSEWAware=1>; defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP_ALT", AllBF16Vectors>; defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN_ALT", AllBF16Vectors>; - -foreach fvti = AllBF16Vectors in { - defvar ivti = GetIntVTypeInfo.Vti; - let Predicates = GetVTypePredicates.Predicates in { - // 13.16. Vector Floating-Point Move Instruction - // If we're splatting fpimm0, use vmv.v.x vd, x0. 
- def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - fvti.Vector:$passthru, (fvti.Scalar (fpimm0)), VLOpFrag)), - (!cast("PseudoVMV_V_I_"#fvti.LMul.MX) - $passthru, 0, GPR:$vl, fvti.Log2SEW, TU_MU)>; - def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - fvti.Vector:$passthru, (fvti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), VLOpFrag)), - (!cast("PseudoVMV_V_X_"#fvti.LMul.MX) - $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>; - } - - let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), - (!cast("PseudoVFMV_V_ALT_" # fvti.ScalarSuffix # "_" # - fvti.LMul.MX) - $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), - GPR:$vl, fvti.Log2SEW, TU_MU)>; - } -} - -foreach vti = NoGroupBF16Vectors in { - let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), - (vti.Scalar (fpimm0)), - VLOpFrag)), - (PseudoVMV_S_X $passthru, (XLenVT X0), GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), - (vti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), - VLOpFrag)), - (PseudoVMV_S_X $passthru, GPR:$imm, GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), - vti.ScalarRegClass:$rs1, - VLOpFrag)), - (!cast("PseudoVFMV_S_"#vti.ScalarSuffix#"_ALT") - vti.RegClass:$passthru, - (vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>; - } - - defvar vfmv_f_s_inst = !cast(!strconcat("PseudoVFMV_", - vti.ScalarSuffix, - "_S_ALT")); - // Only pattern-match extract-element operations where the index is 0. Any - // other index will have been custom-lowered to slide the vector correctly - // into place. - let Predicates = GetVTypePredicates.Predicates in - def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)), - (vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>; -} - -let Predicates = [HasStdExtZvfbfa] in { - foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { - defvar fvti = fvtiToFWti.Vti; - defvar fwti = fvtiToFWti.Wti; - def : Pat<(fwti.Vector (any_riscv_fpextend_vl - (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask VMV0:$vm), - VLOpFrag)), - (!cast("PseudoVFWCVT_F_F_ALT_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") - (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW, TA_MA)>; - - def : Pat<(fvti.Vector (any_riscv_fpround_vl - (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask VMV0:$vm), VLOpFrag)), - (!cast("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") - (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask VMV0:$vm), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, fvti.Log2SEW, TA_MA)>; - def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), - (!cast("PseudoVFNCVT_F_F_ALT_W_"#fvti.LMul.MX#"_E"#fvti.SEW) - (fvti.Vector (IMPLICIT_DEF)), - fwti.RegClass:$rs1, - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - fvti.AVL, fvti.Log2SEW, TA_MA)>; - } - - foreach vti = AllBF16Vectors in { - // 13.12. Vector Floating-Point Sign-Injection Instructions - def : Pat<(fabs (vti.Vector vti.RegClass:$rs)), - (!cast("PseudoVFSGNJX_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; - // Handle fneg with VFSGNJN using the same input for both operands. 
- def : Pat<(fneg (vti.Vector vti.RegClass:$rs)), - (!cast("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; - - def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector vti.RegClass:$rs2))), - (!cast("PseudoVFSGNJ_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; - def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs2)))), - (!cast("PseudoVFSGNJ_ALT_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; - - def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector (fneg vti.RegClass:$rs2)))), - (!cast("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; - def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), - (vti.Vector (fneg (SplatFPOp vti.ScalarRegClass:$rs2))))), - (!cast("PseudoVFSGNJN_ALT_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; - - // 13.12. Vector Floating-Point Sign-Injection Instructions - def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast("PseudoVFSGNJX_ALT_VV_"# vti.LMul.MX #"_E"#vti.SEW#"_MASK") - (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, - vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, - TA_MA)>; - // Handle fneg with VFSGNJN using the same input for both operands. 
- def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW #"_MASK") - (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, - vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, - TA_MA)>; - - def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), - (vti.Vector vti.RegClass:$rs2), - vti.RegClass:$passthru, - (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast("PseudoVFSGNJ_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") - vti.RegClass:$passthru, vti.RegClass:$rs1, - vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, - TAIL_AGNOSTIC)>; - - def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), - (riscv_fneg_vl vti.RegClass:$rs2, - (vti.Mask true_mask), - VLOpFrag), - srcvalue, - (vti.Mask true_mask), - VLOpFrag), - (!cast("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) - (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, - vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TA_MA)>; - - def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), - (SplatFPOp vti.ScalarRegClass:$rs2), - vti.RegClass:$passthru, - (vti.Mask VMV0:$vm), - VLOpFrag), - (!cast("PseudoVFSGNJ_ALT_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") - vti.RegClass:$passthru, vti.RegClass:$rs1, - vti.ScalarRegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, - TAIL_AGNOSTIC)>; - } - } - - defm : VPatBinaryFPSDNode_VV_VF_RM; - defm : VPatBinaryFPSDNode_VV_VF_RM; - defm : VPatBinaryFPSDNode_VV_VF_RM; - defm : VPatBinaryFPSDNode_R_VF_RM; - - defm : VPatBinaryFPVL_VV_VF_RM; - defm : VPatBinaryFPVL_VV_VF_RM; - defm : VPatBinaryFPVL_VV_VF_RM; - defm : VPatBinaryFPVL_R_VF_RM; } // Predicates = [HasStdExtZvfbfa] From 794551dbffc0910c5b56660f663daacffa1bcdaa Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Thu, 11 Dec 2025 15:58:27 +0800 Subject: [PATCH 28/49] [RISCV][llvm] Support PSRA, PSRAI, PSRL, PSRLI codegen for P extension (#171460) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 29 +- llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 53 +++- llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll | 294 ++++++++++++++++++++ llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll | 152 ++++++++++ 4 files changed, 507 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a9819c65c2170..2c0a02ae396c7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -526,7 +526,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VTs, Legal); setOperationAction({ISD::ABDS, ISD::ABDU}, VTs, Legal); setOperationAction(ISD::SPLAT_VECTOR, VTs, Legal); - setOperationAction(ISD::SHL, VTs, Custom); + setOperationAction({ISD::SHL, ISD::SRL, ISD::SRA}, VTs, Custom); setOperationAction(ISD::BITCAST, VTs, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VTs, Custom); } @@ -8662,22 +8662,21 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VSELECT: return lowerToScalableOp(Op, DAG); case ISD::SHL: - if (Subtarget.enablePExtCodeGen() && - Op.getSimpleValueType().isFixedLengthVector()) { - // We have patterns for scalar/immediate shift amount, so no lowering - // needed. - if (Op.getOperand(1)->getOpcode() == ISD::SPLAT_VECTOR) - return Op; - - // There's no vector-vector version of shift instruction in P extension so - // we need to unroll to scalar computation and pack them back. 
- return DAG.UnrollVectorOp(Op.getNode()); - } - [[fallthrough]]; - case ISD::SRA: case ISD::SRL: - if (Op.getSimpleValueType().isFixedLengthVector()) + case ISD::SRA: + if (Op.getSimpleValueType().isFixedLengthVector()) { + if (Subtarget.enablePExtCodeGen()) { + // We have patterns for scalar/immediate shift amount, so no lowering + // needed. + if (Op.getOperand(1)->getOpcode() == ISD::SPLAT_VECTOR) + return Op; + + // There's no vector-vector version of shift instruction in P extension + // so we need to unroll to scalar computation and pack them back. + return DAG.UnrollVectorOp(Op.getNode()); + } return lowerToScalableOp(Op, DAG); + } // This can be called for an i32 shift amount that needs to be promoted. assert(Op.getOperand(1).getValueType() == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 7250a48bfe895..da4a3a6022337 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -1513,26 +1513,55 @@ let Predicates = [HasStdExtP] in { def: Pat<(XLenVecI16VT (abds GPR:$rs1, GPR:$rs2)), (PABD_H GPR:$rs1, GPR:$rs2)>; def: Pat<(XLenVecI16VT (abdu GPR:$rs1, GPR:$rs2)), (PABDU_H GPR:$rs1, GPR:$rs2)>; - // 8-bit logical shift left patterns + // 8-bit logical shift left/right patterns def: Pat<(XLenVecI8VT (shl GPR:$rs1, (XLenVecI8VT (splat_vector uimm3:$shamt)))), (PSLLI_B GPR:$rs1, uimm3:$shamt)>; + def: Pat<(XLenVecI8VT (srl GPR:$rs1, (XLenVecI8VT (splat_vector uimm3:$shamt)))), + (PSRLI_B GPR:$rs1, uimm3:$shamt)>; - // 16-bit logical shift left patterns + // 16-bit logical shift left/right patterns def: Pat<(XLenVecI16VT (shl GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), (PSLLI_H GPR:$rs1, uimm4:$shamt)>; + def: Pat<(XLenVecI16VT (srl GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), + (PSRLI_H GPR:$rs1, uimm4:$shamt)>; + + // 8-bit arithmetic shift right patterns + def: Pat<(XLenVecI8VT (sra GPR:$rs1, (XLenVecI8VT (splat_vector uimm3:$shamt)))), + (PSRAI_B GPR:$rs1, uimm3:$shamt)>; + + // 16-bit arithmetic shift right patterns + def: Pat<(XLenVecI16VT (sra GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), + (PSRAI_H GPR:$rs1, uimm4:$shamt)>; // 16-bit signed saturation shift left patterns def: Pat<(XLenVecI16VT (sshlsat GPR:$rs1, (XLenVecI16VT (splat_vector uimm4:$shamt)))), (PSSLAI_H GPR:$rs1, uimm4:$shamt)>; - // 8-bit logical shift left + // 8-bit logical shift left/right def: Pat<(XLenVecI8VT (shl GPR:$rs1, (XLenVecI8VT (splat_vector (XLenVT GPR:$rs2))))), (PSLL_BS GPR:$rs1, GPR:$rs2)>; - // 16-bit logical shift left + def: Pat<(XLenVecI8VT (srl GPR:$rs1, + (XLenVecI8VT (splat_vector (XLenVT GPR:$rs2))))), + (PSRL_BS GPR:$rs1, GPR:$rs2)>; + + // 8-bit arithmetic shift left/right + def: Pat<(XLenVecI8VT (sra GPR:$rs1, + (XLenVecI8VT (splat_vector (XLenVT GPR:$rs2))))), + (PSRA_BS GPR:$rs1, GPR:$rs2)>; + + // 16-bit logical shift left/right def: Pat<(XLenVecI16VT (shl GPR:$rs1, (XLenVecI16VT (splat_vector (XLenVT GPR:$rs2))))), (PSLL_HS GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (srl GPR:$rs1, + (XLenVecI16VT (splat_vector (XLenVT GPR:$rs2))))), + (PSRL_HS GPR:$rs1, GPR:$rs2)>; + + // 16-bit arithmetic shift left/right + def: Pat<(XLenVecI16VT (sra GPR:$rs1, + (XLenVecI16VT (splat_vector (XLenVT GPR:$rs2))))), + (PSRA_HS GPR:$rs1, GPR:$rs2)>; // 8-bit PLI SD node pattern def: Pat<(XLenVecI8VT (splat_vector simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; @@ -1580,16 +1609,28 @@ let 
Predicates = [HasStdExtP, IsRV64] in { def: Pat<(v2i32 (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_W GPR:$rs1, GPR:$rs2)>; def: Pat<(v2i32 (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_W GPR:$rs1, GPR:$rs2)>; - // 32-bit logical shift left + // 32-bit logical shift left/right def: Pat<(v2i32 (shl GPR:$rs1, (v2i32 (splat_vector (XLenVT GPR:$rs2))))), (PSLL_WS GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (srl GPR:$rs1, (v2i32 (splat_vector (XLenVT GPR:$rs2))))), + (PSRL_WS GPR:$rs1, GPR:$rs2)>; + + // 32-bit arithmetic shift left/right + def: Pat<(v2i32 (sra GPR:$rs1, (v2i32 (splat_vector (XLenVT GPR:$rs2))))), + (PSRA_WS GPR:$rs1, GPR:$rs2)>; // splat pattern def: Pat<(v2i32 (splat_vector (XLenVT GPR:$rs2))), (PADD_WS (XLenVT X0), GPR:$rs2)>; - // 32-bit logical shift left patterns + // 32-bit logical shift left/right patterns def: Pat<(v2i32 (shl GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), (PSLLI_W GPR:$rs1, uimm5:$shamt)>; + def: Pat<(v2i32 (srl GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), + (PSRLI_W GPR:$rs1, uimm5:$shamt)>; + + // 32-bit arithmetic shift left/right patterns + def: Pat<(v2i32 (sra GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), + (PSRAI_W GPR:$rs1, uimm5:$shamt)>; // 32-bit signed saturation shift left patterns def: Pat<(v2i32 (sshlsat GPR:$rs1, (v2i32 (splat_vector uimm5:$shamt)))), diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll index cd59aa03597e2..1e1110f0a30b8 100644 --- a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll @@ -638,6 +638,60 @@ define void @test_psslai_h(ptr %ret_ptr, ptr %a_ptr) { ret void } +; Test logical shift right immediate +define void @test_psrli_h(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrli.h a1, a1, 2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %res = lshr <2 x i16> %a, splat(i16 2) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrli_b(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrli.b a1, a1, 2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %res = lshr <4 x i8> %a, splat(i8 2) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right immediate +define void @test_psrai_h(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrai.h a1, a1, 2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %res = ashr <2 x i16> %a, splat(i16 2) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrai_b(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrai.b a1, a1, 2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %res = ashr <4 x i8> %a, splat(i8 2) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + ; Test logical shift left(scalar shamt) define void @test_psll_hs(ptr %ret_ptr, ptr %a_ptr, i16 %shamt) { ; CHECK-LABEL: test_psll_hs: @@ -746,3 +800,243 @@ define void @test_psll_bs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { store <4 x i8> %res, ptr %ret_ptr ret void } + +; Test logical shift right(scalar shamt) +define void @test_psrl_hs(ptr %ret_ptr, ptr %a_ptr, i16 %shamt) { +; CHECK-LABEL: test_psrl_hs: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; 
CHECK-NEXT: psrl.hs a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %insert = insertelement <2 x i16> poison, i16 %shamt, i32 0 + %b = shufflevector <2 x i16> %insert, <2 x i16> poison, <2 x i32> zeroinitializer + %res = lshr <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrl_bs(ptr %ret_ptr, ptr %a_ptr, i8 %shamt) { +; CHECK-LABEL: test_psrl_bs: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psrl.bs a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %insert = insertelement <4 x i8> poison, i8 %shamt, i32 0 + %b = shufflevector <4 x i8> %insert, <4 x i8> poison, <4 x i32> zeroinitializer + %res = lshr <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right(scalar shamt) +define void @test_psra_hs(ptr %ret_ptr, ptr %a_ptr, i16 %shamt) { +; CHECK-LABEL: test_psra_hs: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psra.hs a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %insert = insertelement <2 x i16> poison, i16 %shamt, i32 0 + %b = shufflevector <2 x i16> %insert, <2 x i16> poison, <2 x i32> zeroinitializer + %res = ashr <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psra_bs(ptr %ret_ptr, ptr %a_ptr, i8 %shamt) { +; CHECK-LABEL: test_psra_bs: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: psra.bs a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %insert = insertelement <4 x i8> poison, i8 %shamt, i32 0 + %b = shufflevector <4 x i8> %insert, <4 x i8> poison, <4 x i32> zeroinitializer + %res = ashr <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test logical shift right(vector shamt) +define void @test_psrl_hs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-RV32-LABEL: test_psrl_hs_vec_shamt: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 0(a2) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: srli a3, a2, 16 +; CHECK-RV32-NEXT: srli a4, a1, 16 +; CHECK-RV32-NEXT: slli a1, a1, 16 +; CHECK-RV32-NEXT: srl a3, a4, a3 +; CHECK-RV32-NEXT: srli a1, a1, 16 +; CHECK-RV32-NEXT: srl a1, a1, a2 +; CHECK-RV32-NEXT: pack a1, a1, a3 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_psrl_hs_vec_shamt: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 0(a2) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: srli a3, a2, 16 +; CHECK-RV64-NEXT: srliw a4, a1, 16 +; CHECK-RV64-NEXT: slli a1, a1, 48 +; CHECK-RV64-NEXT: srl a3, a4, a3 +; CHECK-RV64-NEXT: srli a1, a1, 48 +; CHECK-RV64-NEXT: srl a1, a1, a2 +; CHECK-RV64-NEXT: ppaire.h a1, a1, a3 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %shamt_ptr + %res = lshr <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrl_bs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-RV32-LABEL: test_psrl_bs_vec_shamt: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 0(a2) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: srli a3, a2, 24 +; CHECK-RV32-NEXT: srli a4, a1, 24 +; CHECK-RV32-NEXT: srli a5, a2, 8 +; CHECK-RV32-NEXT: slli a6, a1, 16 +; CHECK-RV32-NEXT: srl a7, a4, a3 +; CHECK-RV32-NEXT: srli a3, a6, 24 +; CHECK-RV32-NEXT: srl a6, a3, a5 +; CHECK-RV32-NEXT: zext.b a3, a1 +; CHECK-RV32-NEXT: srli a4, a2, 16 +; 
CHECK-RV32-NEXT: slli a1, a1, 8 +; CHECK-RV32-NEXT: srl a2, a3, a2 +; CHECK-RV32-NEXT: srli a1, a1, 24 +; CHECK-RV32-NEXT: srl a3, a1, a4 +; CHECK-RV32-NEXT: ppaire.db a2, a2, a6 +; CHECK-RV32-NEXT: pack a1, a2, a3 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_psrl_bs_vec_shamt: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 0(a2) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: srli a3, a2, 24 +; CHECK-RV64-NEXT: srliw a4, a1, 24 +; CHECK-RV64-NEXT: srli a5, a2, 16 +; CHECK-RV64-NEXT: srl a3, a4, a3 +; CHECK-RV64-NEXT: slli a4, a1, 40 +; CHECK-RV64-NEXT: srli a4, a4, 56 +; CHECK-RV64-NEXT: srl a4, a4, a5 +; CHECK-RV64-NEXT: zext.b a5, a1 +; CHECK-RV64-NEXT: srl a5, a5, a2 +; CHECK-RV64-NEXT: srli a2, a2, 8 +; CHECK-RV64-NEXT: slli a1, a1, 48 +; CHECK-RV64-NEXT: srli a1, a1, 56 +; CHECK-RV64-NEXT: srl a1, a1, a2 +; CHECK-RV64-NEXT: ppaire.b a2, a4, a3 +; CHECK-RV64-NEXT: ppaire.b a1, a5, a1 +; CHECK-RV64-NEXT: ppaire.h a1, a1, a2 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %shamt_ptr + %res = lshr <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right(vector shamt) +define void @test_psra_hs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-RV32-LABEL: test_psra_hs_vec_shamt: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 0(a2) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: srli a3, a2, 16 +; CHECK-RV32-NEXT: srai a4, a1, 16 +; CHECK-RV32-NEXT: slli a1, a1, 16 +; CHECK-RV32-NEXT: sra a3, a4, a3 +; CHECK-RV32-NEXT: srai a1, a1, 16 +; CHECK-RV32-NEXT: sra a1, a1, a2 +; CHECK-RV32-NEXT: pack a1, a1, a3 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_psra_hs_vec_shamt: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 0(a2) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: srli a3, a2, 16 +; CHECK-RV64-NEXT: sraiw a4, a1, 16 +; CHECK-RV64-NEXT: slli a1, a1, 48 +; CHECK-RV64-NEXT: sra a3, a4, a3 +; CHECK-RV64-NEXT: srai a1, a1, 48 +; CHECK-RV64-NEXT: sra a1, a1, a2 +; CHECK-RV64-NEXT: ppaire.h a1, a1, a3 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %shamt_ptr + %res = ashr <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psra_bs_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-RV32-LABEL: test_psra_bs_vec_shamt: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 0(a2) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: srli a3, a2, 24 +; CHECK-RV32-NEXT: srai a4, a1, 24 +; CHECK-RV32-NEXT: srli a5, a2, 8 +; CHECK-RV32-NEXT: slli a6, a1, 16 +; CHECK-RV32-NEXT: sra a7, a4, a3 +; CHECK-RV32-NEXT: srai a3, a6, 24 +; CHECK-RV32-NEXT: sra a6, a3, a5 +; CHECK-RV32-NEXT: srli a3, a2, 16 +; CHECK-RV32-NEXT: slli a4, a1, 8 +; CHECK-RV32-NEXT: slli a1, a1, 24 +; CHECK-RV32-NEXT: srai a4, a4, 24 +; CHECK-RV32-NEXT: sra a3, a4, a3 +; CHECK-RV32-NEXT: srai a1, a1, 24 +; CHECK-RV32-NEXT: sra a2, a1, a2 +; CHECK-RV32-NEXT: ppaire.db a2, a2, a6 +; CHECK-RV32-NEXT: pack a1, a2, a3 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_psra_bs_vec_shamt: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 0(a2) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: srli a3, a2, 24 +; CHECK-RV64-NEXT: sraiw a4, a1, 24 +; CHECK-RV64-NEXT: srli a5, a2, 16 +; CHECK-RV64-NEXT: slli a6, a1, 40 +; CHECK-RV64-NEXT: sra a3, a4, a3 +; 
CHECK-RV64-NEXT: srli a4, a2, 8 +; CHECK-RV64-NEXT: srai a6, a6, 56 +; CHECK-RV64-NEXT: sra a5, a6, a5 +; CHECK-RV64-NEXT: slli a6, a1, 48 +; CHECK-RV64-NEXT: srai a6, a6, 56 +; CHECK-RV64-NEXT: sra a4, a6, a4 +; CHECK-RV64-NEXT: slli a1, a1, 56 +; CHECK-RV64-NEXT: srai a1, a1, 56 +; CHECK-RV64-NEXT: sra a1, a1, a2 +; CHECK-RV64-NEXT: ppaire.b a2, a5, a3 +; CHECK-RV64-NEXT: ppaire.b a1, a1, a4 +; CHECK-RV64-NEXT: ppaire.h a1, a1, a2 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %shamt_ptr + %res = ashr <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll index c7fb891cdd996..3e0f431d67f41 100644 --- a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll @@ -791,6 +791,86 @@ define void @test_pslli_w(ptr %ret_ptr, ptr %a_ptr) { store <2 x i32> %res, ptr %ret_ptr ret void } +; Test logical shift right immediate +define void @test_psrli_w(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_w: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrli.w a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %res = lshr <2 x i32> %a, splat(i32 2) + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +define void @test_psrli_h(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrli.h a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %res = lshr <4 x i16> %a, splat(i16 2) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrli_b(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrli.b a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %res = lshr <8 x i8> %a, splat(i8 2) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right immediate +define void @test_psrai_w(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_w: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrai.w a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %res = ashr <2 x i32> %a, splat(i32 2) + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +define void @test_psrai_h(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrai.h a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %res = ashr <4 x i16> %a, splat(i16 2) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psrai_b(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_psrai_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psrai.b a1, a1, 2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %res = ashr <8 x i8> %a, splat(i8 2) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + ; Test arithmetic saturation shift left immediate for v2i32 define void @test_psslai_w(ptr %ret_ptr, ptr %a_ptr) { @@ -841,3 +921,75 @@ define void @test_psll_ws_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { store <2 x i32> %res, ptr %ret_ptr ret void } + +; Test logical shift right(scalar shamt) +define void @test_psrl_ws(ptr %ret_ptr, ptr %a_ptr, i32 %shamt) { +; CHECK-LABEL: test_psrl_ws: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) 
+; CHECK-NEXT: psrl.ws a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %insert = insertelement <2 x i32> poison, i32 %shamt, i32 0 + %b = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer + %res = lshr <2 x i32> %a, %b + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right(scalar shamt) +define void @test_psra_ws(ptr %ret_ptr, ptr %a_ptr, i32 %shamt) { +; CHECK-LABEL: test_psra_ws: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: psra.ws a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %insert = insertelement <2 x i32> poison, i32 %shamt, i32 0 + %b = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer + %res = ashr <2 x i32> %a, %b + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +; Test logical shift right(vector shamt) +define void @test_psrl_ws_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-LABEL: test_psrl_ws_vec_shamt: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: srlw a3, a1, a2 +; CHECK-NEXT: srli a2, a2, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: srlw a1, a1, a2 +; CHECK-NEXT: pack a1, a3, a1 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %b = load <2 x i32>, ptr %shamt_ptr + %res = lshr <2 x i32> %a, %b + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +; Test arithmetic shift right(vector shamt) +define void @test_psra_ws_vec_shamt(ptr %ret_ptr, ptr %a_ptr, ptr %shamt_ptr) { +; CHECK-LABEL: test_psra_ws_vec_shamt: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: sraw a3, a1, a2 +; CHECK-NEXT: srli a2, a2, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: sraw a1, a1, a2 +; CHECK-NEXT: pack a1, a3, a1 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %b = load <2 x i32>, ptr %shamt_ptr + %res = ashr <2 x i32> %a, %b + store <2 x i32> %res, ptr %ret_ptr + ret void +} From f57abf519b8e3a58182a442a78a09de91105680c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Thu, 11 Dec 2025 09:17:27 +0100 Subject: [PATCH 29/49] [SPIRV] Promote scalar arguments to vector for `OpExtInst` in `generateExtInst` instead of `SPIRVRegularizer` (#170155) This patch consists of two parts: * A first part that removes the scalar-to-vector promotion for built-ins from the `SPIRVRegularizer`; * and a second part that implements the scalar-to-vector promotion for built-ins in `generateExtInst`. The implementation in `SPIRVRegularizer` had several issues: * It rolled its own built-in pattern matching that was extremely permissive. * The compiler would crash if the built-in had a definition. * The compiler would crash if the built-in had no arguments. * The compiler would crash if there were more than two function definitions in the module. * It would be better implemented as a module pass, iterating over the users of the function instead of scanning the whole module for callers. This patch performs the scalar-to-vector promotion just before the `OpExtInst` is generated, without relying on an IR transformation. One change in the generated code from the previous implementation is that this version uses a single `OpCompositeConstruct` operation to convert the scalar into a vector.
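As a concrete illustration (a minimal module in the style of the new tests; the function name and constant operands are arbitrary), a mixed scalar/vector builtin call like the one below, compiled with `llc -O0 -mtriple=spirv32-unknown-unknown`, is now lowered to a single `OpCompositeConstruct` that splats the scalar operand, and the resulting vector feeds the `s_min` `OpExtInst` directly:

  define spir_kernel void @scalar_splat_example() {
  entry:
    %call = tail call spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32> <i32 1, i32 10>, i32 5)
    ret void
  }

  declare spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32>, i32)

Building the splat with one composite-construct also keeps the emitted module slightly smaller than the two-step sequence described next.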
The old implementation inserted an element at the 0 position in an `undef` vector (using `OpCompositeInsert`); then copied that element for every vector element using `OpVectorShuffle`. This patch also adds a test (`OpExtInst_vector_promotion_bug.ll`) that highlights an issue in the builtin pattern matching that we're using: our pattern matching doesn't consider the number of arguments, only the demangled name, first and last arguments (`min(int,int,int)` matches the same builtin as `min(int, int)`). --- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 62 +++++- llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp | 102 +--------- .../transcoding/OpExtInst_vector_promotion.ll | 179 ++++++++++++++++++ .../OpExtInst_vector_promotion_bug.ll | 21 ++ llvm/test/CodeGen/SPIRV/transcoding/OpMin.ll | 16 -- 5 files changed, 263 insertions(+), 117 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion.ll create mode 100644 llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion_bug.ll delete mode 100644 llvm/test/CodeGen/SPIRV/transcoding/OpMin.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index e5ba0201c0cc1..b111909fc25cc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1154,10 +1154,63 @@ static unsigned getNumSizeComponents(SPIRVType *imgType) { return arrayed ? numComps + 1 : numComps; } +static bool builtinMayNeedPromotionToVec(uint32_t BuiltinNumber) { + switch (BuiltinNumber) { + case SPIRV::OpenCLExtInst::s_min: + case SPIRV::OpenCLExtInst::u_min: + case SPIRV::OpenCLExtInst::s_max: + case SPIRV::OpenCLExtInst::u_max: + case SPIRV::OpenCLExtInst::fmax: + case SPIRV::OpenCLExtInst::fmin: + case SPIRV::OpenCLExtInst::fmax_common: + case SPIRV::OpenCLExtInst::fmin_common: + case SPIRV::OpenCLExtInst::s_clamp: + case SPIRV::OpenCLExtInst::fclamp: + case SPIRV::OpenCLExtInst::u_clamp: + case SPIRV::OpenCLExtInst::mix: + case SPIRV::OpenCLExtInst::step: + case SPIRV::OpenCLExtInst::smoothstep: + return true; + default: + break; + } + return false; +} + //===----------------------------------------------------------------------===// // Implementation functions for each builtin group //===----------------------------------------------------------------------===// +static SmallVector +getBuiltinCallArguments(const SPIRV::IncomingCall *Call, uint32_t BuiltinNumber, + MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { + + Register ReturnTypeId = GR->getSPIRVTypeID(Call->ReturnType); + unsigned ResultElementCount = + GR->getScalarOrVectorComponentCount(ReturnTypeId); + bool MayNeedPromotionToVec = + builtinMayNeedPromotionToVec(BuiltinNumber) && ResultElementCount > 1; + + if (!MayNeedPromotionToVec) + return {Call->Arguments.begin(), Call->Arguments.end()}; + + SmallVector Arguments; + for (Register Argument : Call->Arguments) { + Register VecArg = Argument; + SPIRVType *ArgumentType = GR->getSPIRVTypeForVReg(Argument); + if (ArgumentType != Call->ReturnType) { + VecArg = createVirtualRegister(Call->ReturnType, GR, MIRBuilder); + auto VecSplat = MIRBuilder.buildInstr(SPIRV::OpCompositeConstruct) + .addDef(VecArg) + .addUse(ReturnTypeId); + for (unsigned I = 0; I != ResultElementCount; ++I) + VecSplat.addUse(Argument); + } + Arguments.push_back(VecArg); + } + return Arguments; +} + static bool generateExtInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR, const CallBase &CB) { @@ -1179,16 +1232,21 @@ static bool 
generateExtInst(const SPIRV::IncomingCall *Call, : SPIRV::OpenCLExtInst::fmax; } + Register ReturnTypeId = GR->getSPIRVTypeID(Call->ReturnType); + SmallVector Arguments = + getBuiltinCallArguments(Call, Number, MIRBuilder, GR); + // Build extended instruction. auto MIB = MIRBuilder.buildInstr(SPIRV::OpExtInst) .addDef(Call->ReturnRegister) - .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(ReturnTypeId) .addImm(static_cast(SPIRV::InstructionSet::OpenCL_std)) .addImm(Number); - for (auto Argument : Call->Arguments) + for (Register Argument : Arguments) MIB.addUse(Argument); + MIB.getInstr()->copyIRFlags(CB); if (OrigNumber == SPIRV::OpenCLExtInst::fmin_common || OrigNumber == SPIRV::OpenCLExtInst::fmax_common) { diff --git a/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp b/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp index 1b95f09974c61..653c9ad53e888 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp @@ -12,11 +12,10 @@ //===----------------------------------------------------------------------===// #include "SPIRV.h" -#include "llvm/Demangle/Demangle.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" -#include "llvm/Transforms/Utils/Cloning.h" #include @@ -25,9 +24,7 @@ using namespace llvm; namespace { -struct SPIRVRegularizer : public FunctionPass, InstVisitor { - DenseMap Old2NewFuncs; - +struct SPIRVRegularizer : public FunctionPass { public: static char ID; SPIRVRegularizer() : FunctionPass(ID) {} @@ -37,11 +34,8 @@ struct SPIRVRegularizer : public FunctionPass, InstVisitor { void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); } - void visitCallInst(CallInst &CI); private: - void visitCallScalToVec(CallInst *CI, StringRef MangledName, - StringRef DemangledName); void runLowerConstExpr(Function &F); }; } // namespace @@ -157,98 +151,8 @@ void SPIRVRegularizer::runLowerConstExpr(Function &F) { } } -// It fixes calls to OCL builtins that accept vector arguments and one of them -// is actually a scalar splat. -void SPIRVRegularizer::visitCallInst(CallInst &CI) { - auto F = CI.getCalledFunction(); - if (!F) - return; - - auto MangledName = F->getName(); - char *NameStr = itaniumDemangle(F->getName().data()); - if (!NameStr) - return; - StringRef DemangledName(NameStr); - - // TODO: add support for other builtins. - if (DemangledName.starts_with("fmin") || DemangledName.starts_with("fmax") || - DemangledName.starts_with("min") || DemangledName.starts_with("max")) - visitCallScalToVec(&CI, MangledName, DemangledName); - free(NameStr); -} - -void SPIRVRegularizer::visitCallScalToVec(CallInst *CI, StringRef MangledName, - StringRef DemangledName) { - // Check if all arguments have the same type - it's simple case. 
- auto Uniform = true; - Type *Arg0Ty = CI->getOperand(0)->getType(); - auto IsArg0Vector = isa(Arg0Ty); - for (unsigned I = 1, E = CI->arg_size(); Uniform && (I != E); ++I) - Uniform = isa(CI->getOperand(I)->getType()) == IsArg0Vector; - if (Uniform) - return; - - auto *OldF = CI->getCalledFunction(); - Function *NewF = nullptr; - auto [It, Inserted] = Old2NewFuncs.try_emplace(OldF); - if (Inserted) { - AttributeList Attrs = CI->getCalledFunction()->getAttributes(); - SmallVector ArgTypes = {OldF->getArg(0)->getType(), Arg0Ty}; - auto *NewFTy = - FunctionType::get(OldF->getReturnType(), ArgTypes, OldF->isVarArg()); - NewF = Function::Create(NewFTy, OldF->getLinkage(), OldF->getName(), - *OldF->getParent()); - ValueToValueMapTy VMap; - auto NewFArgIt = NewF->arg_begin(); - for (auto &Arg : OldF->args()) { - auto ArgName = Arg.getName(); - NewFArgIt->setName(ArgName); - VMap[&Arg] = &(*NewFArgIt++); - } - SmallVector Returns; - CloneFunctionInto(NewF, OldF, VMap, - CloneFunctionChangeType::LocalChangesOnly, Returns); - NewF->setAttributes(Attrs); - It->second = NewF; - } else { - NewF = It->second; - } - assert(NewF); - - // This produces an instruction sequence that implements a splat of - // CI->getOperand(1) to a vector Arg0Ty. However, we use InsertElementInst - // and ShuffleVectorInst to generate the same code as the SPIR-V translator. - // For instance (transcoding/OpMin.ll), this call - // call spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32> , i32 5) - // is translated to - // %8 = OpUndef %v2uint - // %14 = OpConstantComposite %v2uint %uint_1 %uint_10 - // ... - // %10 = OpCompositeInsert %v2uint %uint_5 %8 0 - // %11 = OpVectorShuffle %v2uint %10 %8 0 0 - // %call = OpExtInst %v2uint %1 s_min %14 %11 - auto ConstInt = ConstantInt::get(IntegerType::get(CI->getContext(), 32), 0); - PoisonValue *PVal = PoisonValue::get(Arg0Ty); - Instruction *Inst = InsertElementInst::Create( - PVal, CI->getOperand(1), ConstInt, "", CI->getIterator()); - ElementCount VecElemCount = cast(Arg0Ty)->getElementCount(); - Constant *ConstVec = ConstantVector::getSplat(VecElemCount, ConstInt); - Value *NewVec = - new ShuffleVectorInst(Inst, PVal, ConstVec, "", CI->getIterator()); - CI->setOperand(1, NewVec); - CI->replaceUsesOfWith(OldF, NewF); - CI->mutateFunctionType(NewF->getFunctionType()); -} - bool SPIRVRegularizer::runOnFunction(Function &F) { runLowerConstExpr(F); - visit(F); - for (auto &OldNew : Old2NewFuncs) { - Function *OldF = OldNew.first; - Function *NewF = OldNew.second; - NewF->takeName(OldF); - OldF->eraseFromParent(); - } return true; } diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion.ll new file mode 100644 index 0000000000000..b406f8b71f7e6 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion.ll @@ -0,0 +1,179 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown < %s | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown < %s -filetype=obj | spirv-val %} +; +; Some OpenCL builtins have mixed vector-scalar variants, but OpExtInt only supports +; versions where all the arguments have the same type. +; +; We generate code, but it is invalid. +; We should generate vector versions for these cases. 
+ +define spir_kernel void @S_MIN() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function S_MIN +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} s_min %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32> , i32 5) + ret void +} + +define spir_kernel void @U_MIN() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function U_MIN +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} u_min %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x i32> @_Z3minDv2_jj(<2 x i32> , i32 5) + ret void +} + +define spir_kernel void @S_MAX() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function S_MAX +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} s_max %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x i32> @_Z3maxDv2_ii(<2 x i32> , i32 5) + ret void +} + +define spir_kernel void @F_MIN() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_MIN +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fmin %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z3minDv2_ff(<2 x float> , float 5.0) + ret void +} + +define spir_kernel void @F_MAX() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_MAX +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fmax %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z3maxDv2_ff(<2 x float> , float 5.0) + ret void +} + +define spir_kernel void @F_FMIN() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_FMIN +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fmin %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z4fminDv2_ff(<2 x float> , float 5.0) + ret void +} + +define spir_kernel void @F_FMAX() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_FMAX +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fmax %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; 
-- End function +entry: + %call = tail call spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float> , float 5.0) + ret void +} + +define spir_kernel void @S_CLAMP() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function S_CLAMP +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC_0:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_0:[0-9]+]] %[[SCALAR_0]] +; CHECK-NEXT: %[[VEC_1:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_1:[0-9]+]] %[[SCALAR_1]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} s_clamp %{{[0-9]+}} %[[VEC_0]] %[[VEC_1]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x i32> @_Z5clampDv2_iii(<2 x i32> , i32 5, i32 6) + ret void +} + +define spir_kernel void @F_CLAMP() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function F_CLAMP +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC_0:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_0:[0-9]+]] %[[SCALAR_0]] +; CHECK-NEXT: %[[VEC_1:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_1:[0-9]+]] %[[SCALAR_1]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} fclamp %{{[0-9]+}} %[[VEC_0]] %[[VEC_1]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z5clampDv2_fff(<2 x float> , float 5.0, float 6.0) + ret void +} + +define spir_kernel void @MIX() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function MIX +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR:[0-9]+]] %[[SCALAR]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} mix %{{[0-9]+}} %{{[0-9]+}} %[[VEC]] +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z3mixDv2_fS_f(<2 x float> , <2 x float> , float 0.5) + ret void +} + +define spir_kernel void @SMOOTHSTEP() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function SMOOTHSTEP +; CHECK-NEXT: OpLabel +; CHECK-NEXT: %[[VEC_0:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_0:[0-9]+]] %[[SCALAR_0]] +; CHECK-NEXT: %[[VEC_1:[0-9]+]] = OpCompositeConstruct %[[VECTYPE:[0-9]+]] %[[SCALAR_1:[0-9]+]] %[[SCALAR_1]] +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %[[VECTYPE]] %{{[0-9]+}} smoothstep %[[VEC_0]] %[[VEC_1]] %{{[0-9]+}} +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + %call = tail call spir_func <2 x float> @_Z10smoothstepffDv2_f(float 1.0, float 0.5, <2 x float> ) + ret void +} + +define spir_kernel void @ill_0() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function ill_0 +; CHECK-NEXT: OpLabel +; CHECK-NEXT: OpFunctionCall %{{[0-9]+}} %{{[0-9]+}} +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + tail call spir_func void @_Z3minv() + ret void +} + +declare spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32>, i32) +declare spir_func <2 x i32> @_Z3minDv2_jj(<2 x i32>, i32) +declare spir_func <2 x i32> @_Z3maxDv2_ii(<2 x i32>, i32) +declare spir_func <2 x float> @_Z3minDv2_ff(<2 x float>, float) +declare spir_func <2 x float> @_Z3maxDv2_ff(<2 x float>, float) +declare spir_func <2 x float> @_Z4fminDv2_ff(<2 x float>, float) +declare spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float>, float) +declare spir_func <2 x i32> @_Z5clampDv2_iii(<2 x i32>, i32) 
+declare spir_func <2 x float> @_Z5clampDv2_fff(<2 x float>, float) +declare spir_func <2 x float> @_Z3mixDv2_fS_f(<2 x float>, <2 x float>, float) +declare spir_func <2 x float> @_Z10smoothstepffDv2_f(float, float, <2 x float>) +declare spir_func void @_Z3minv() diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion_bug.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion_bug.ll new file mode 100644 index 0000000000000..b81f373be33c3 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpExtInst_vector_promotion_bug.ll @@ -0,0 +1,21 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown < %s | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown < %s -filetype=obj | not spirv-val 2>&1 | FileCheck %s --check-prefix=VALIDATOR %} +; +; _Z3miniii is not a valid OpenCL intrinsic, do not treat it like one. +; +; VALIDATOR: Invalid instruction OpExtInst starting at word {{[0-9]+}}: expected no more operands after 7 words, but stated word count is 8 + +define spir_kernel void @ill_1() { +; CHECK-LABEL: OpFunction %{{[0-9]+}} None %{{[0-9]+}} ; -- Begin function ill_1 +; CHECK-NEXT: OpLabel +; This is wrong, we should generate a regular call +; CHECK-NEXT: %{{[0-9]+}} = OpExtInst %{{[0-9]+}} %{{[0-9]+}} s_min %{{[0-9]+}} %{{[0-9]+}} %{{[0-9]+}} +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd +; CHECK-NEXT: ; -- End function +entry: + tail call spir_func void @_Z3miniii(i32 1, i32 2, i32 3) + ret void +} + +declare spir_func i32 @_Z3miniii(i32, i32, i32) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpMin.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpMin.ll deleted file mode 100644 index 5cc3ea01e5191..0000000000000 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpMin.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV - -; CHECK-SPIRV: %[[#SetInstID:]] = OpExtInstImport "OpenCL.std" -; CHECK-SPIRV: %[[#IntTypeID:]] = OpTypeInt 32 [[#]] -; CHECK-SPIRV: %[[#Int2TypeID:]] = OpTypeVector %[[#IntTypeID]] 2 -; CHECK-SPIRV: %[[#CompositeID:]] = OpCompositeInsert %[[#Int2TypeID]] %[[#]] %[[#]] [[#]] -; CHECK-SPIRV: %[[#ShuffleID:]] = OpVectorShuffle %[[#Int2TypeID]] %[[#CompositeID]] %[[#]] [[#]] [[#]] -; CHECK-SPIRV: %[[#]] = OpExtInst %[[#Int2TypeID]] %[[#SetInstID]] s_min %[[#]] %[[#ShuffleID]] - -define spir_kernel void @test() { -entry: - %call = tail call spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32> , i32 5) #2 - ret void -} - -declare spir_func <2 x i32> @_Z3minDv2_ii(<2 x i32>, i32) From 15df9e701f1f1194a25e6123612cc735ad392ae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Thu, 11 Dec 2025 09:17:43 +0100 Subject: [PATCH 30/49] [AMDGPU][SDAG] Add missing cases for SI_INDIRECT_SRC/DST (#170323) Before this patch, `insertelement/extractelement` with dynamic indices would fail to select with `-O0` for vector 32-bit element types with sizes 3, 5, 6 and 7, which did not map to a `SI_INDIRECT_SRC/DST` pattern. Other "weird" sizes bigger than 8 (like 13) are properly handled already. To solve this issue we add the missing patterns for the problematic sizes. 
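For illustration, a kernel of the following shape (the function name and constants are placeholders, mirroring the style of the regenerated tests below), compiled with `llc -O0 -mtriple=amdgcn -mcpu=fiji`, previously failed instruction selection because <5 x float> had no SI_INDIRECT_SRC pattern; with the added patterns it now selects (the GCN-O0 checks below show the m0 + v_movrels_b32 sequence that -O0 uses for such dynamic extracts):

  define amdgpu_kernel void @float5_extelt_example(ptr addrspace(1) %out, i32 %sel) {
  entry:
    %ext = extractelement <5 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0>, i32 %sel
    store float %ext, ptr addrspace(1) %out
    ret void
  }

The insertelement path is analogous and is covered by the corresponding SI_INDIRECT_DST patterns.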
Solves SWDEV-568862 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 + llvm/lib/Target/AMDGPU/SIInstructions.td | 16 + .../CodeGen/AMDGPU/extract_vector_dynelt.ll | 3310 +++++++++ .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 5963 +++++++++++++++++ 4 files changed, 9297 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4651d7d9d3adf..76bbb30b85a78 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6304,7 +6304,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case AMDGPU::SI_INDIRECT_SRC_V1: case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V3: case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V5: + case AMDGPU::SI_INDIRECT_SRC_V6: + case AMDGPU::SI_INDIRECT_SRC_V7: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V9: case AMDGPU::SI_INDIRECT_SRC_V10: @@ -6315,7 +6319,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitIndirectSrc(MI, *BB, *getSubtarget()); case AMDGPU::SI_INDIRECT_DST_V1: case AMDGPU::SI_INDIRECT_DST_V2: + case AMDGPU::SI_INDIRECT_DST_V3: case AMDGPU::SI_INDIRECT_DST_V4: + case AMDGPU::SI_INDIRECT_DST_V5: + case AMDGPU::SI_INDIRECT_DST_V6: + case AMDGPU::SI_INDIRECT_DST_V7: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V9: case AMDGPU::SI_INDIRECT_DST_V10: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 984d1a4db4cd6..643b2463344e5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -969,7 +969,11 @@ class SI_INDIRECT_DST : VPseudoInstSI < def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC; +def SI_INDIRECT_SRC_V3 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC; +def SI_INDIRECT_SRC_V5 : SI_INDIRECT_SRC; +def SI_INDIRECT_SRC_V6 : SI_INDIRECT_SRC; +def SI_INDIRECT_SRC_V7 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC; @@ -980,7 +984,11 @@ def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC; def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V3 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V5 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V6 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V7 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST; @@ -2779,7 +2787,11 @@ multiclass SI_INDIRECT_Pattern { } defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern; defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern; +defm : SI_INDIRECT_Pattern; +defm : SI_INDIRECT_Pattern; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; @@ -2789,7 +2801,11 @@ defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern; defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern; +defm : SI_INDIRECT_Pattern; +defm : SI_INDIRECT_Pattern; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index c69b0cce3d208..4b340f308d5f6 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN-O0 %s define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float4_extelt: @@ -20,6 +21,30 @@ define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float4_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 4.0 +; GCN-O0-NEXT: s_mov_b32 s4, 2.0 +; GCN-O0-NEXT: s_mov_b32 s5, 1.0 +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s3 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -43,6 +68,30 @@ define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: int4_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 4 +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: s_mov_b32 s5, 1 +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s3 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x i32> , i32 %sel store i32 %ext, ptr addrspace(1) %out @@ -72,6 +121,72 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double4_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 0x40100a3d +; GCN-O0-NEXT: s_mov_b32 s4, 0x70a3d70a +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s12, s4 +; GCN-O0-NEXT: s_mov_b32 s6, 0x4000147a +; GCN-O0-NEXT: s_mov_b32 s4, 0xe147ae14 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; 
GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s13, s5 +; GCN-O0-NEXT: s_mov_b32 s14, s4 +; GCN-O0-NEXT: s_mov_b32 s6, 0x3ff028f5 +; GCN-O0-NEXT: s_mov_b32 s4, 0xc28f5c29 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s15, s5 +; GCN-O0-NEXT: s_mov_b32 s16, s4 +; GCN-O0-NEXT: s_mov_b32 s6, 0x3f847ae1 +; GCN-O0-NEXT: s_mov_b32 s4, 0x47ae147b +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s17, s5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s16 +; GCN-O0-NEXT: s_mov_b32 s7, s15 +; GCN-O0-NEXT: s_mov_b32 s8, s14 +; GCN-O0-NEXT: s_mov_b32 s9, s13 +; GCN-O0-NEXT: s_mov_b32 s10, s12 +; GCN-O0-NEXT: s_mov_b32 s11, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s11 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -109,6 +224,113 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double5_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 0x40140a3d +; GCN-O0-NEXT: s_mov_b32 s4, 0x70a3d70a +; GCN-O0-NEXT: s_mov_b32 s6, s4 +; GCN-O0-NEXT: s_mov_b32 s7, s3 +; GCN-O0-NEXT: s_mov_b32 s25, s7 +; GCN-O0-NEXT: s_mov_b32 s26, s6 +; GCN-O0-NEXT: s_mov_b32 s3, 0x40100a3d +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s27, s5 +; GCN-O0-NEXT: s_mov_b32 s28, s4 +; GCN-O0-NEXT: s_mov_b32 s3, 0x4000147a +; GCN-O0-NEXT: s_mov_b32 s4, 0xe147ae14 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s29, s5 +; GCN-O0-NEXT: s_mov_b32 s30, s4 +; GCN-O0-NEXT: s_mov_b32 s3, 0x3ff028f5 +; GCN-O0-NEXT: s_mov_b32 s4, 0xc28f5c29 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s31, s5 +; GCN-O0-NEXT: s_mov_b32 s33, s4 +; GCN-O0-NEXT: s_mov_b32 s3, 0x3f847ae1 +; GCN-O0-NEXT: s_mov_b32 s4, 0x47ae147b +; 
GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s34, s5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GCN-O0-NEXT: ; implicit-def: $sgpr24 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr23 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr22 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr21 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr20 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s5, s34 +; GCN-O0-NEXT: s_mov_b32 s6, s33 +; GCN-O0-NEXT: s_mov_b32 s7, s31 +; GCN-O0-NEXT: s_mov_b32 s8, s30 +; GCN-O0-NEXT: s_mov_b32 s9, s29 +; GCN-O0-NEXT: s_mov_b32 s10, s28 +; GCN-O0-NEXT: s_mov_b32 s11, s27 +; GCN-O0-NEXT: s_mov_b32 s12, s26 +; GCN-O0-NEXT: s_mov_b32 s13, s25 +; GCN-O0-NEXT: s_mov_b32 s14, s24 +; GCN-O0-NEXT: s_mov_b32 s15, s23 +; GCN-O0-NEXT: s_mov_b32 s16, s22 +; GCN-O0-NEXT: s_mov_b32 s17, s21 +; GCN-O0-NEXT: s_mov_b32 s18, s20 +; GCN-O0-NEXT: s_mov_b32 s19, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s19 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s19 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <5 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -130,6 +352,25 @@ define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: half4_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x2c +; GCN-O0-NEXT: 
s_mov_b32 s5, 0x44004200 +; GCN-O0-NEXT: s_mov_b32 s0, 0x40003c00 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s5, 4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 +; GCN-O0-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_short v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x half> , i32 %sel store half %ext, ptr addrspace(1) %out @@ -149,6 +390,24 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float2_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 1.0 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <2 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -172,6 +431,48 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double2_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 0x3ff028f5 +; GCN-O0-NEXT: s_mov_b32 s4, 0xc28f5c29 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s3 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s8, s4 +; GCN-O0-NEXT: s_mov_b32 s6, 0x3f847ae1 +; GCN-O0-NEXT: s_mov_b32 s4, 0x47ae147b +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s9, s5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s5, s9 +; GCN-O0-NEXT: s_mov_b32 s6, s8 +; GCN-O0-NEXT: s_mov_b32 s7, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <2 x double> , i32 %sel store 
double %ext, ptr addrspace(1) %out @@ -217,6 +518,60 @@ define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: half8_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 +; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-O0-NEXT: s_mov_b32 s3, 0x4000 +; GCN-O0-NEXT: s_mov_b32 s6, 0x3c00 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GCN-O0-NEXT: s_mov_b32 s3, 2 +; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 +; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-O0-NEXT: s_mov_b32 s3, 0x4200 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GCN-O0-NEXT: s_mov_b32 s3, 3 +; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 +; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-O0-NEXT: s_mov_b32 s3, 0x4400 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GCN-O0-NEXT: s_mov_b32 s3, 4 +; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 +; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-O0-NEXT: s_mov_b32 s3, 0x4500 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GCN-O0-NEXT: s_mov_b32 s3, 5 +; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 +; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-O0-NEXT: s_mov_b32 s3, 0x4600 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GCN-O0-NEXT: s_mov_b32 s3, 6 +; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 +; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-O0-NEXT: s_mov_b32 s3, 0x4700 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GCN-O0-NEXT: s_mov_b32 s3, 7 +; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 +; GCN-O0-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-O0-NEXT: s_mov_b32 s4, 0x4800 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[2:3] +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_short v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x half> , i32 %sel store half %ext, ptr addrspace(1) %out @@ -248,6 +603,39 @@ define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: short8_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_cmp_eq_u32 s0, s1 +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 +; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 3 +; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 +; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 4 +; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 +; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 5 +; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 +; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 6 +; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 +; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 7 +; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 +; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s0, 8 +; 
GCN-O0-NEXT: s_cselect_b32 s0, s0, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_short v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x i16> , i32 %sel store i16 %ext, ptr addrspace(1) %out @@ -274,6 +662,42 @@ define amdgpu_kernel void @float8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float8_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 0x41000000 +; GCN-O0-NEXT: s_mov_b32 s4, 0x40e00000 +; GCN-O0-NEXT: s_mov_b32 s5, 0x40c00000 +; GCN-O0-NEXT: s_mov_b32 s6, 0x40a00000 +; GCN-O0-NEXT: s_mov_b32 s7, 4.0 +; GCN-O0-NEXT: s_mov_b32 s8, 0x40400000 +; GCN-O0-NEXT: s_mov_b32 s9, 2.0 +; GCN-O0-NEXT: s_mov_b32 s10, 1.0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s3 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v11 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v8 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -325,6 +749,101 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[15:16] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double8_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x40200000 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s20, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x401c0000 +; GCN-O0-NEXT: s_mov_b32 s21, s5 +; GCN-O0-NEXT: s_mov_b32 s22, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x40180000 +; GCN-O0-NEXT: s_mov_b32 s23, s5 +; GCN-O0-NEXT: s_mov_b32 s24, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x40140000 +; GCN-O0-NEXT: s_mov_b32 s25, s5 +; GCN-O0-NEXT: s_mov_b32 s26, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 4.0 +; GCN-O0-NEXT: s_mov_b32 s27, s5 +; GCN-O0-NEXT: s_mov_b32 s28, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x40080000 +; GCN-O0-NEXT: s_mov_b32 s29, s5 +; GCN-O0-NEXT: s_mov_b32 s30, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 2.0 +; GCN-O0-NEXT: s_mov_b32 s31, s5 +; GCN-O0-NEXT: s_mov_b32 s33, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 +; GCN-O0-NEXT: s_mov_b32 s34, s5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s5, s34 +; GCN-O0-NEXT: s_mov_b32 s6, s33 +; GCN-O0-NEXT: s_mov_b32 s7, s31 +; GCN-O0-NEXT: s_mov_b32 s8, s30 +; GCN-O0-NEXT: s_mov_b32 s9, s29 +; GCN-O0-NEXT: s_mov_b32 s10, s28 +; GCN-O0-NEXT: s_mov_b32 s11, s27 +; GCN-O0-NEXT: s_mov_b32 s12, s26 +; GCN-O0-NEXT: s_mov_b32 s13, s25 +; GCN-O0-NEXT: s_mov_b32 s14, s24 +; GCN-O0-NEXT: s_mov_b32 s15, s23 +; GCN-O0-NEXT: s_mov_b32 s16, s22 +; GCN-O0-NEXT: s_mov_b32 s17, s21 +; GCN-O0-NEXT: s_mov_b32 s18, s20 +; GCN-O0-NEXT: s_mov_b32 s19, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s19 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s19 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -374,6 +893,101 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[15:16] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double7_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x401c0000 +; GCN-O0-NEXT: s_mov_b32 s21, s5 +; GCN-O0-NEXT: s_mov_b32 s22, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x40180000 +; GCN-O0-NEXT: s_mov_b32 s23, s5 +; GCN-O0-NEXT: s_mov_b32 s24, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x40140000 +; GCN-O0-NEXT: s_mov_b32 s25, s5 +; GCN-O0-NEXT: s_mov_b32 s26, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 4.0 +; GCN-O0-NEXT: s_mov_b32 s27, s5 +; GCN-O0-NEXT: s_mov_b32 s28, s4 +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x40080000 +; GCN-O0-NEXT: s_mov_b32 s29, s5 +; GCN-O0-NEXT: s_mov_b32 s30, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 2.0 +; GCN-O0-NEXT: 
s_mov_b32 s31, s5 +; GCN-O0-NEXT: s_mov_b32 s33, s4 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 +; GCN-O0-NEXT: s_mov_b32 s34, s5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GCN-O0-NEXT: ; implicit-def: $sgpr20 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s5, s34 +; GCN-O0-NEXT: s_mov_b32 s6, s33 +; GCN-O0-NEXT: s_mov_b32 s7, s31 +; GCN-O0-NEXT: s_mov_b32 s8, s30 +; GCN-O0-NEXT: s_mov_b32 s9, s29 +; GCN-O0-NEXT: s_mov_b32 s10, s28 +; GCN-O0-NEXT: s_mov_b32 s11, s27 +; GCN-O0-NEXT: s_mov_b32 s12, s26 +; GCN-O0-NEXT: s_mov_b32 s13, s25 +; GCN-O0-NEXT: s_mov_b32 s14, s24 +; GCN-O0-NEXT: s_mov_b32 s15, s23 +; GCN-O0-NEXT: s_mov_b32 s16, s22 +; GCN-O0-NEXT: s_mov_b32 s17, s21 +; GCN-O0-NEXT: s_mov_b32 s18, s20 +; GCN-O0-NEXT: s_mov_b32 s19, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s19 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s19 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <7 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -408,6 +1022,66 @@ define amdgpu_kernel void @float16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float16_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 0x41800000 +; GCN-O0-NEXT: s_mov_b32 s4, 0x41700000 +; GCN-O0-NEXT: s_mov_b32 s5, 0x41600000 +; GCN-O0-NEXT: s_mov_b32 s6, 0x41500000 +; GCN-O0-NEXT: s_mov_b32 s7, 0x41400000 +; GCN-O0-NEXT: s_mov_b32 s8, 0x41300000 +; GCN-O0-NEXT: s_mov_b32 s9, 0x41200000 +; GCN-O0-NEXT: s_mov_b32 s10, 0x41100000 +; 
GCN-O0-NEXT: s_mov_b32 s11, 0x41000000 +; GCN-O0-NEXT: s_mov_b32 s12, 0x40e00000 +; GCN-O0-NEXT: s_mov_b32 s13, 0x40c00000 +; GCN-O0-NEXT: s_mov_b32 s14, 0x40a00000 +; GCN-O0-NEXT: s_mov_b32 s15, 4.0 +; GCN-O0-NEXT: s_mov_b32 s16, 0x40400000 +; GCN-O0-NEXT: s_mov_b32 s17, 2.0 +; GCN-O0-NEXT: s_mov_b32 s18, 1.0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s3 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v27 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v8, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v19 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v16 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <16 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -489,6 +1163,181 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double15_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_mov_b32 s7, 0x402e0000 +; GCN-O0-NEXT: s_mov_b32 s5, s7 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s8, 0 +; GCN-O0-NEXT: s_mov_b32 s9, 0x402c0000 +; GCN-O0-NEXT: s_mov_b32 s7, s9 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s10, 0 +; GCN-O0-NEXT: s_mov_b32 s11, 0x402a0000 +; GCN-O0-NEXT: s_mov_b32 s9, s11 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 killed $sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s12, 0 +; GCN-O0-NEXT: s_mov_b32 s13, 0x40280000 +; GCN-O0-NEXT: s_mov_b32 s11, s13 +; GCN-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GCN-O0-NEXT: s_mov_b32 s14, 0 +; GCN-O0-NEXT: s_mov_b32 s15, 0x40260000 +; GCN-O0-NEXT: s_mov_b32 s13, s15 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 killed $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s16, 0 +; GCN-O0-NEXT: s_mov_b32 s17, 0x40240000 +; GCN-O0-NEXT: s_mov_b32 s15, s17 +; GCN-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 +; GCN-O0-NEXT: s_mov_b32 s18, 0 +; GCN-O0-NEXT: s_mov_b32 s19, 0x40220000 +; 
GCN-O0-NEXT: s_mov_b32 s17, s19 +; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 killed $sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s20, 0 +; GCN-O0-NEXT: s_mov_b32 s21, 0x40200000 +; GCN-O0-NEXT: s_mov_b32 s19, s21 +; GCN-O0-NEXT: ; kill: def $sgpr20 killed $sgpr20 killed $sgpr20_sgpr21 +; GCN-O0-NEXT: s_mov_b32 s22, 0 +; GCN-O0-NEXT: s_mov_b32 s23, 0x401c0000 +; GCN-O0-NEXT: s_mov_b32 s21, s23 +; GCN-O0-NEXT: ; kill: def $sgpr22 killed $sgpr22 killed $sgpr22_sgpr23 +; GCN-O0-NEXT: s_mov_b32 s24, 0 +; GCN-O0-NEXT: s_mov_b32 s25, 0x40180000 +; GCN-O0-NEXT: s_mov_b32 s23, s25 +; GCN-O0-NEXT: ; kill: def $sgpr24 killed $sgpr24 killed $sgpr24_sgpr25 +; GCN-O0-NEXT: s_mov_b32 s26, 0 +; GCN-O0-NEXT: s_mov_b32 s27, 0x40140000 +; GCN-O0-NEXT: s_mov_b32 s25, s27 +; GCN-O0-NEXT: ; kill: def $sgpr26 killed $sgpr26 killed $sgpr26_sgpr27 +; GCN-O0-NEXT: s_mov_b64 s[28:29], 4.0 +; GCN-O0-NEXT: s_mov_b32 s27, s29 +; GCN-O0-NEXT: ; kill: def $sgpr28 killed $sgpr28 killed $sgpr28_sgpr29 +; GCN-O0-NEXT: s_mov_b32 s30, 0 +; GCN-O0-NEXT: s_mov_b32 s31, 0x40080000 +; GCN-O0-NEXT: s_mov_b32 s29, s31 +; GCN-O0-NEXT: ; kill: def $sgpr30 killed $sgpr30 killed $sgpr30_sgpr31 +; GCN-O0-NEXT: s_mov_b64 s[34:35], 2.0 +; GCN-O0-NEXT: s_mov_b32 s31, s35 +; GCN-O0-NEXT: s_mov_b32 s33, s34 +; GCN-O0-NEXT: s_mov_b64 s[36:37], 1.0 +; GCN-O0-NEXT: s_mov_b32 s34, s37 +; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37 +; GCN-O0-NEXT: ; implicit-def: $sgpr4 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr3 +; GCN-O0-NEXT: ; implicit-def: $sgpr35 +; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GCN-O0-NEXT: s_mov_b32 s37, s34 +; GCN-O0-NEXT: s_mov_b32 s38, s33 +; GCN-O0-NEXT: s_mov_b32 s39, s31 +; GCN-O0-NEXT: s_mov_b32 s40, s30 +; GCN-O0-NEXT: s_mov_b32 s41, s29 +; GCN-O0-NEXT: s_mov_b32 s42, s28 +; GCN-O0-NEXT: s_mov_b32 s43, s27 +; GCN-O0-NEXT: s_mov_b32 s44, s26 +; GCN-O0-NEXT: s_mov_b32 s45, s25 +; GCN-O0-NEXT: s_mov_b32 s46, s24 +; GCN-O0-NEXT: s_mov_b32 s47, s23 +; GCN-O0-NEXT: s_mov_b32 s48, s22 +; GCN-O0-NEXT: s_mov_b32 s49, s21 +; GCN-O0-NEXT: s_mov_b32 s50, s20 +; GCN-O0-NEXT: s_mov_b32 s51, s19 +; GCN-O0-NEXT: s_mov_b32 s52, s18 +; GCN-O0-NEXT: s_mov_b32 s53, s17 +; GCN-O0-NEXT: s_mov_b32 s54, s16 +; GCN-O0-NEXT: s_mov_b32 s55, s15 +; GCN-O0-NEXT: s_mov_b32 s56, s14 +; GCN-O0-NEXT: s_mov_b32 s57, s13 +; GCN-O0-NEXT: s_mov_b32 s58, s12 +; GCN-O0-NEXT: s_mov_b32 s59, s11 +; GCN-O0-NEXT: s_mov_b32 s60, s10 +; GCN-O0-NEXT: s_mov_b32 s61, s9 +; GCN-O0-NEXT: s_mov_b32 s62, s8 +; GCN-O0-NEXT: s_mov_b32 s63, s7 +; GCN-O0-NEXT: s_mov_b32 s64, s6 +; GCN-O0-NEXT: s_mov_b32 s65, s5 +; GCN-O0-NEXT: s_mov_b32 s66, s4 +; GCN-O0-NEXT: s_mov_b32 s67, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s46 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s47 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s48 +; 
GCN-O0-NEXT: v_mov_b32_e32 v13, s49 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s50 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s51 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s52 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s54 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s55 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s56 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s57 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s58 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s59 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s60 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s61 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s62 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s63 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s64 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s65 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 +; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s46 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s47 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s48 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s49 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s50 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s51 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s52 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s54 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s55 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s56 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s57 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s58 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s59 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s60 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s61 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s62 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s63 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s64 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s65 +; GCN-O0-NEXT: v_mov_b32_e32 v31, s66 +; GCN-O0-NEXT: v_mov_b32_e32 v32, s67 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <15 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -572,6 +1421,181 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double16_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s5, 0x40300000 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_mov_b32 s7, 0x402e0000 +; GCN-O0-NEXT: s_mov_b32 s5, s7 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s8, 0 +; GCN-O0-NEXT: s_mov_b32 s9, 0x402c0000 +; GCN-O0-NEXT: s_mov_b32 s7, s9 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s10, 0 +; GCN-O0-NEXT: s_mov_b32 s11, 0x402a0000 +; GCN-O0-NEXT: s_mov_b32 s9, s11 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 killed 
$sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s12, 0 +; GCN-O0-NEXT: s_mov_b32 s13, 0x40280000 +; GCN-O0-NEXT: s_mov_b32 s11, s13 +; GCN-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 +; GCN-O0-NEXT: s_mov_b32 s14, 0 +; GCN-O0-NEXT: s_mov_b32 s15, 0x40260000 +; GCN-O0-NEXT: s_mov_b32 s13, s15 +; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 killed $sgpr14_sgpr15 +; GCN-O0-NEXT: s_mov_b32 s16, 0 +; GCN-O0-NEXT: s_mov_b32 s17, 0x40240000 +; GCN-O0-NEXT: s_mov_b32 s15, s17 +; GCN-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 +; GCN-O0-NEXT: s_mov_b32 s18, 0 +; GCN-O0-NEXT: s_mov_b32 s19, 0x40220000 +; GCN-O0-NEXT: s_mov_b32 s17, s19 +; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 killed $sgpr18_sgpr19 +; GCN-O0-NEXT: s_mov_b32 s20, 0 +; GCN-O0-NEXT: s_mov_b32 s21, 0x40200000 +; GCN-O0-NEXT: s_mov_b32 s19, s21 +; GCN-O0-NEXT: ; kill: def $sgpr20 killed $sgpr20 killed $sgpr20_sgpr21 +; GCN-O0-NEXT: s_mov_b32 s22, 0 +; GCN-O0-NEXT: s_mov_b32 s23, 0x401c0000 +; GCN-O0-NEXT: s_mov_b32 s21, s23 +; GCN-O0-NEXT: ; kill: def $sgpr22 killed $sgpr22 killed $sgpr22_sgpr23 +; GCN-O0-NEXT: s_mov_b32 s24, 0 +; GCN-O0-NEXT: s_mov_b32 s25, 0x40180000 +; GCN-O0-NEXT: s_mov_b32 s23, s25 +; GCN-O0-NEXT: ; kill: def $sgpr24 killed $sgpr24 killed $sgpr24_sgpr25 +; GCN-O0-NEXT: s_mov_b32 s26, 0 +; GCN-O0-NEXT: s_mov_b32 s27, 0x40140000 +; GCN-O0-NEXT: s_mov_b32 s25, s27 +; GCN-O0-NEXT: ; kill: def $sgpr26 killed $sgpr26 killed $sgpr26_sgpr27 +; GCN-O0-NEXT: s_mov_b64 s[28:29], 4.0 +; GCN-O0-NEXT: s_mov_b32 s27, s29 +; GCN-O0-NEXT: ; kill: def $sgpr28 killed $sgpr28 killed $sgpr28_sgpr29 +; GCN-O0-NEXT: s_mov_b32 s30, 0 +; GCN-O0-NEXT: s_mov_b32 s31, 0x40080000 +; GCN-O0-NEXT: s_mov_b32 s29, s31 +; GCN-O0-NEXT: ; kill: def $sgpr30 killed $sgpr30 killed $sgpr30_sgpr31 +; GCN-O0-NEXT: s_mov_b64 s[34:35], 2.0 +; GCN-O0-NEXT: s_mov_b32 s31, s35 +; GCN-O0-NEXT: s_mov_b32 s33, s34 +; GCN-O0-NEXT: s_mov_b64 s[36:37], 1.0 +; GCN-O0-NEXT: s_mov_b32 s34, s37 +; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37 +; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GCN-O0-NEXT: s_mov_b32 s37, s34 +; GCN-O0-NEXT: s_mov_b32 s38, s33 +; GCN-O0-NEXT: s_mov_b32 s39, s31 +; GCN-O0-NEXT: s_mov_b32 s40, s30 +; GCN-O0-NEXT: s_mov_b32 s41, s29 +; GCN-O0-NEXT: s_mov_b32 s42, s28 +; GCN-O0-NEXT: s_mov_b32 s43, s27 +; GCN-O0-NEXT: s_mov_b32 s44, s26 +; GCN-O0-NEXT: s_mov_b32 s45, s25 +; GCN-O0-NEXT: s_mov_b32 s46, s24 +; GCN-O0-NEXT: s_mov_b32 s47, s23 +; GCN-O0-NEXT: s_mov_b32 s48, s22 +; GCN-O0-NEXT: s_mov_b32 s49, s21 +; GCN-O0-NEXT: s_mov_b32 s50, s20 +; GCN-O0-NEXT: s_mov_b32 s51, s19 +; GCN-O0-NEXT: s_mov_b32 s52, s18 +; GCN-O0-NEXT: s_mov_b32 s53, s17 +; GCN-O0-NEXT: s_mov_b32 s54, s16 +; GCN-O0-NEXT: s_mov_b32 s55, s15 +; GCN-O0-NEXT: s_mov_b32 s56, s14 +; GCN-O0-NEXT: s_mov_b32 s57, s13 +; GCN-O0-NEXT: s_mov_b32 s58, s12 +; GCN-O0-NEXT: s_mov_b32 s59, s11 +; GCN-O0-NEXT: s_mov_b32 s60, s10 +; GCN-O0-NEXT: s_mov_b32 s61, s9 +; GCN-O0-NEXT: s_mov_b32 s62, s8 +; GCN-O0-NEXT: s_mov_b32 s63, s7 +; GCN-O0-NEXT: s_mov_b32 s64, s6 +; GCN-O0-NEXT: s_mov_b32 s65, s5 +; GCN-O0-NEXT: s_mov_b32 s66, s4 +; GCN-O0-NEXT: s_mov_b32 s67, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: 
v_mov_b32_e32 v0, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s46 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s47 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s48 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s49 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s50 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s51 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s52 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s54 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s55 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s56 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s57 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s58 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s59 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s60 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s61 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s62 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s63 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s64 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s65 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 +; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s46 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s47 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s48 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s49 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s50 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s51 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s52 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s54 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s55 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s56 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s57 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s58 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s59 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s60 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s61 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s62 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s63 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s64 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s65 +; GCN-O0-NEXT: v_mov_b32_e32 v31, s66 +; GCN-O0-NEXT: v_mov_b32_e32 v32, s67 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <16 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -622,6 +1646,114 @@ define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float32_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 0x42000000 +; GCN-O0-NEXT: s_mov_b32 s4, 0x41f80000 +; GCN-O0-NEXT: s_mov_b32 s5, 0x41f00000 +; GCN-O0-NEXT: s_mov_b32 s6, 0x41e80000 +; GCN-O0-NEXT: s_mov_b32 s7, 0x41e00000 +; GCN-O0-NEXT: 
s_mov_b32 s8, 0x41d80000 +; GCN-O0-NEXT: s_mov_b32 s9, 0x41d00000 +; GCN-O0-NEXT: s_mov_b32 s10, 0x41c80000 +; GCN-O0-NEXT: s_mov_b32 s11, 0x41c00000 +; GCN-O0-NEXT: s_mov_b32 s12, 0x41b80000 +; GCN-O0-NEXT: s_mov_b32 s13, 0x41b00000 +; GCN-O0-NEXT: s_mov_b32 s14, 0x41a80000 +; GCN-O0-NEXT: s_mov_b32 s15, 0x41a00000 +; GCN-O0-NEXT: s_mov_b32 s16, 0x41980000 +; GCN-O0-NEXT: s_mov_b32 s17, 0x41900000 +; GCN-O0-NEXT: s_mov_b32 s18, 0x41880000 +; GCN-O0-NEXT: s_mov_b32 s19, 0x41800000 +; GCN-O0-NEXT: s_mov_b32 s20, 0x41700000 +; GCN-O0-NEXT: s_mov_b32 s21, 0x41600000 +; GCN-O0-NEXT: s_mov_b32 s22, 0x41500000 +; GCN-O0-NEXT: s_mov_b32 s23, 0x41400000 +; GCN-O0-NEXT: s_mov_b32 s24, 0x41300000 +; GCN-O0-NEXT: s_mov_b32 s25, 0x41200000 +; GCN-O0-NEXT: s_mov_b32 s26, 0x41100000 +; GCN-O0-NEXT: s_mov_b32 s27, 0x41000000 +; GCN-O0-NEXT: s_mov_b32 s28, 0x40e00000 +; GCN-O0-NEXT: s_mov_b32 s29, 0x40c00000 +; GCN-O0-NEXT: s_mov_b32 s30, 0x40a00000 +; GCN-O0-NEXT: s_mov_b32 s31, 4.0 +; GCN-O0-NEXT: s_mov_b32 s33, 0x40400000 +; GCN-O0-NEXT: s_mov_b32 s34, 2.0 +; GCN-O0-NEXT: s_mov_b32 s35, 1.0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s35 +; GCN-O0-NEXT: v_mov_b32_e32 v62, s34 +; GCN-O0-NEXT: v_mov_b32_e32 v61, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v60, s31 +; GCN-O0-NEXT: v_mov_b32_e32 v59, s30 +; GCN-O0-NEXT: v_mov_b32_e32 v58, s29 +; GCN-O0-NEXT: v_mov_b32_e32 v57, s28 +; GCN-O0-NEXT: v_mov_b32_e32 v56, s27 +; GCN-O0-NEXT: v_mov_b32_e32 v55, s26 +; GCN-O0-NEXT: v_mov_b32_e32 v54, s25 +; GCN-O0-NEXT: v_mov_b32_e32 v53, s24 +; GCN-O0-NEXT: v_mov_b32_e32 v52, s23 +; GCN-O0-NEXT: v_mov_b32_e32 v51, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v50, s21 +; GCN-O0-NEXT: v_mov_b32_e32 v49, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v48, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v47, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v46, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v45, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v44, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v43, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v42, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v41, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v40, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v39, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v38, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v37, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v36, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v35, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v34, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v33, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v32, s3 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v62 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v61 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v60 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v59 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v58 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v57 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v56 +; GCN-O0-NEXT: v_mov_b32_e32 v8, v55 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v54 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v53 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v52 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v51 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v50 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v49 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v48 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v47 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v46 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v43 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v42 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v40 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v39 +; 
GCN-O0-NEXT: v_mov_b32_e32 v25, v38 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v37 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v36 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v35 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v32 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <32 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -643,6 +1775,25 @@ define amdgpu_kernel void @byte8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: byte8_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s1, 3 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s4, s0, s1 +; GCN-O0-NEXT: s_mov_b32 s5, 0x8070605 +; GCN-O0-NEXT: s_mov_b32 s0, 0x4030201 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x i8> , i32 %sel store i8 %ext, ptr addrspace(1) %out @@ -690,6 +1841,61 @@ define amdgpu_kernel void @byte16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: byte16_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s14, -1 +; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 +; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 15 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_and_b32 s3, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_or_b32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 16 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:15 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 15 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:14 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 14 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:13 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 13 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 12 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:11 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 11 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 10 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:9 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 9 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 8 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:7 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 7 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 6 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:5 +; GCN-O0-NEXT: 
v_mov_b32_e32 v0, 5 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 3 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: buffer_load_ubyte v2, v0, s[12:15], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <16 x i8> , i32 %sel store i8 %ext, ptr addrspace(1) %out @@ -710,6 +1916,23 @@ define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: bit4_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s1, 3 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s1, s0, s1 +; GCN-O0-NEXT: s_mov_b32 s0, 0x1000100 +; GCN-O0-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_and_b32 s0, s0, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x i1> , i32 %sel %zext = zext i1 %ext to i32 @@ -985,6 +2208,161 @@ define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: bit128_extelt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s14, -1 +; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 +; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: s_mov_b32 s3, 0x7f +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_and_b32 s3, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:127 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:126 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:125 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:124 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:123 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:122 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:121 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:120 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:119 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:118 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:117 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:116 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:115 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:114 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:113 +; 
GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:112 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:111 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:110 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:109 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:108 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:107 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:106 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:105 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:104 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:103 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:102 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:101 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:100 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:99 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:98 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:97 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:96 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:95 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:94 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:93 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:92 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:91 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:90 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:89 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:88 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:87 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:86 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:85 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:84 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:83 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:82 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:81 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:80 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:79 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:78 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:77 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:76 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:75 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:74 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:73 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:72 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:71 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:70 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:69 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:68 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:67 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:66 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:65 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:64 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:63 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:62 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:61 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:60 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:59 +; GCN-O0-NEXT: buffer_store_byte 
v0, off, s[12:15], 0 offset:58 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:57 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:56 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:55 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:54 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:53 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:52 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:51 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:50 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:49 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:48 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:47 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:46 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:45 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:44 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:43 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:42 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:41 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:40 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:39 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:38 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:37 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:36 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:35 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:34 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:33 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:32 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:31 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:30 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:29 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:28 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:27 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:26 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:25 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:24 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:23 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:22 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:21 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:20 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:19 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:18 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:17 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:16 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:15 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:14 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:13 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:12 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:11 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:10 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:9 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:8 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:7 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:6 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:5 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:4 +; GCN-O0-NEXT: 
buffer_store_byte v1, off, s[12:15], 0 offset:3 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:1 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: buffer_load_ubyte v0, v0, s[12:15], 0 offen +; GCN-O0-NEXT: s_mov_b32 s2, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v2, v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <128 x i1> , i32 %sel %zext = zext i1 %ext to i32 @@ -1088,6 +2466,253 @@ define float @float32_extelt_vec(i32 %sel) { ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 31, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: float32_extelt_vec: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, 0x42000000 +; GCN-O0-NEXT: s_mov_b32 s5, 0x41f80000 +; GCN-O0-NEXT: s_mov_b32 s6, 0x41f00000 +; GCN-O0-NEXT: s_mov_b32 s7, 0x41e80000 +; GCN-O0-NEXT: s_mov_b32 s8, 0x41e00000 +; GCN-O0-NEXT: s_mov_b32 s9, 0x41d80000 +; GCN-O0-NEXT: s_mov_b32 s10, 0x41d00000 +; GCN-O0-NEXT: s_mov_b32 s11, 0x41c80000 +; GCN-O0-NEXT: s_mov_b32 s12, 0x41c00000 +; GCN-O0-NEXT: s_mov_b32 s13, 0x41b80000 +; GCN-O0-NEXT: s_mov_b32 s14, 0x41b00000 +; GCN-O0-NEXT: s_mov_b32 s15, 0x41a80000 +; GCN-O0-NEXT: s_mov_b32 s16, 0x41a00000 +; GCN-O0-NEXT: s_mov_b32 s17, 0x41980000 +; GCN-O0-NEXT: s_mov_b32 s18, 0x41900000 +; GCN-O0-NEXT: s_mov_b32 s19, 0x41880000 +; GCN-O0-NEXT: s_mov_b32 s20, 0x41800000 +; GCN-O0-NEXT: s_mov_b32 s21, 0x41700000 +; GCN-O0-NEXT: s_mov_b32 s22, 0x41600000 +; GCN-O0-NEXT: s_mov_b32 s23, 0x41500000 +; GCN-O0-NEXT: s_mov_b32 s24, 0x41400000 +; GCN-O0-NEXT: s_mov_b32 s25, 0x41300000 +; GCN-O0-NEXT: s_mov_b32 s26, 0x41200000 +; GCN-O0-NEXT: s_mov_b32 s27, 
0x41100000 +; GCN-O0-NEXT: s_mov_b32 s28, 0x41000000 +; GCN-O0-NEXT: s_mov_b32 s29, 0x40e00000 +; GCN-O0-NEXT: s_mov_b32 s40, 0x40c00000 +; GCN-O0-NEXT: s_mov_b32 s41, 0x40a00000 +; GCN-O0-NEXT: s_mov_b32 s42, 4.0 +; GCN-O0-NEXT: s_mov_b32 s43, 0x40400000 +; GCN-O0-NEXT: s_mov_b32 s44, 2.0 +; GCN-O0-NEXT: s_mov_b32 s45, 1.0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v62, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v61, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v60, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v59, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v58, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v57, s29 +; GCN-O0-NEXT: v_mov_b32_e32 v56, s28 +; GCN-O0-NEXT: v_mov_b32_e32 v47, s27 +; GCN-O0-NEXT: v_mov_b32_e32 v46, s26 +; GCN-O0-NEXT: v_mov_b32_e32 v45, s25 +; GCN-O0-NEXT: v_mov_b32_e32 v44, s24 +; GCN-O0-NEXT: v_mov_b32_e32 v43, s23 +; GCN-O0-NEXT: v_mov_b32_e32 v42, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v41, s21 +; GCN-O0-NEXT: v_mov_b32_e32 v40, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v55, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v54, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v53, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v52, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v51, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v50, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v49, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v48, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v39, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v38, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v37, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v36, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v35, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v34, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v33, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v32, s4 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v62 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v61 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v60 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v59 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v58 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v57 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v56 +; GCN-O0-NEXT: v_mov_b32_e32 v8, v47 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v46 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v43 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v42 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v40 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v55 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v54 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v53 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v52 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v51 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v50 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v49 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v48 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v39 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v38 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v37 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v36 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v35 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v32 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 
; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr63 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v63, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v63, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GCN-O0-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[46:47] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB20_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GCN-O0-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[46:47] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v63, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v63, 3 +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword 
v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v32 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v32 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v63, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v63, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GCN-O0-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[46:47] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB20_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; 
GCN-O0-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[46:47] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v63, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v63, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <32 x float> , i32 %sel ret float %ext @@ -1163,7 +2788,1692 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, 0x40301999 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: double16_extelt_vec: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v34, s36, 0 +; GCN-O0-NEXT: v_writelane_b32 v34, s37, 1 +; GCN-O0-NEXT: v_writelane_b32 v34, s38, 2 +; GCN-O0-NEXT: v_writelane_b32 v34, s39, 3 +; GCN-O0-NEXT: v_writelane_b32 v34, s48, 4 +; GCN-O0-NEXT: v_writelane_b32 v34, s49, 5 +; GCN-O0-NEXT: v_writelane_b32 v34, s50, 6 +; GCN-O0-NEXT: v_writelane_b32 v34, s51, 7 +; GCN-O0-NEXT: v_writelane_b32 v34, s52, 8 +; GCN-O0-NEXT: v_writelane_b32 v34, s53, 9 +; GCN-O0-NEXT: v_writelane_b32 v34, s54, 10 +; GCN-O0-NEXT: v_writelane_b32 v34, s55, 11 +; GCN-O0-NEXT: v_writelane_b32 v34, s64, 12 +; GCN-O0-NEXT: v_writelane_b32 v34, s65, 13 +; GCN-O0-NEXT: v_writelane_b32 v34, s66, 14 +; GCN-O0-NEXT: v_writelane_b32 v34, s67, 15 +; GCN-O0-NEXT: s_mov_b32 s4, 0x40301999 +; GCN-O0-NEXT: s_mov_b32 s40, 0x9999999a +; GCN-O0-NEXT: s_mov_b32 s6, s40 +; GCN-O0-NEXT: s_mov_b32 s7, s4 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; 
GCN-O0-NEXT: s_mov_b32 s6, 0x402e3333 +; GCN-O0-NEXT: s_mov_b32 s22, 0x33333333 +; GCN-O0-NEXT: ; implicit-def: $vgpr35 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v35, s22, 0 +; GCN-O0-NEXT: s_mov_b32 s8, s22 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s8, 0x402c3333 +; GCN-O0-NEXT: s_mov_b32 s10, s22 +; GCN-O0-NEXT: s_mov_b32 s11, s8 +; GCN-O0-NEXT: s_mov_b32 s8, s11 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: s_mov_b32 s10, 0x402a3333 +; GCN-O0-NEXT: s_mov_b32 s12, s22 +; GCN-O0-NEXT: s_mov_b32 s13, s10 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s12, 0x40283333 +; GCN-O0-NEXT: s_mov_b32 s14, s22 +; GCN-O0-NEXT: s_mov_b32 s15, s12 +; GCN-O0-NEXT: s_mov_b32 s12, s15 +; GCN-O0-NEXT: s_mov_b32 s13, s14 +; GCN-O0-NEXT: s_mov_b32 s14, 0x40263333 +; GCN-O0-NEXT: s_mov_b32 s16, s22 +; GCN-O0-NEXT: s_mov_b32 s17, s14 +; GCN-O0-NEXT: s_mov_b32 s14, s17 +; GCN-O0-NEXT: s_mov_b32 s15, s16 +; GCN-O0-NEXT: s_mov_b32 s16, 0x40243333 +; GCN-O0-NEXT: s_mov_b32 s18, s22 +; GCN-O0-NEXT: s_mov_b32 s19, s16 +; GCN-O0-NEXT: s_mov_b32 s16, s19 +; GCN-O0-NEXT: s_mov_b32 s17, s18 +; GCN-O0-NEXT: s_mov_b32 s18, 0x40223333 +; GCN-O0-NEXT: s_mov_b32 s20, s22 +; GCN-O0-NEXT: s_mov_b32 s21, s18 +; GCN-O0-NEXT: s_mov_b32 s18, s21 +; GCN-O0-NEXT: s_mov_b32 s19, s20 +; GCN-O0-NEXT: s_mov_b32 s20, 0x40203333 +; GCN-O0-NEXT: ; kill: def $sgpr22 killed $sgpr22 def $sgpr22_sgpr23 +; GCN-O0-NEXT: s_mov_b32 s23, s20 +; GCN-O0-NEXT: s_mov_b32 s20, s23 +; GCN-O0-NEXT: s_mov_b32 s21, s22 +; GCN-O0-NEXT: s_mov_b32 s22, 0x401c6666 +; GCN-O0-NEXT: s_mov_b32 s42, 0x66666666 +; GCN-O0-NEXT: s_mov_b32 s24, s42 +; GCN-O0-NEXT: s_mov_b32 s25, s22 +; GCN-O0-NEXT: s_mov_b32 s22, s25 +; GCN-O0-NEXT: s_mov_b32 s23, s24 +; GCN-O0-NEXT: s_mov_b32 s24, 0x40186666 +; GCN-O0-NEXT: s_mov_b32 s26, s42 +; GCN-O0-NEXT: s_mov_b32 s27, s24 +; GCN-O0-NEXT: s_mov_b32 s24, s27 +; GCN-O0-NEXT: s_mov_b32 s25, s26 +; GCN-O0-NEXT: s_mov_b32 s26, 0x40146666 +; GCN-O0-NEXT: s_mov_b32 s28, s42 +; GCN-O0-NEXT: s_mov_b32 s29, s26 +; GCN-O0-NEXT: s_mov_b32 s26, s29 +; GCN-O0-NEXT: s_mov_b32 s27, s28 +; GCN-O0-NEXT: s_mov_b32 s28, 0x40106666 +; GCN-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 +; GCN-O0-NEXT: s_mov_b32 s43, s28 +; GCN-O0-NEXT: s_mov_b32 s28, s43 +; GCN-O0-NEXT: s_mov_b32 s29, s42 +; GCN-O0-NEXT: s_mov_b32 s41, 0x4008cccc +; GCN-O0-NEXT: s_mov_b32 s42, 0xcccccccd +; GCN-O0-NEXT: s_mov_b32 s44, s42 +; GCN-O0-NEXT: s_mov_b32 s45, s41 +; GCN-O0-NEXT: s_mov_b32 s72, s45 +; GCN-O0-NEXT: s_mov_b32 s73, s44 +; GCN-O0-NEXT: s_mov_b32 s41, 0x4000cccc +; GCN-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 +; GCN-O0-NEXT: s_mov_b32 s43, s41 +; GCN-O0-NEXT: s_mov_b32 s74, s43 +; GCN-O0-NEXT: s_mov_b32 s75, s42 +; GCN-O0-NEXT: s_mov_b32 s42, 0x3ff19999 +; GCN-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GCN-O0-NEXT: s_mov_b32 s41, s42 +; GCN-O0-NEXT: s_mov_b32 s76, s41 +; GCN-O0-NEXT: s_mov_b32 s36, s40 +; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GCN-O0-NEXT: s_mov_b32 s37, s76 +; GCN-O0-NEXT: s_mov_b32 s38, s75 +; GCN-O0-NEXT: s_mov_b32 s39, s74 +; GCN-O0-NEXT: s_mov_b32 s40, s73 +; GCN-O0-NEXT: s_mov_b32 s41, s72 +; 
GCN-O0-NEXT: s_mov_b32 s42, s29 +; GCN-O0-NEXT: s_mov_b32 s43, s28 +; GCN-O0-NEXT: s_mov_b32 s44, s27 +; GCN-O0-NEXT: s_mov_b32 s45, s26 +; GCN-O0-NEXT: s_mov_b32 s46, s25 +; GCN-O0-NEXT: s_mov_b32 s47, s24 +; GCN-O0-NEXT: s_mov_b32 s48, s23 +; GCN-O0-NEXT: s_mov_b32 s49, s22 +; GCN-O0-NEXT: s_mov_b32 s50, s21 +; GCN-O0-NEXT: s_mov_b32 s51, s20 +; GCN-O0-NEXT: s_mov_b32 s52, s19 +; GCN-O0-NEXT: s_mov_b32 s53, s18 +; GCN-O0-NEXT: s_mov_b32 s54, s17 +; GCN-O0-NEXT: s_mov_b32 s55, s16 +; GCN-O0-NEXT: s_mov_b32 s56, s15 +; GCN-O0-NEXT: s_mov_b32 s57, s14 +; GCN-O0-NEXT: s_mov_b32 s58, s13 +; GCN-O0-NEXT: s_mov_b32 s59, s12 +; GCN-O0-NEXT: s_mov_b32 s60, s11 +; GCN-O0-NEXT: s_mov_b32 s61, s10 +; GCN-O0-NEXT: s_mov_b32 s62, s9 +; GCN-O0-NEXT: s_mov_b32 s63, s8 +; GCN-O0-NEXT: s_mov_b32 s64, s7 +; GCN-O0-NEXT: s_mov_b32 s65, s6 +; GCN-O0-NEXT: s_mov_b32 s66, s5 +; GCN-O0-NEXT: s_mov_b32 s67, s4 +; GCN-O0-NEXT: v_writelane_b32 v35, s36, 1 +; GCN-O0-NEXT: v_writelane_b32 v35, s37, 2 +; GCN-O0-NEXT: v_writelane_b32 v35, s38, 3 +; GCN-O0-NEXT: v_writelane_b32 v35, s39, 4 +; GCN-O0-NEXT: v_writelane_b32 v35, s40, 5 +; GCN-O0-NEXT: v_writelane_b32 v35, s41, 6 +; GCN-O0-NEXT: v_writelane_b32 v35, s42, 7 +; GCN-O0-NEXT: v_writelane_b32 v35, s43, 8 +; GCN-O0-NEXT: v_writelane_b32 v35, s44, 9 +; GCN-O0-NEXT: v_writelane_b32 v35, s45, 10 +; GCN-O0-NEXT: v_writelane_b32 v35, s46, 11 +; GCN-O0-NEXT: v_writelane_b32 v35, s47, 12 +; GCN-O0-NEXT: v_writelane_b32 v35, s48, 13 +; GCN-O0-NEXT: v_writelane_b32 v35, s49, 14 +; GCN-O0-NEXT: v_writelane_b32 v35, s50, 15 +; GCN-O0-NEXT: v_writelane_b32 v35, s51, 16 +; GCN-O0-NEXT: v_writelane_b32 v35, s52, 17 +; GCN-O0-NEXT: v_writelane_b32 v35, s53, 18 +; GCN-O0-NEXT: v_writelane_b32 v35, s54, 19 +; GCN-O0-NEXT: v_writelane_b32 v35, s55, 20 +; GCN-O0-NEXT: v_writelane_b32 v35, s56, 21 +; GCN-O0-NEXT: v_writelane_b32 v35, s57, 22 +; GCN-O0-NEXT: v_writelane_b32 v35, s58, 23 +; GCN-O0-NEXT: v_writelane_b32 v35, s59, 24 +; GCN-O0-NEXT: v_writelane_b32 v35, s60, 25 +; GCN-O0-NEXT: v_writelane_b32 v35, s61, 26 +; GCN-O0-NEXT: v_writelane_b32 v35, s62, 27 +; GCN-O0-NEXT: v_writelane_b32 v35, s63, 28 +; GCN-O0-NEXT: v_writelane_b32 v35, s64, 29 +; GCN-O0-NEXT: v_writelane_b32 v35, s65, 30 +; GCN-O0-NEXT: v_writelane_b32 v35, s66, 31 +; GCN-O0-NEXT: v_writelane_b32 v35, s67, 32 +; GCN-O0-NEXT: s_mov_b32 s4, 1 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s46 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s47 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s48 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s49 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s50 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s51 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s52 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s54 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s55 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s56 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s57 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s58 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s59 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s60 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s61 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s62 +; 
GCN-O0-NEXT: v_mov_b32_e32 v27, s63 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s64 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s65 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 +; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: v_writelane_b32 v35, s4, 33 +; GCN-O0-NEXT: v_writelane_b32 v35, s5, 34 +; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 +; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 +; GCN-O0-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v35, 35 +; GCN-O0-NEXT: v_readlane_b32 s5, v35, 36 +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v32 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v32 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:140 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v35, s6, 35 +; GCN-O0-NEXT: v_writelane_b32 v35, s7, 36 +; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 +; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB21_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 +; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v35, 33 +; GCN-O0-NEXT: v_readlane_b32 s5, v35, 34 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 +; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s36, v35, 1 +; GCN-O0-NEXT: v_readlane_b32 s37, v35, 2 +; GCN-O0-NEXT: v_readlane_b32 s38, v35, 3 +; GCN-O0-NEXT: v_readlane_b32 s39, v35, 4 +; GCN-O0-NEXT: v_readlane_b32 s40, v35, 5 +; GCN-O0-NEXT: v_readlane_b32 s41, v35, 6 +; GCN-O0-NEXT: v_readlane_b32 s42, v35, 7 +; GCN-O0-NEXT: v_readlane_b32 s43, v35, 8 +; GCN-O0-NEXT: v_readlane_b32 s44, v35, 9 +; GCN-O0-NEXT: v_readlane_b32 s45, v35, 10 +; GCN-O0-NEXT: v_readlane_b32 s46, v35, 11 +; GCN-O0-NEXT: v_readlane_b32 s47, v35, 12 +; GCN-O0-NEXT: v_readlane_b32 s48, v35, 13 +; GCN-O0-NEXT: v_readlane_b32 s49, v35, 14 +; GCN-O0-NEXT: v_readlane_b32 s50, v35, 15 +; GCN-O0-NEXT: v_readlane_b32 s51, v35, 16 +; GCN-O0-NEXT: v_readlane_b32 s52, v35, 17 +; GCN-O0-NEXT: v_readlane_b32 s53, v35, 18 +; GCN-O0-NEXT: v_readlane_b32 s54, v35, 19 +; GCN-O0-NEXT: v_readlane_b32 s55, v35, 20 +; GCN-O0-NEXT: v_readlane_b32 s56, v35, 21 +; GCN-O0-NEXT: v_readlane_b32 s57, v35, 22 +; GCN-O0-NEXT: v_readlane_b32 s58, v35, 23 +; GCN-O0-NEXT: v_readlane_b32 s59, v35, 24 +; GCN-O0-NEXT: v_readlane_b32 s60, v35, 25 +; GCN-O0-NEXT: v_readlane_b32 s61, v35, 26 +; GCN-O0-NEXT: v_readlane_b32 s62, v35, 27 +; GCN-O0-NEXT: v_readlane_b32 s63, v35, 28 +; GCN-O0-NEXT: v_readlane_b32 s64, v35, 29 +; GCN-O0-NEXT: v_readlane_b32 s65, v35, 30 +; GCN-O0-NEXT: v_readlane_b32 s66, v35, 31 +; GCN-O0-NEXT: v_readlane_b32 s67, v35, 32 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s39 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s40 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s41 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s42 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s43 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s44 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s45 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s46 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s47 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s48 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s49 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s50 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s51 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s52 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s53 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s54 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s55 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s56 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s57 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s58 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s59 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s60 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s61 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s62 +; GCN-O0-NEXT: 
v_mov_b32_e32 v27, s63 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s64 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s65 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 +; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: v_writelane_b32 v35, s4, 37 +; GCN-O0-NEXT: v_writelane_b32 v35, s5, 38 +; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 +; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB21_4: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 +; GCN-O0-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v35, 39 +; GCN-O0-NEXT: v_readlane_b32 s5, v35, 40 +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v32 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v32 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 +; GCN-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v35, s6, 39 +; GCN-O0-NEXT: v_writelane_b32 v35, s7, 40 +; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 +; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB21_4 +; GCN-O0-NEXT: ; %bb.5: +; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 +; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v35, 37 +; GCN-O0-NEXT: v_readlane_b32 s5, v35, 38 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.6: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-O0-NEXT: ; implicit-def: $sgpr4 +; GCN-O0-NEXT: ; implicit-def: $sgpr5 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: s_mov_b32 s4, 32 +; GCN-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GCN-O0-NEXT: v_readlane_b32 s67, v34, 15 +; GCN-O0-NEXT: v_readlane_b32 s66, v34, 14 +; GCN-O0-NEXT: v_readlane_b32 s65, v34, 13 +; GCN-O0-NEXT: v_readlane_b32 s64, v34, 12 +; GCN-O0-NEXT: v_readlane_b32 s55, v34, 11 +; GCN-O0-NEXT: v_readlane_b32 s54, v34, 10 +; GCN-O0-NEXT: v_readlane_b32 s53, v34, 9 +; GCN-O0-NEXT: v_readlane_b32 s52, v34, 8 +; GCN-O0-NEXT: v_readlane_b32 s51, v34, 7 +; GCN-O0-NEXT: v_readlane_b32 s50, v34, 6 +; GCN-O0-NEXT: v_readlane_b32 s49, v34, 5 +; GCN-O0-NEXT: v_readlane_b32 s48, v34, 4 +; GCN-O0-NEXT: v_readlane_b32 s39, v34, 3 +; GCN-O0-NEXT: v_readlane_b32 s38, v34, 2 +; GCN-O0-NEXT: v_readlane_b32 s37, v34, 1 +; GCN-O0-NEXT: v_readlane_b32 s36, v34, 0 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <16 x double> , i32 %sel ret double %ext } + +define i32 @extract_dyn_i32_3(<3 x i32> inreg %arg, i32 %idx) { +; GCN-LABEL: extract_dyn_i32_3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_i32_3: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: s_mov_b32 s5, s17 
+; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB22_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3 +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v3 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB22_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <3 x i32> %arg, i32 %idx + ret i32 %x +} + +define i32 @extract_dyn_inreg_i32_3(<3 x i32> inreg %arg, i32 inreg %idx) { +; GCN-LABEL: extract_dyn_inreg_i32_3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s19, 1 +; GCN-NEXT: s_cselect_b32 s4, s17, s16 +; GCN-NEXT: 
s_cmp_eq_u32 s19, 2 +; GCN-NEXT: s_cselect_b32 s4, s18, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_inreg_i32_3: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: s_mov_b32 m0, s19 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <3 x i32> %arg, i32 %idx + ret i32 %x +} + +define float @extract_dyn_float_3(<3 x float> inreg %arg, i32 %idx) { +; GCN-LABEL: extract_dyn_float_3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_float_3: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB24_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3 +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: 
v_readfirstlane_b32 s6, v3 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v3 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB24_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <3 x float> %arg, i32 %idx + ret float %x +} + +define float @extract_dyn_inreg_float_3(<3 x float> inreg %arg, i32 inreg %idx) { +; GCN-LABEL: extract_dyn_inreg_float_3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s19, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s19, 2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_inreg_float_3: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: s_mov_b32 m0, s19 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <3 x float> %arg, i32 %idx + ret float %x +} + +define i32 @extract_dyn_i32_5(<5 x i32> inreg %arg, i32 %idx) { +; GCN-LABEL: extract_dyn_i32_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_i32_5: +; GCN-O0: ; %bb.0: +; 
GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v7, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v7, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB26_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v7, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v7, 3 +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v5 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB26_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 
s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v7, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v7, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <5 x i32> %arg, i32 %idx + ret i32 %x +} + +define i32 @extract_dyn_inreg_i32_5(<5 x i32> inreg %arg, i32 inreg %idx) { +; GCN-LABEL: extract_dyn_inreg_i32_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s21, 1 +; GCN-NEXT: s_cselect_b32 s4, s17, s16 +; GCN-NEXT: s_cmp_eq_u32 s21, 2 +; GCN-NEXT: s_cselect_b32 s4, s18, s4 +; GCN-NEXT: s_cmp_eq_u32 s21, 3 +; GCN-NEXT: s_cselect_b32 s4, s19, s4 +; GCN-NEXT: s_cmp_eq_u32 s21, 4 +; GCN-NEXT: s_cselect_b32 s4, s20, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_inreg_i32_5: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: s_mov_b32 m0, s21 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <5 x i32> %arg, i32 %idx + ret i32 %x +} + +define float @extract_dyn_float_5(<5 x float> inreg %arg, i32 %idx) { +; GCN-LABEL: extract_dyn_float_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_float_5: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v7, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v7, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v7, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v7, 3 +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v5 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB28_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v7, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v7, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 
s[30:31] + %x = extractelement <5 x float> %arg, i32 %idx + ret float %x +} + +define float @extract_dyn_inreg_float_5(<5 x float> inreg %arg, i32 inreg %idx) { +; GCN-LABEL: extract_dyn_inreg_float_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s21, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s21, 2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s21, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s21, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_inreg_float_5: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: s_mov_b32 m0, s21 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <5 x float> %arg, i32 %idx + ret float %x +} + +define i32 @extract_dyn_i32_6(<6 x i32> inreg %arg, i32 %idx) { +; GCN-LABEL: extract_dyn_i32_6: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_i32_6: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: 
v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v8, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v8, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB30_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v8, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v8, 3 +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v8, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v8, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB30_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v8, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v8, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; 
GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] +entry: + %x = extractelement <6 x i32> %arg, i32 %idx + ret i32 %x +} + +define i32 @extract_dyn_inreg_i32_6(<6 x i32> inreg %arg, i32 inreg %idx) { +; GCN-LABEL: extract_dyn_inreg_i32_6: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s22, 1 +; GCN-NEXT: s_cselect_b32 s4, s17, s16 +; GCN-NEXT: s_cmp_eq_u32 s22, 2 +; GCN-NEXT: s_cselect_b32 s4, s18, s4 +; GCN-NEXT: s_cmp_eq_u32 s22, 3 +; GCN-NEXT: s_cselect_b32 s4, s19, s4 +; GCN-NEXT: s_cmp_eq_u32 s22, 4 +; GCN-NEXT: s_cselect_b32 s4, s20, s4 +; GCN-NEXT: s_cmp_eq_u32 s22, 5 +; GCN-NEXT: s_cselect_b32 s4, s21, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_inreg_i32_6: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: s_mov_b32 m0, s22 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] +entry: + %x = extractelement <6 x i32> %arg, i32 %idx + ret i32 %x +} + +define float @extract_dyn_float_6(<6 x float> inreg %arg, i32 %idx) { +; GCN-LABEL: extract_dyn_float_6: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_float_6: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: 
v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v8, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v8, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB32_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v8, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v8, 3 +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v8, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v8, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB32_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v8, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v8, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; 
GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] +entry: + %x = extractelement <6 x float> %arg, i32 %idx + ret float %x +} + +define float @extract_dyn_inreg_float_6(<6 x float> inreg %arg, i32 inreg %idx) { +; GCN-LABEL: extract_dyn_inreg_float_6: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s22, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_inreg_float_6: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: s_mov_b32 m0, s22 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] +entry: + %x = extractelement <6 x float> %arg, i32 %idx + ret float %x +} + +define i32 @extract_dyn_i32_7(<7 x i32> inreg %arg, i32 %idx) { +; GCN-LABEL: extract_dyn_i32_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_i32_7: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 
killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: s_mov_b32 s10, s22 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v9, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB34_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3 +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, 
s[26:27] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB34_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <7 x i32> %arg, i32 %idx + ret i32 %x +} + +define i32 @extract_dyn_inreg_i32_7(<7 x i32> inreg %arg, i32 inreg %idx) { +; GCN-LABEL: extract_dyn_inreg_i32_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s23, 1 +; GCN-NEXT: s_cselect_b32 s4, s17, s16 +; GCN-NEXT: s_cmp_eq_u32 s23, 2 +; GCN-NEXT: s_cselect_b32 s4, s18, s4 +; GCN-NEXT: s_cmp_eq_u32 s23, 3 +; GCN-NEXT: s_cselect_b32 s4, s19, s4 +; GCN-NEXT: s_cmp_eq_u32 s23, 4 +; GCN-NEXT: s_cselect_b32 s4, s20, s4 +; GCN-NEXT: s_cmp_eq_u32 s23, 5 +; GCN-NEXT: s_cselect_b32 s4, s21, s4 +; GCN-NEXT: s_cmp_eq_u32 s23, 6 +; GCN-NEXT: s_cselect_b32 s4, s22, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_inreg_i32_7: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: s_mov_b32 s10, s22 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: s_mov_b32 m0, s23 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <7 x i32> %arg, i32 %idx + ret i32 %x +} + +define float @extract_dyn_float_7(<7 x float> inreg %arg, i32 %idx) { +; GCN-LABEL: extract_dyn_float_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: s_setpc_b64 
s[30:31] +; +; GCN-O0-LABEL: extract_dyn_float_7: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: s_mov_b32 s10, s22 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v9, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB36_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3 +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; 
GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB36_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <7 x float> %arg, i32 %idx + ret float %x +} + +define float @extract_dyn_inreg_float_7(<7 x float> inreg %arg, i32 inreg %idx) { +; GCN-LABEL: extract_dyn_inreg_float_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s23, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s22 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: extract_dyn_inreg_float_7: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: s_mov_b32 s10, s22 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: s_mov_b32 m0, s23 +; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = extractelement <7 x float> %arg, i32 %idx + ret float %x +} diff --git 
a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index e1b4cad370f96..beeeaa32cacfd 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s +; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN-O0 %s define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) { ; GCN-LABEL: float4_inselt: @@ -28,6 +29,25 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float4_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s7 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel store <4 x float> %v, ptr addrspace(1) %out @@ -47,6 +67,24 @@ define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float4_inselt_undef: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s7 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x float> poison, float 1.000000e+00, i32 %sel store <4 x float> %v, ptr addrspace(1) %out @@ -76,6 +114,25 @@ define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i3 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: int4_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s7 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: 
s_endpgm entry: %v = insertelement <4 x i32> %vec, i32 1, i32 %sel store <4 x i32> %v, ptr addrspace(1) %out @@ -100,6 +157,23 @@ define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float2_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel store <2 x float> %v, ptr addrspace(1) %out @@ -133,6 +207,57 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float8_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x64 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v14, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s4 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel store <8 x float> %v, ptr addrspace(1) %out @@ -186,6 +311,105 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float16_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0xa4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v22, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s4 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v19 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; 
GCN-O0-NEXT: v_mov_b32_e32 v0, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel store <16 x float> %v, ptr addrspace(1) %out @@ -280,6 +504,267 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: float32_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s2, s51 +; GCN-O0-NEXT: s_mov_b32 s3, s50 +; GCN-O0-NEXT: s_mov_b32 s6, s49 +; GCN-O0-NEXT: s_mov_b32 s7, s48 +; GCN-O0-NEXT: s_mov_b32 s8, s47 +; GCN-O0-NEXT: s_mov_b32 s9, s46 +; GCN-O0-NEXT: s_mov_b32 s10, s45 +; GCN-O0-NEXT: s_mov_b32 s11, s44 +; GCN-O0-NEXT: s_mov_b32 s12, s43 +; GCN-O0-NEXT: s_mov_b32 s13, s42 +; GCN-O0-NEXT: s_mov_b32 s14, s41 +; GCN-O0-NEXT: s_mov_b32 s15, s40 +; GCN-O0-NEXT: s_mov_b32 s16, s39 +; GCN-O0-NEXT: s_mov_b32 s17, s38 +; GCN-O0-NEXT: s_mov_b32 s18, s37 +; GCN-O0-NEXT: s_mov_b32 s19, s36 +; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s20, s51 +; GCN-O0-NEXT: s_mov_b32 s21, s50 +; GCN-O0-NEXT: s_mov_b32 s22, s49 +; GCN-O0-NEXT: s_mov_b32 s23, s48 +; GCN-O0-NEXT: s_mov_b32 s24, s47 +; GCN-O0-NEXT: s_mov_b32 s25, s46 +; GCN-O0-NEXT: s_mov_b32 s26, s45 +; GCN-O0-NEXT: s_mov_b32 s27, s44 +; GCN-O0-NEXT: s_mov_b32 s28, s43 +; GCN-O0-NEXT: s_mov_b32 s29, s42 +; GCN-O0-NEXT: s_mov_b32 s30, s41 +; GCN-O0-NEXT: s_mov_b32 s31, s40 +; GCN-O0-NEXT: s_mov_b32 s33, s39 +; GCN-O0-NEXT: s_mov_b32 s34, s38 +; GCN-O0-NEXT: s_mov_b32 s35, s37 +; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v62, s35 +; GCN-O0-NEXT: v_mov_b32_e32 v61, s34 +; GCN-O0-NEXT: v_mov_b32_e32 v60, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v59, s31 +; GCN-O0-NEXT: v_mov_b32_e32 v58, s30 +; GCN-O0-NEXT: v_mov_b32_e32 v57, s29 +; GCN-O0-NEXT: v_mov_b32_e32 v56, s28 +; GCN-O0-NEXT: v_mov_b32_e32 v55, s27 +; GCN-O0-NEXT: v_mov_b32_e32 v54, s26 +; GCN-O0-NEXT: v_mov_b32_e32 v53, s25 +; GCN-O0-NEXT: v_mov_b32_e32 v52, s24 +; GCN-O0-NEXT: v_mov_b32_e32 v51, s23 +; GCN-O0-NEXT: v_mov_b32_e32 v50, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v49, s21 +; GCN-O0-NEXT: v_mov_b32_e32 v48, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v47, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v46, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v45, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v44, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v43, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v42, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v41, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v40, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v39, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: 
v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v8, v62 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v61 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v60 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v59 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v58 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v57 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v56 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v55 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v54 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v53 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v52 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v51 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v50 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v49 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v48 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v47 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v46 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v43 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v42 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v40 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v39 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v33, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v35, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v36, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v37, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v38, v0 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x124 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v38 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v37 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v36 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v35 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x60 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v27 +; 
GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v23 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v19 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s2, s0 
+; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel store <32 x float> %v, ptr addrspace(1) %out @@ -305,6 +790,30 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: half4_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-O0-NEXT: s_mov_b32 s7, 0x3c003c00 +; GCN-O0-NEXT: s_mov_b32 s0, s7 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GCN-O0-NEXT: s_mov_b32 s7, 4 +; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xffff +; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GCN-O0-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel store <4 x half> %v, ptr addrspace(1) %out @@ -326,6 +835,26 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: half2_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s1, s[4:5], 0x2c +; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x30 +; GCN-O0-NEXT: s_mov_b32 s0, 0x3c003c00 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_xor_b32 s0, s1, s0 +; GCN-O0-NEXT: s_mov_b32 s5, 4 +; GCN-O0-NEXT: s_lshl_b32 s5, s4, s5 +; GCN-O0-NEXT: s_mov_b32 s4, 0xffff +; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 +; GCN-O0-NEXT: s_and_b32 s0, s0, s4 +; GCN-O0-NEXT: s_xor_b32 s0, s0, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel store <2 x half> %v, ptr addrspace(1) %out @@ -387,6 +916,56 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: half8_inselt: +; GCN-O0: ; %bb.0: ; %entry +; 
GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s14, -1 +; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 +; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN-O0-NEXT: s_mov_b32 s3, 7 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_and_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_lshl_b32 s3, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_or_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 +; GCN-O0-NEXT: buffer_store_short v0, v1, s[12:15], 0 offen +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel store <8 x half> %v, ptr addrspace(1) %out @@ -408,6 +987,26 @@ define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: short2_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s1, s[4:5], 0x2c +; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x30 +; GCN-O0-NEXT: s_mov_b32 s0, 0x10001 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_xor_b32 s0, s1, s0 +; GCN-O0-NEXT: s_mov_b32 s5, 4 +; GCN-O0-NEXT: s_lshl_b32 s5, s4, s5 +; GCN-O0-NEXT: s_mov_b32 s4, 0xffff +; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 +; GCN-O0-NEXT: s_and_b32 s0, s0, s4 +; GCN-O0-NEXT: s_xor_b32 s0, s0, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_dword v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <2 x i16> %vec, i16 1, i32 %sel store <2 x i16> %v, ptr addrspace(1) %out @@ -433,6 +1032,30 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: short4_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; 
GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-O0-NEXT: s_mov_b32 s7, 0x10001 +; GCN-O0-NEXT: s_mov_b32 s0, s7 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GCN-O0-NEXT: s_mov_b32 s7, 4 +; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xffff +; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GCN-O0-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x i16> %vec, i16 1, i32 %sel store <4 x i16> %v, ptr addrspace(1) %out @@ -457,6 +1080,140 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: byte8_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-O0-NEXT: s_mov_b32 s7, 0x1010101 +; GCN-O0-NEXT: s_mov_b32 s0, s7 +; GCN-O0-NEXT: s_mov_b32 s1, s7 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GCN-O0-NEXT: s_mov_b32 s7, 3 +; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xff +; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GCN-O0-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3] +; GCN-O0-NEXT: s_mov_b32 s3, s10 +; GCN-O0-NEXT: s_mov_b32 s0, 8 +; GCN-O0-NEXT: s_lshr_b32 s0, s3, s0 +; GCN-O0-NEXT: s_mov_b32 s1, s10 +; GCN-O0-NEXT: s_mov_b32 s2, 16 +; GCN-O0-NEXT: s_lshr_b32 s2, s3, s2 +; GCN-O0-NEXT: s_mov_b32 s6, 24 +; GCN-O0-NEXT: s_lshr_b32 s3, s3, s6 +; GCN-O0-NEXT: s_mov_b32 s6, 32 +; GCN-O0-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, 40 +; GCN-O0-NEXT: s_lshr_b64 s[8:9], s[10:11], s7 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s8, 48 +; GCN-O0-NEXT: s_lshr_b64 s[8:9], s[10:11], s8 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, 56 +; GCN-O0-NEXT: s_lshr_b64 s[10:11], s[10:11], s9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: s_mov_b64 s[14:15], 7 +; GCN-O0-NEXT: s_mov_b32 s10, s4 +; GCN-O0-NEXT: s_mov_b32 s11, s5 +; GCN-O0-NEXT: s_mov_b32 s13, s14 +; GCN-O0-NEXT: s_mov_b32 s12, s15 +; GCN-O0-NEXT: s_add_u32 s10, s10, s13 +; GCN-O0-NEXT: s_addc_u32 s12, s11, s12 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_mov_b64 s[14:15], 6 +; GCN-O0-NEXT: s_mov_b32 s10, s4 +; GCN-O0-NEXT: s_mov_b32 s9, s5 +; GCN-O0-NEXT: s_mov_b32 s12, s14 +; GCN-O0-NEXT: s_mov_b32 s11, s15 +; GCN-O0-NEXT: s_add_u32 s10, s10, s12 +; GCN-O0-NEXT: s_addc_u32 s9, s9, s11 +; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s11, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 +; GCN-O0-NEXT: 
v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 5 +; GCN-O0-NEXT: s_mov_b32 s8, s4 +; GCN-O0-NEXT: s_mov_b32 s9, s5 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 4 +; GCN-O0-NEXT: s_mov_b32 s8, s4 +; GCN-O0-NEXT: s_mov_b32 s7, s5 +; GCN-O0-NEXT: s_mov_b32 s10, s12 +; GCN-O0-NEXT: s_mov_b32 s9, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s10 +; GCN-O0-NEXT: s_addc_u32 s7, s7, s9 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_mov_b64 s[10:11], 3 +; GCN-O0-NEXT: s_mov_b32 s6, s4 +; GCN-O0-NEXT: s_mov_b32 s7, s5 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: s_mov_b32 s8, s11 +; GCN-O0-NEXT: s_add_u32 s6, s6, s9 +; GCN-O0-NEXT: s_addc_u32 s8, s7, s8 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_mov_b64 s[10:11], 2 +; GCN-O0-NEXT: s_mov_b32 s6, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s8, s10 +; GCN-O0-NEXT: s_mov_b32 s7, s11 +; GCN-O0-NEXT: s_add_u32 s6, s6, s8 +; GCN-O0-NEXT: s_addc_u32 s3, s3, s7 +; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b32 s7, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s2 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 1 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s4, s6 +; GCN-O0-NEXT: s_mov_b32 s3, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s4 +; GCN-O0-NEXT: s_addc_u32 s1, s1, s3 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <8 x i8> %vec, i8 1, i32 %sel store <8 x i8> %v, ptr addrspace(1) %out @@ -558,6 +1315,435 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: byte16_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s14, -1 +; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 +; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 52 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: 
s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 53 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 +; GCN-O0-NEXT: flat_load_ubyte v1, v[1:2] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 54 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 +; GCN-O0-NEXT: flat_load_ubyte v2, v[2:3] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 55 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: flat_load_ubyte v3, v[3:4] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 56 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 +; GCN-O0-NEXT: flat_load_ubyte v4, v[4:5] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 57 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s0 +; GCN-O0-NEXT: flat_load_ubyte v5, v[5:6] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 58 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s0 +; GCN-O0-NEXT: flat_load_ubyte v6, v[6:7] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 59 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s0 +; GCN-O0-NEXT: flat_load_ubyte v7, v[7:8] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 60 +; GCN-O0-NEXT: 
s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s0 +; GCN-O0-NEXT: flat_load_ubyte v8, v[8:9] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 61 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s0 +; GCN-O0-NEXT: flat_load_ubyte v9, v[9:10] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 62 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s0 +; GCN-O0-NEXT: flat_load_ubyte v10, v[10:11] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 63 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s0 +; GCN-O0-NEXT: flat_load_ubyte v11, v[11:12] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s0 +; GCN-O0-NEXT: flat_load_ubyte v12, v[12:13] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x41 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s0 +; GCN-O0-NEXT: flat_load_ubyte v13, v[13:14] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x42 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s0 +; GCN-O0-NEXT: flat_load_ubyte v14, v[14:15] +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x43 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; 
GCN-O0-NEXT: v_mov_b32_e32 v16, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s0 +; GCN-O0-NEXT: flat_load_ubyte v15, v[15:16] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44 +; GCN-O0-NEXT: s_mov_b32 s3, 15 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_and_b32 s3, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_or_b32 s2, s2, s3 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v15, off, s[12:15], 0 offset:15 +; GCN-O0-NEXT: buffer_store_byte v14, off, s[12:15], 0 offset:14 +; GCN-O0-NEXT: buffer_store_byte v13, off, s[12:15], 0 offset:13 +; GCN-O0-NEXT: buffer_store_byte v12, off, s[12:15], 0 offset:12 +; GCN-O0-NEXT: buffer_store_byte v11, off, s[12:15], 0 offset:11 +; GCN-O0-NEXT: buffer_store_byte v10, off, s[12:15], 0 offset:10 +; GCN-O0-NEXT: buffer_store_byte v9, off, s[12:15], 0 offset:9 +; GCN-O0-NEXT: buffer_store_byte v8, off, s[12:15], 0 offset:8 +; GCN-O0-NEXT: buffer_store_byte v7, off, s[12:15], 0 offset:7 +; GCN-O0-NEXT: buffer_store_byte v6, off, s[12:15], 0 offset:6 +; GCN-O0-NEXT: buffer_store_byte v5, off, s[12:15], 0 offset:5 +; GCN-O0-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:4 +; GCN-O0-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:3 +; GCN-O0-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:1 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 +; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen +; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 +; GCN-O0-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:1 +; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: buffer_load_ubyte v5, off, s[12:15], 0 offset:3 +; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[12:15], 0 offset:4 +; GCN-O0-NEXT: buffer_load_ubyte v7, off, s[12:15], 0 offset:5 +; GCN-O0-NEXT: buffer_load_ubyte v8, off, s[12:15], 0 offset:6 +; GCN-O0-NEXT: buffer_load_ubyte v9, off, s[12:15], 0 offset:7 +; GCN-O0-NEXT: buffer_load_ubyte v10, off, s[12:15], 0 offset:8 +; GCN-O0-NEXT: buffer_load_ubyte v11, off, s[12:15], 0 offset:9 +; GCN-O0-NEXT: buffer_load_ubyte v12, off, s[12:15], 0 offset:10 +; GCN-O0-NEXT: buffer_load_ubyte v13, off, s[12:15], 0 offset:11 +; GCN-O0-NEXT: buffer_load_ubyte v14, off, s[12:15], 0 offset:12 +; GCN-O0-NEXT: buffer_load_ubyte v15, off, s[12:15], 0 offset:13 +; GCN-O0-NEXT: buffer_load_ubyte v16, off, s[12:15], 0 offset:14 +; GCN-O0-NEXT: buffer_load_ubyte v17, off, s[12:15], 0 offset:15 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 15 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: flat_store_byte v[0:1], v17 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 14 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: 
flat_store_byte v[0:1], v16 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 13 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v15 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 12 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v14 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 11 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v13 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 10 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v12 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 9 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v11 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 8 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v10 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 7 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v9 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 6 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def 
$sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v8 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 5 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 4 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v6 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 3 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v5 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 2 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v4 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 1 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_byte v[0:1], v3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <16 x i8> %vec, i8 1, i32 %sel store <16 x i8> %v, ptr addrspace(1) %out @@ -585,6 +1771,32 @@ define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %v ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double2_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v2, 
v0 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v3, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel store <2 x double> %v, ptr addrspace(1) %out @@ -639,6 +1851,129 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double5_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x84 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s10, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s0 +; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s12, s27 +; GCN-O0-NEXT: s_mov_b32 s13, s26 +; GCN-O0-NEXT: s_mov_b32 s14, s25 +; GCN-O0-NEXT: s_mov_b32 s15, s24 +; GCN-O0-NEXT: s_mov_b32 s16, s23 +; GCN-O0-NEXT: s_mov_b32 s17, s22 +; GCN-O0-NEXT: s_mov_b32 s18, s21 +; GCN-O0-NEXT: s_mov_b32 s19, s20 +; GCN-O0-NEXT: ; implicit-def: $sgpr9 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr8 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr7 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr6 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr1 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr20 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v27 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v8, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v19 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v0 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4 +; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 +; GCN-O0-NEXT: s_mov_b32 m0, s0 +; GCN-O0-NEXT: v_movreld_b32_e32 v1, v0 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 +; GCN-O0-NEXT: s_mov_b32 m0, s0 +; GCN-O0-NEXT: 
v_movreld_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v5 +; GCN-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23_vgpr24_vgpr25 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v23, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v20 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[20:21], v[22:25] +; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v20, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s2 +; GCN-O0-NEXT: flat_store_dwordx4 v[17:18], v[19:22] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v9 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 32 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s2, s5 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel store <5 x double> %v, ptr addrspace(1) %out @@ -694,6 +2029,112 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double8_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0xa4 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v13, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v14, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v15, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v19, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v20, s21 +; GCN-O0-NEXT: v_mov_b32_e32 v21, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v22, s23 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 
v0, s3 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v19 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel store <8 x double> %v, ptr addrspace(1) %out @@ -747,6 +2188,147 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double7_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x94 +; GCN-O0-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s6, s1 +; GCN-O0-NEXT: s_mov_b32 s7, s0 +; GCN-O0-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x84 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s8, s15 +; GCN-O0-NEXT: s_mov_b32 s9, s14 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x64 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s12, s27 +; GCN-O0-NEXT: s_mov_b32 s13, s26 +; GCN-O0-NEXT: s_mov_b32 s14, s25 +; GCN-O0-NEXT: s_mov_b32 s15, s24 +; GCN-O0-NEXT: s_mov_b32 s16, s23 +; GCN-O0-NEXT: s_mov_b32 s17, s22 +; GCN-O0-NEXT: s_mov_b32 s18, s21 +; GCN-O0-NEXT: s_mov_b32 s19, s20 +; GCN-O0-NEXT: ; implicit-def: $sgpr1 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr20 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v30, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v29, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v28, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v27, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v23, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v8, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v27 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v0 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4 +; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 +; GCN-O0-NEXT: s_mov_b32 m0, s0 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 +; GCN-O0-NEXT: s_mov_b32 m0, s0 +; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v11 +; GCN-O0-NEXT: ; kill: def $vgpr26 killed $vgpr26 def $vgpr26_vgpr27_vgpr28_vgpr29 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v27, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v24 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 
s0, s0, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v24, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[24:25], v[26:29] +; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v24, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s2 +; GCN-O0-NEXT: flat_store_dwordx4 v[3:4], v[23:26] +; GCN-O0-NEXT: v_mov_b32_e32 v3, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v19 +; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v8, v3 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8] +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 32 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s2, s5 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel store <7 x double> %v, ptr addrspace(1) %out @@ -844,6 +2426,275 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double16_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s2, s51 +; GCN-O0-NEXT: s_mov_b32 s3, s50 +; GCN-O0-NEXT: s_mov_b32 s6, s49 +; GCN-O0-NEXT: s_mov_b32 s7, s48 +; GCN-O0-NEXT: s_mov_b32 s8, s47 +; GCN-O0-NEXT: s_mov_b32 s9, s46 +; GCN-O0-NEXT: s_mov_b32 s10, s45 +; GCN-O0-NEXT: s_mov_b32 s11, s44 +; GCN-O0-NEXT: s_mov_b32 s12, s43 +; GCN-O0-NEXT: s_mov_b32 s13, s42 +; GCN-O0-NEXT: s_mov_b32 s14, s41 +; GCN-O0-NEXT: s_mov_b32 s15, s40 +; GCN-O0-NEXT: s_mov_b32 s16, s39 +; GCN-O0-NEXT: s_mov_b32 s17, s38 +; GCN-O0-NEXT: s_mov_b32 s18, s37 +; GCN-O0-NEXT: s_mov_b32 s19, s36 +; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s20, s51 +; GCN-O0-NEXT: s_mov_b32 s21, s50 +; GCN-O0-NEXT: s_mov_b32 s22, s49 +; GCN-O0-NEXT: s_mov_b32 s23, s48 +; GCN-O0-NEXT: s_mov_b32 s24, s47 +; GCN-O0-NEXT: s_mov_b32 s25, s46 +; GCN-O0-NEXT: s_mov_b32 s26, s45 +; GCN-O0-NEXT: s_mov_b32 s27, s44 +; GCN-O0-NEXT: s_mov_b32 s28, s43 +; GCN-O0-NEXT: s_mov_b32 s29, s42 +; GCN-O0-NEXT: s_mov_b32 s30, s41 +; GCN-O0-NEXT: 
s_mov_b32 s31, s40 +; GCN-O0-NEXT: s_mov_b32 s33, s39 +; GCN-O0-NEXT: s_mov_b32 s34, s38 +; GCN-O0-NEXT: s_mov_b32 s35, s37 +; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v62, s35 +; GCN-O0-NEXT: v_mov_b32_e32 v61, s34 +; GCN-O0-NEXT: v_mov_b32_e32 v60, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v59, s31 +; GCN-O0-NEXT: v_mov_b32_e32 v58, s30 +; GCN-O0-NEXT: v_mov_b32_e32 v57, s29 +; GCN-O0-NEXT: v_mov_b32_e32 v56, s28 +; GCN-O0-NEXT: v_mov_b32_e32 v55, s27 +; GCN-O0-NEXT: v_mov_b32_e32 v54, s26 +; GCN-O0-NEXT: v_mov_b32_e32 v53, s25 +; GCN-O0-NEXT: v_mov_b32_e32 v52, s24 +; GCN-O0-NEXT: v_mov_b32_e32 v51, s23 +; GCN-O0-NEXT: v_mov_b32_e32 v50, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v49, s21 +; GCN-O0-NEXT: v_mov_b32_e32 v48, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v47, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v46, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v45, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v44, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v43, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v42, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v41, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v40, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v39, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v8, v62 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v61 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v60 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v59 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v58 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v57 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v56 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v55 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v54 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v53 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v52 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v51 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v50 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v49 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v48 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v47 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v46 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v43 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v42 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v40 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v39 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v33, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v35, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v36, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v37, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v38, v0 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x124 +; GCN-O0-NEXT: s_mov_b32 s3, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v38 +; GCN-O0-NEXT: 
v_mov_b32_e32 v1, v37 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v36 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v35 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x60 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v27 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v23 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v19 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 
+; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s2, s0 +; GCN-O0-NEXT: s_mov_b32 s3, s1 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s2, s2, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel store <16 x double> %v, ptr addrspace(1) %out @@ -939,6 +2790,277 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: double15_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x114 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s6, s1 +; GCN-O0-NEXT: s_mov_b32 s7, s0 +; GCN-O0-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x104 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s8, s15 +; GCN-O0-NEXT: s_mov_b32 s9, s14 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0xe4 +; GCN-O0-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s12, s27 +; GCN-O0-NEXT: s_mov_b32 s13, s26 +; GCN-O0-NEXT: s_mov_b32 s14, s25 +; GCN-O0-NEXT: s_mov_b32 s15, s24 +; GCN-O0-NEXT: s_mov_b32 s16, s23 +; GCN-O0-NEXT: s_mov_b32 s17, s22 +; GCN-O0-NEXT: s_mov_b32 s18, s21 +; GCN-O0-NEXT: s_mov_b32 s19, s20 +; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s20, s51 +; GCN-O0-NEXT: s_mov_b32 s21, s50 +; GCN-O0-NEXT: s_mov_b32 s22, s49 +; GCN-O0-NEXT: s_mov_b32 s23, s48 +; GCN-O0-NEXT: s_mov_b32 s24, s47 +; GCN-O0-NEXT: s_mov_b32 s25, s46 +; GCN-O0-NEXT: s_mov_b32 s26, s45 +; GCN-O0-NEXT: s_mov_b32 s27, s44 +; GCN-O0-NEXT: s_mov_b32 s28, s43 +; GCN-O0-NEXT: s_mov_b32 s29, s42 +; GCN-O0-NEXT: s_mov_b32 s30, s41 +; GCN-O0-NEXT: s_mov_b32 s31, s40 +; GCN-O0-NEXT: s_mov_b32 s33, s39 +; GCN-O0-NEXT: s_mov_b32 s34, s38 +; GCN-O0-NEXT: s_mov_b32 s35, s37 +; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 +; GCN-O0-NEXT: ; implicit-def: $sgpr1 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr0 +; GCN-O0-NEXT: ; implicit-def: $sgpr37 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s36 +; GCN-O0-NEXT: v_mov_b32_e32 v62, s35 +; GCN-O0-NEXT: v_mov_b32_e32 v61, s34 +; GCN-O0-NEXT: v_mov_b32_e32 v60, s33 +; GCN-O0-NEXT: v_mov_b32_e32 v59, s31 +; GCN-O0-NEXT: v_mov_b32_e32 v58, s30 +; GCN-O0-NEXT: v_mov_b32_e32 v57, s29 +; GCN-O0-NEXT: v_mov_b32_e32 v56, s28 +; GCN-O0-NEXT: v_mov_b32_e32 v55, s27 +; GCN-O0-NEXT: v_mov_b32_e32 v54, s26 +; GCN-O0-NEXT: v_mov_b32_e32 v53, s25 +; GCN-O0-NEXT: v_mov_b32_e32 v52, s24 +; GCN-O0-NEXT: v_mov_b32_e32 v51, s23 +; GCN-O0-NEXT: v_mov_b32_e32 v50, s22 +; GCN-O0-NEXT: v_mov_b32_e32 v49, s21 +; GCN-O0-NEXT: v_mov_b32_e32 v48, s20 +; GCN-O0-NEXT: v_mov_b32_e32 v47, s19 +; GCN-O0-NEXT: v_mov_b32_e32 v46, s18 +; GCN-O0-NEXT: v_mov_b32_e32 v45, s17 +; GCN-O0-NEXT: v_mov_b32_e32 v44, s16 +; GCN-O0-NEXT: v_mov_b32_e32 v43, s15 +; GCN-O0-NEXT: v_mov_b32_e32 v42, s14 +; GCN-O0-NEXT: v_mov_b32_e32 v41, s13 +; GCN-O0-NEXT: v_mov_b32_e32 v40, s12 +; GCN-O0-NEXT: v_mov_b32_e32 v39, s11 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v8, v62 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v61 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v60 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v59 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v58 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v57 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v56 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v55 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v54 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v53 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v52 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v51 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v50 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v49 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v48 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v47 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v46 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v43 +; 
GCN-O0-NEXT: v_mov_b32_e32 v28, v42 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v40 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v39 +; GCN-O0-NEXT: v_mov_b32_e32 v32, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v33, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v35, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v36, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v37, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v38, v0 +; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x124 +; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 +; GCN-O0-NEXT: s_mov_b32 m0, s0 +; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 +; GCN-O0-NEXT: s_mov_b32 m0, s0 +; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v39, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v40, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v41, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v46, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v42, v27 +; GCN-O0-NEXT: v_mov_b32_e32 v43, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v44, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v45, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v47, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v48, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v49, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v54, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v50, v11 +; GCN-O0-NEXT: v_mov_b32_e32 v51, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v52, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v53, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v55, v15 +; GCN-O0-NEXT: v_mov_b32_e32 v56, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v57, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v62, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v58, v19 +; GCN-O0-NEXT: ; kill: def $vgpr58 killed $vgpr58 def $vgpr58_vgpr59_vgpr60_vgpr61 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v59, v62 +; GCN-O0-NEXT: v_mov_b32_e32 v60, v57 +; GCN-O0-NEXT: v_mov_b32_e32 v61, v56 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v57, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v56, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[56:57], v[58:61] +; GCN-O0-NEXT: ; kill: def $vgpr55 killed $vgpr55 def $vgpr55_vgpr56_vgpr57_vgpr58 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v56, v53 +; GCN-O0-NEXT: v_mov_b32_e32 v57, v52 +; GCN-O0-NEXT: v_mov_b32_e32 v58, v51 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v52, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v51, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[51:52], v[55:58] +; GCN-O0-NEXT: ; kill: def $vgpr50 killed $vgpr50 def $vgpr50_vgpr51_vgpr52_vgpr53 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v51, v54 +; GCN-O0-NEXT: v_mov_b32_e32 v52, v49 +; GCN-O0-NEXT: 
v_mov_b32_e32 v53, v48 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v49, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v48, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[48:49], v[50:53] +; GCN-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 def $vgpr47_vgpr48_vgpr49_vgpr50 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v48, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v49, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v50, v43 +; GCN-O0-NEXT: v_mov_b32_e32 v44, s3 +; GCN-O0-NEXT: v_mov_b32_e32 v43, s2 +; GCN-O0-NEXT: flat_store_dwordx4 v[43:44], v[47:50] +; GCN-O0-NEXT: ; kill: def $vgpr42 killed $vgpr42 def $vgpr42_vgpr43_vgpr44_vgpr45 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v43, v46 +; GCN-O0-NEXT: v_mov_b32_e32 v44, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v45, v40 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v41, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v40, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[40:41], v[42:45] +; GCN-O0-NEXT: ; kill: def $vgpr39 killed $vgpr39 def $vgpr39_vgpr40_vgpr41_vgpr42 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v40, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v41, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v42, v3 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: flat_store_dwordx4 v[3:4], v[39:42] +; GCN-O0-NEXT: v_mov_b32_e32 v3, v36 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v35 +; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v8, v3 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: s_mov_b32 s4, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s5 +; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 +; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8] +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 +; GCN-O0-NEXT: s_mov_b64 s[4:5], 0x60 +; GCN-O0-NEXT: s_mov_b32 s0, s2 +; GCN-O0-NEXT: s_mov_b32 s1, s3 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s2, s5 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; 
GCN-O0-NEXT: s_endpgm entry: %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel store <15 x double> %v, ptr addrspace(1) %out @@ -994,6 +3116,63 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: bit4_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s14, -1 +; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 +; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-O0-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x30 +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_bfe_u32 s3, s4, 0x10001 +; GCN-O0-NEXT: s_bfe_u32 s5, s4, 0x20002 +; GCN-O0-NEXT: s_bfe_u32 s6, s4, 0x10003 +; GCN-O0-NEXT: s_mov_b32 s7, 3 +; GCN-O0-NEXT: s_and_b32 s7, s2, s7 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_or_b32 s2, s2, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s5 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 +; GCN-O0-NEXT: v_mov_b32_e32 v3, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: buffer_store_byte v3, v0, s[12:15], 0 offen +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 +; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[12:15], 0 offset:1 +; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 +; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:3 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_and_b32_e64 v4, v4, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v4, v3, v4 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v4 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: s_mov_b32 s2, 2 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: s_mov_b32 s2, 3 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: s_mov_b32 s2, 15 +; GCN-O0-NEXT: v_and_b32_e64 v2, v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x i1> %vec, i1 1, i32 %sel store <4 x i1> %v, ptr addrspace(1) %out @@ -1836,6 +4015,1599 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GCN-O0-LABEL: bit128_inselt: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s18, -1 +; GCN-O0-NEXT: s_mov_b32 s19, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s16, s16, s11 +; GCN-O0-NEXT: s_addc_u32 s17, s17, 0 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 52 +; GCN-O0-NEXT: s_mov_b32 s0, s4 +; GCN-O0-NEXT: s_mov_b32 s1, s5 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: s_mov_b32 s2, s7 +; GCN-O0-NEXT: s_add_u32 s0, s0, s3 +; 
GCN-O0-NEXT: s_addc_u32 s2, s1, s2 +; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; GCN-O0-NEXT: s_mov_b32 s1, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_mov_b32 s1, 1 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:388 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:648 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v2, v0, 2, 1 +; GCN-O0-NEXT: v_bfe_u32 v3, v0, 3, 1 +; GCN-O0-NEXT: v_bfe_u32 v4, v0, 4, 1 +; GCN-O0-NEXT: v_bfe_u32 v5, v0, 5, 1 +; GCN-O0-NEXT: v_bfe_u32 v6, v0, 6, 1 +; GCN-O0-NEXT: s_mov_b32 s0, 7 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v7, s0, v0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 53 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v8, v0, s1 +; GCN-O0-NEXT: v_bfe_u32 v9, v0, 1, 1 +; GCN-O0-NEXT: v_bfe_u32 v10, v0, 2, 1 +; GCN-O0-NEXT: v_bfe_u32 v11, v0, 3, 1 +; GCN-O0-NEXT: v_bfe_u32 v12, v0, 4, 1 +; GCN-O0-NEXT: v_bfe_u32 v13, v0, 5, 1 +; GCN-O0-NEXT: v_bfe_u32 v14, v0, 6, 1 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v15, s0, v0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 54 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v16, v0, s1 +; GCN-O0-NEXT: v_bfe_u32 v17, v0, 1, 1 +; GCN-O0-NEXT: v_bfe_u32 v18, v0, 2, 1 +; GCN-O0-NEXT: v_bfe_u32 v19, v0, 3, 1 +; GCN-O0-NEXT: v_bfe_u32 v20, v0, 4, 1 +; GCN-O0-NEXT: v_bfe_u32 v21, v0, 5, 1 +; GCN-O0-NEXT: v_bfe_u32 v22, v0, 6, 1 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v23, s0, v0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 55 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v24, v0, s1 +; GCN-O0-NEXT: v_bfe_u32 v25, v0, 1, 1 +; GCN-O0-NEXT: v_bfe_u32 v26, v0, 2, 1 +; GCN-O0-NEXT: v_bfe_u32 v27, v0, 3, 1 +; GCN-O0-NEXT: v_bfe_u32 v28, v0, 4, 1 +; GCN-O0-NEXT: v_bfe_u32 v29, v0, 5, 1 +; GCN-O0-NEXT: v_bfe_u32 v30, v0, 6, 1 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v31, s0, v0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 56 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, 
s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v32, v0, s1 +; GCN-O0-NEXT: v_bfe_u32 v33, v0, 1, 1 +; GCN-O0-NEXT: v_bfe_u32 v34, v0, 2, 1 +; GCN-O0-NEXT: v_bfe_u32 v35, v0, 3, 1 +; GCN-O0-NEXT: v_bfe_u32 v36, v0, 4, 1 +; GCN-O0-NEXT: v_bfe_u32 v37, v0, 5, 1 +; GCN-O0-NEXT: v_bfe_u32 v38, v0, 6, 1 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v39, s0, v0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 57 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v40, v0, s1 +; GCN-O0-NEXT: v_bfe_u32 v41, v0, 1, 1 +; GCN-O0-NEXT: v_bfe_u32 v42, v0, 2, 1 +; GCN-O0-NEXT: v_bfe_u32 v43, v0, 3, 1 +; GCN-O0-NEXT: v_bfe_u32 v44, v0, 4, 1 +; GCN-O0-NEXT: v_bfe_u32 v45, v0, 5, 1 +; GCN-O0-NEXT: v_bfe_u32 v46, v0, 6, 1 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v47, s0, v0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 58 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v48, v0, s1 +; GCN-O0-NEXT: v_bfe_u32 v49, v0, 1, 1 +; GCN-O0-NEXT: v_bfe_u32 v50, v0, 2, 1 +; GCN-O0-NEXT: v_bfe_u32 v51, v0, 3, 1 +; GCN-O0-NEXT: v_bfe_u32 v52, v0, 4, 1 +; GCN-O0-NEXT: v_bfe_u32 v53, v0, 5, 1 +; GCN-O0-NEXT: v_bfe_u32 v54, v0, 6, 1 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v55, s0, v0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 59 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v56, v0, s1 +; GCN-O0-NEXT: v_bfe_u32 v57, v0, 1, 1 +; GCN-O0-NEXT: v_bfe_u32 v58, v0, 2, 1 +; GCN-O0-NEXT: v_bfe_u32 v59, v0, 3, 1 +; GCN-O0-NEXT: v_bfe_u32 v60, v0, 4, 1 +; GCN-O0-NEXT: v_bfe_u32 v61, v0, 5, 1 +; GCN-O0-NEXT: v_bfe_u32 v62, v0, 6, 1 +; GCN-O0-NEXT: v_lshrrev_b32_e64 v63, s0, v0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], 60 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; 
GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:392 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:396 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:400 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:404 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:408 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:412 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:416 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:420 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[8:9], 61 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:424 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:428 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:432 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:436 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:440 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:444 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:448 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:452 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[8:9], 62 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:456 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:460 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:464 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:468 ; 4-byte 
Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:472 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:476 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:480 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:484 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[8:9], 63 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:488 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:492 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:496 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:500 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:504 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:508 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:512 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:516 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[8:9], 64 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:520 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:524 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:528 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:532 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:536 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:540 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:544 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 
offset:548 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x41 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:552 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:556 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:560 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:564 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:568 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:572 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:576 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:580 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x42 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:584 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:588 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:592 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:596 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:600 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:604 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:608 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:612 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x43 +; GCN-O0-NEXT: s_mov_b32 s2, s4 +; GCN-O0-NEXT: s_mov_b32 s3, s5 +; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_mov_b32 s6, s9 +; GCN-O0-NEXT: s_add_u32 s2, s2, s7 +; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 +; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] 
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:648 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, s1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:616 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_bfe_u32 v0, v0, 1, 1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:620 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_bfe_u32 v0, v0, 2, 1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:624 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_bfe_u32 v0, v0, 3, 1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:628 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_bfe_u32 v0, v0, 4, 1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:632 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_bfe_u32 v0, v0, 5, 1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:636 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_bfe_u32 v0, v0, 6, 1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:640 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44 +; GCN-O0-NEXT: s_mov_b32 s3, 0x7f +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_and_b32 s3, s2, s3 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: s_add_i32 s2, s2, s3 +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:127 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:640 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:126 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:636 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:125 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:632 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:124 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:628 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:123 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:624 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:122 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:620 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:121 +; GCN-O0-NEXT: buffer_load_dword v0, off, 
s[16:19], 0 offset:616 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:120 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:612 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:119 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:608 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:118 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:604 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:117 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:600 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:116 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:596 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:115 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:592 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:114 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:588 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:113 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:584 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:112 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:580 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:111 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:576 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:110 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:572 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:109 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:568 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:108 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:564 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:107 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:560 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:106 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:556 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:105 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:552 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:104 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:548 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:103 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:544 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 
0 offset:102 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:540 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:101 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:536 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:100 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:532 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:99 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:528 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:98 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:524 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:97 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:520 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:96 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:516 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:95 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:512 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:94 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:508 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:93 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:504 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:92 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:500 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:91 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:496 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:90 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:492 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:89 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:488 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:88 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:484 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:87 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:480 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:86 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:476 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:85 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:472 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:84 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:468 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; 
GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:83 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:464 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:82 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:460 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:81 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:456 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:80 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:452 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:79 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:448 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:78 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:444 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:77 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:440 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:76 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:436 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:75 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:432 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:74 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:428 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:73 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:424 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:72 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:420 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:71 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:416 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:70 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:412 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:69 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:408 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:68 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:404 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:67 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:400 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:66 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:396 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:65 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:392 ; 4-byte Folded Reload 
+; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:64 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:388 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_store_byte v63, off, s[16:19], 0 offset:63 +; GCN-O0-NEXT: buffer_store_byte v62, off, s[16:19], 0 offset:62 +; GCN-O0-NEXT: buffer_store_byte v61, off, s[16:19], 0 offset:61 +; GCN-O0-NEXT: buffer_store_byte v60, off, s[16:19], 0 offset:60 +; GCN-O0-NEXT: buffer_store_byte v59, off, s[16:19], 0 offset:59 +; GCN-O0-NEXT: buffer_store_byte v58, off, s[16:19], 0 offset:58 +; GCN-O0-NEXT: buffer_store_byte v57, off, s[16:19], 0 offset:57 +; GCN-O0-NEXT: buffer_store_byte v56, off, s[16:19], 0 offset:56 +; GCN-O0-NEXT: buffer_store_byte v55, off, s[16:19], 0 offset:55 +; GCN-O0-NEXT: buffer_store_byte v54, off, s[16:19], 0 offset:54 +; GCN-O0-NEXT: buffer_store_byte v53, off, s[16:19], 0 offset:53 +; GCN-O0-NEXT: buffer_store_byte v52, off, s[16:19], 0 offset:52 +; GCN-O0-NEXT: buffer_store_byte v51, off, s[16:19], 0 offset:51 +; GCN-O0-NEXT: buffer_store_byte v50, off, s[16:19], 0 offset:50 +; GCN-O0-NEXT: buffer_store_byte v49, off, s[16:19], 0 offset:49 +; GCN-O0-NEXT: buffer_store_byte v48, off, s[16:19], 0 offset:48 +; GCN-O0-NEXT: buffer_store_byte v47, off, s[16:19], 0 offset:47 +; GCN-O0-NEXT: buffer_store_byte v46, off, s[16:19], 0 offset:46 +; GCN-O0-NEXT: buffer_store_byte v45, off, s[16:19], 0 offset:45 +; GCN-O0-NEXT: buffer_store_byte v44, off, s[16:19], 0 offset:44 +; GCN-O0-NEXT: buffer_store_byte v43, off, s[16:19], 0 offset:43 +; GCN-O0-NEXT: buffer_store_byte v42, off, s[16:19], 0 offset:42 +; GCN-O0-NEXT: buffer_store_byte v41, off, s[16:19], 0 offset:41 +; GCN-O0-NEXT: buffer_store_byte v40, off, s[16:19], 0 offset:40 +; GCN-O0-NEXT: buffer_store_byte v39, off, s[16:19], 0 offset:39 +; GCN-O0-NEXT: buffer_store_byte v38, off, s[16:19], 0 offset:38 +; GCN-O0-NEXT: buffer_store_byte v37, off, s[16:19], 0 offset:37 +; GCN-O0-NEXT: buffer_store_byte v36, off, s[16:19], 0 offset:36 +; GCN-O0-NEXT: buffer_store_byte v35, off, s[16:19], 0 offset:35 +; GCN-O0-NEXT: buffer_store_byte v34, off, s[16:19], 0 offset:34 +; GCN-O0-NEXT: buffer_store_byte v33, off, s[16:19], 0 offset:33 +; GCN-O0-NEXT: buffer_store_byte v32, off, s[16:19], 0 offset:32 +; GCN-O0-NEXT: buffer_store_byte v31, off, s[16:19], 0 offset:31 +; GCN-O0-NEXT: buffer_store_byte v30, off, s[16:19], 0 offset:30 +; GCN-O0-NEXT: buffer_store_byte v29, off, s[16:19], 0 offset:29 +; GCN-O0-NEXT: buffer_store_byte v28, off, s[16:19], 0 offset:28 +; GCN-O0-NEXT: buffer_store_byte v27, off, s[16:19], 0 offset:27 +; GCN-O0-NEXT: buffer_store_byte v26, off, s[16:19], 0 offset:26 +; GCN-O0-NEXT: buffer_store_byte v25, off, s[16:19], 0 offset:25 +; GCN-O0-NEXT: buffer_store_byte v24, off, s[16:19], 0 offset:24 +; GCN-O0-NEXT: buffer_store_byte v23, off, s[16:19], 0 offset:23 +; GCN-O0-NEXT: buffer_store_byte v22, off, s[16:19], 0 offset:22 +; GCN-O0-NEXT: buffer_store_byte v21, off, s[16:19], 0 offset:21 +; GCN-O0-NEXT: buffer_store_byte v20, off, s[16:19], 0 offset:20 +; GCN-O0-NEXT: buffer_store_byte v19, off, s[16:19], 0 offset:19 +; GCN-O0-NEXT: buffer_store_byte v18, off, s[16:19], 0 offset:18 +; GCN-O0-NEXT: buffer_store_byte v17, off, s[16:19], 0 offset:17 +; GCN-O0-NEXT: buffer_store_byte v16, off, s[16:19], 0 offset:16 +; GCN-O0-NEXT: buffer_store_byte v15, off, s[16:19], 0 offset:15 +; GCN-O0-NEXT: buffer_store_byte v14, off, s[16:19], 0 offset:14 +; GCN-O0-NEXT: buffer_store_byte v13, off, s[16:19], 0 offset:13 
+; GCN-O0-NEXT: buffer_store_byte v12, off, s[16:19], 0 offset:12 +; GCN-O0-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11 +; GCN-O0-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10 +; GCN-O0-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9 +; GCN-O0-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8 +; GCN-O0-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7 +; GCN-O0-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6 +; GCN-O0-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5 +; GCN-O0-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4 +; GCN-O0-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3 +; GCN-O0-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2 +; GCN-O0-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1 +; GCN-O0-NEXT: s_waitcnt vmcnt(14) +; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 +; GCN-O0-NEXT: v_mov_b32_e32 v3, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 +; GCN-O0-NEXT: buffer_store_byte v3, v0, s[16:19], 0 offen +; GCN-O0-NEXT: buffer_load_ubyte v18, off, s[16:19], 0 offset:23 +; GCN-O0-NEXT: buffer_load_ubyte v19, off, s[16:19], 0 offset:22 +; GCN-O0-NEXT: buffer_load_ubyte v20, off, s[16:19], 0 offset:21 +; GCN-O0-NEXT: buffer_load_ubyte v21, off, s[16:19], 0 offset:20 +; GCN-O0-NEXT: buffer_load_ubyte v22, off, s[16:19], 0 offset:19 +; GCN-O0-NEXT: buffer_load_ubyte v23, off, s[16:19], 0 offset:18 +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:1 +; GCN-O0-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:2 +; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:3 +; GCN-O0-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:4 +; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:5 +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:6 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:7 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:8 +; GCN-O0-NEXT: buffer_load_ubyte v16, off, s[16:19], 0 offset:9 +; GCN-O0-NEXT: buffer_load_ubyte v15, off, s[16:19], 0 offset:10 +; GCN-O0-NEXT: buffer_load_ubyte v14, off, s[16:19], 0 offset:11 +; GCN-O0-NEXT: buffer_load_ubyte v13, off, s[16:19], 0 offset:12 +; GCN-O0-NEXT: buffer_load_ubyte v12, off, s[16:19], 0 offset:13 +; GCN-O0-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:14 +; GCN-O0-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:15 +; GCN-O0-NEXT: buffer_load_ubyte v17, off, s[16:19], 0 offset:16 +; GCN-O0-NEXT: buffer_load_ubyte v24, off, s[16:19], 0 offset:17 +; GCN-O0-NEXT: buffer_load_ubyte v26, off, s[16:19], 0 offset:31 +; GCN-O0-NEXT: buffer_load_ubyte v27, off, s[16:19], 0 offset:30 +; GCN-O0-NEXT: buffer_load_ubyte v28, off, s[16:19], 0 offset:29 +; GCN-O0-NEXT: buffer_load_ubyte v29, off, s[16:19], 0 offset:28 +; GCN-O0-NEXT: buffer_load_ubyte v30, off, s[16:19], 0 offset:27 +; GCN-O0-NEXT: buffer_load_ubyte v31, off, s[16:19], 0 offset:26 +; GCN-O0-NEXT: buffer_load_ubyte v32, off, s[16:19], 0 offset:25 +; GCN-O0-NEXT: buffer_load_ubyte v25, off, s[16:19], 0 offset:24 +; GCN-O0-NEXT: buffer_load_ubyte v34, off, s[16:19], 0 offset:39 +; GCN-O0-NEXT: buffer_load_ubyte 
v35, off, s[16:19], 0 offset:38 +; GCN-O0-NEXT: buffer_load_ubyte v36, off, s[16:19], 0 offset:37 +; GCN-O0-NEXT: buffer_load_ubyte v37, off, s[16:19], 0 offset:36 +; GCN-O0-NEXT: buffer_load_ubyte v38, off, s[16:19], 0 offset:35 +; GCN-O0-NEXT: buffer_load_ubyte v39, off, s[16:19], 0 offset:34 +; GCN-O0-NEXT: buffer_load_ubyte v40, off, s[16:19], 0 offset:33 +; GCN-O0-NEXT: buffer_load_ubyte v33, off, s[16:19], 0 offset:32 +; GCN-O0-NEXT: buffer_load_ubyte v42, off, s[16:19], 0 offset:47 +; GCN-O0-NEXT: buffer_load_ubyte v43, off, s[16:19], 0 offset:46 +; GCN-O0-NEXT: buffer_load_ubyte v44, off, s[16:19], 0 offset:45 +; GCN-O0-NEXT: buffer_load_ubyte v45, off, s[16:19], 0 offset:44 +; GCN-O0-NEXT: buffer_load_ubyte v46, off, s[16:19], 0 offset:43 +; GCN-O0-NEXT: buffer_load_ubyte v47, off, s[16:19], 0 offset:42 +; GCN-O0-NEXT: buffer_load_ubyte v48, off, s[16:19], 0 offset:41 +; GCN-O0-NEXT: buffer_load_ubyte v41, off, s[16:19], 0 offset:40 +; GCN-O0-NEXT: buffer_load_ubyte v50, off, s[16:19], 0 offset:55 +; GCN-O0-NEXT: buffer_load_ubyte v51, off, s[16:19], 0 offset:54 +; GCN-O0-NEXT: buffer_load_ubyte v52, off, s[16:19], 0 offset:53 +; GCN-O0-NEXT: buffer_load_ubyte v53, off, s[16:19], 0 offset:52 +; GCN-O0-NEXT: buffer_load_ubyte v54, off, s[16:19], 0 offset:51 +; GCN-O0-NEXT: buffer_load_ubyte v55, off, s[16:19], 0 offset:50 +; GCN-O0-NEXT: buffer_load_ubyte v56, off, s[16:19], 0 offset:49 +; GCN-O0-NEXT: buffer_load_ubyte v49, off, s[16:19], 0 offset:48 +; GCN-O0-NEXT: buffer_load_ubyte v58, off, s[16:19], 0 offset:63 +; GCN-O0-NEXT: buffer_load_ubyte v59, off, s[16:19], 0 offset:62 +; GCN-O0-NEXT: buffer_load_ubyte v60, off, s[16:19], 0 offset:61 +; GCN-O0-NEXT: buffer_load_ubyte v61, off, s[16:19], 0 offset:60 +; GCN-O0-NEXT: buffer_load_ubyte v62, off, s[16:19], 0 offset:59 +; GCN-O0-NEXT: buffer_load_ubyte v63, off, s[16:19], 0 offset:58 +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:57 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v57, off, s[16:19], 0 offset:56 +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:71 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:70 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:69 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:68 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:67 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:66 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:65 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:64 +; GCN-O0-NEXT: s_waitcnt 
vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:164 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:79 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:78 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:77 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:76 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:75 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:74 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:73 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:72 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:196 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:87 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:86 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:236 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:85 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:212 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:84 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:216 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:83 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:220 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:82 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:224 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:81 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:232 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:80 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:228 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:95 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:240 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:94 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:268 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 
offset:93 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:244 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:92 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:248 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:91 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:252 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:90 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:256 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:89 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:264 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:88 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:260 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:103 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:272 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:102 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:300 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:101 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:276 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:100 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:280 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:99 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:284 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:98 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:288 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:97 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:296 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:96 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:292 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:111 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:304 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:110 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:332 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:109 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:308 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:108 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:312 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:107 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:316 ; 4-byte Folded Spill +; 
GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:106 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:320 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:105 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:328 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:104 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:324 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:119 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:336 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:118 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:364 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:117 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:340 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:116 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:344 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:115 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:348 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:114 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:352 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:113 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:360 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:112 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:356 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:127 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:368 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:126 +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:125 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:372 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:124 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:376 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:123 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:380 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:122 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:384 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:121 +; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:120 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:384 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt 
vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: s_mov_b32 s7, 2 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:380 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: s_mov_b32 s6, 3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:376 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: s_mov_b32 s5, 4 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:372 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: s_mov_b32 s4, 5 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:368 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: s_mov_b32 s3, 6 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: s_mov_b32 s2, 7 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 15 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:364 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:360 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:356 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:352 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:348 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:344 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:340 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:336 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 
v0, v0, v2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 14 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:332 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:328 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:324 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:320 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:316 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:312 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:308 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:304 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 13 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:300 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:296 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:292 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:288 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 
+; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:284 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:280 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:276 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:272 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 12 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:268 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:264 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:260 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:256 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:252 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:248 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:244 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:240 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 11 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: 
s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:236 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:232 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:228 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:224 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:220 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:216 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:212 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 10 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, 
v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 9 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 8 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: 
flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v57, v57, v3 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v0, v3, v0 +; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v0 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:128 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_and_b32_e64 v63, v63, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v63, s7, v63 +; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v63 +; GCN-O0-NEXT: v_and_b32_e64 v62, v62, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v62, s6, v62 +; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v62 +; GCN-O0-NEXT: v_and_b32_e64 v61, v61, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v61, s5, v61 +; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v61 +; GCN-O0-NEXT: v_and_b32_e64 v60, v60, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v60, s4, v60 +; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v60 +; GCN-O0-NEXT: v_and_b32_e64 v59, v59, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v59, s3, v59 +; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v59 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v58, s2, v58 +; GCN-O0-NEXT: v_or_b32_e64 v59, v57, v58 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 7 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v58, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v57, s8 +; GCN-O0-NEXT: flat_store_byte v[57:58], v59 +; GCN-O0-NEXT: v_and_b32_e64 v49, v49, v3 +; GCN-O0-NEXT: v_and_b32_e64 v56, v56, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v56, v3, v56 +; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v56 +; GCN-O0-NEXT: v_and_b32_e64 v55, v55, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v55, s7, v55 +; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v55 +; GCN-O0-NEXT: v_and_b32_e64 v54, v54, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v54, s6, v54 +; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v54 +; GCN-O0-NEXT: v_and_b32_e64 v53, v53, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v53, s5, v53 +; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v53 +; GCN-O0-NEXT: v_and_b32_e64 v52, v52, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v52, s4, v52 +; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v52 +; GCN-O0-NEXT: v_and_b32_e64 v51, v51, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v51, s3, v51 +; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v51 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v50, s2, v50 +; GCN-O0-NEXT: v_or_b32_e64 v51, v49, v50 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 6 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v50, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v49, s8 +; GCN-O0-NEXT: flat_store_byte v[49:50], v51 +; GCN-O0-NEXT: v_and_b32_e64 v41, v41, v3 +; GCN-O0-NEXT: v_and_b32_e64 v48, v48, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v48, v3, v48 +; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v48 +; GCN-O0-NEXT: v_and_b32_e64 v47, v47, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v47, s7, v47 +; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v47 +; GCN-O0-NEXT: 
v_and_b32_e64 v46, v46, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v46, s6, v46 +; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v46 +; GCN-O0-NEXT: v_and_b32_e64 v45, v45, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v45, s5, v45 +; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v45 +; GCN-O0-NEXT: v_and_b32_e64 v44, v44, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v44, s4, v44 +; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v44 +; GCN-O0-NEXT: v_and_b32_e64 v43, v43, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v43, s3, v43 +; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v43 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v42, s2, v42 +; GCN-O0-NEXT: v_or_b32_e64 v43, v41, v42 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 5 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v42, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v41, s8 +; GCN-O0-NEXT: flat_store_byte v[41:42], v43 +; GCN-O0-NEXT: v_and_b32_e64 v33, v33, v3 +; GCN-O0-NEXT: v_and_b32_e64 v40, v40, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v40, v3, v40 +; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v40 +; GCN-O0-NEXT: v_and_b32_e64 v39, v39, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v39, s7, v39 +; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v39 +; GCN-O0-NEXT: v_and_b32_e64 v38, v38, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v38, s6, v38 +; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v38 +; GCN-O0-NEXT: v_and_b32_e64 v37, v37, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v37, s5, v37 +; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v37 +; GCN-O0-NEXT: v_and_b32_e64 v36, v36, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v36, s4, v36 +; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v36 +; GCN-O0-NEXT: v_and_b32_e64 v35, v35, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v35, s3, v35 +; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v35 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v34, s2, v34 +; GCN-O0-NEXT: v_or_b32_e64 v35, v33, v34 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 4 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v34, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v33, s8 +; GCN-O0-NEXT: flat_store_byte v[33:34], v35 +; GCN-O0-NEXT: v_and_b32_e64 v25, v25, v3 +; GCN-O0-NEXT: v_and_b32_e64 v32, v32, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v32, v3, v32 +; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v32 +; GCN-O0-NEXT: v_and_b32_e64 v31, v31, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v31, s7, v31 +; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v31 +; GCN-O0-NEXT: v_and_b32_e64 v30, v30, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v30, s6, v30 +; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v30 +; GCN-O0-NEXT: v_and_b32_e64 v29, v29, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v29, s5, v29 +; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v29 +; GCN-O0-NEXT: v_and_b32_e64 v28, v28, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v28, s4, v28 +; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v28 +; GCN-O0-NEXT: v_and_b32_e64 v27, v27, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v27, s3, v27 +; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v27 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v26, s2, v26 +; GCN-O0-NEXT: v_or_b32_e64 v27, v25, v26 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 3 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 
+; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v26, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v25, s8 +; GCN-O0-NEXT: flat_store_byte v[25:26], v27 +; GCN-O0-NEXT: v_and_b32_e64 v17, v17, v3 +; GCN-O0-NEXT: v_and_b32_e64 v24, v24, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v24, v3, v24 +; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v24 +; GCN-O0-NEXT: v_and_b32_e64 v23, v23, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v23, s7, v23 +; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v23 +; GCN-O0-NEXT: v_and_b32_e64 v22, v22, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v22, s6, v22 +; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v22 +; GCN-O0-NEXT: v_and_b32_e64 v21, v21, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v21, s5, v21 +; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v21 +; GCN-O0-NEXT: v_and_b32_e64 v20, v20, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v20, s4, v20 +; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v20 +; GCN-O0-NEXT: v_and_b32_e64 v19, v19, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v19, s3, v19 +; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v19 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v18, s2, v18 +; GCN-O0-NEXT: v_or_b32_e64 v19, v17, v18 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 2 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v18, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v17, s8 +; GCN-O0-NEXT: flat_store_byte v[17:18], v19 +; GCN-O0-NEXT: v_and_b32_e64 v9, v9, v3 +; GCN-O0-NEXT: v_and_b32_e64 v16, v16, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v16, v3, v16 +; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v16 +; GCN-O0-NEXT: v_and_b32_e64 v15, v15, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v15, s7, v15 +; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v15 +; GCN-O0-NEXT: v_and_b32_e64 v14, v14, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v14, s6, v14 +; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v14 +; GCN-O0-NEXT: v_and_b32_e64 v13, v13, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v13, s5, v13 +; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v13 +; GCN-O0-NEXT: v_and_b32_e64 v12, v12, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v12, s4, v12 +; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GCN-O0-NEXT: v_and_b32_e64 v11, v11, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v11, s3, v11 +; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v11 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v10, s2, v10 +; GCN-O0-NEXT: v_or_b32_e64 v11, v9, v10 +; GCN-O0-NEXT: s_mov_b64 s[12:13], 1 +; GCN-O0-NEXT: s_mov_b32 s8, s0 +; GCN-O0-NEXT: s_mov_b32 s9, s1 +; GCN-O0-NEXT: s_mov_b32 s11, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s13 +; GCN-O0-NEXT: s_add_u32 s8, s8, s11 +; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s9, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s8 +; GCN-O0-NEXT: flat_store_byte v[9:10], v11 +; GCN-O0-NEXT: s_waitcnt vmcnt(7) +; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 +; GCN-O0-NEXT: v_and_b32_e64 v8, v8, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v8, v3, v8 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v8 +; GCN-O0-NEXT: v_and_b32_e64 v7, v7, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v7, s7, v7 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GCN-O0-NEXT: v_and_b32_e64 v6, v6, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v6, s6, v6 +; 
GCN-O0-NEXT: v_or_b32_e64 v0, v0, v6 +; GCN-O0-NEXT: v_and_b32_e64 v5, v5, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v5, s5, v5 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v5 +; GCN-O0-NEXT: v_and_b32_e64 v4, v4, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v4, s4, v4 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v4 +; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 +; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 +; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 +; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 +; GCN-O0-NEXT: flat_store_byte v[0:1], v2 +; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <128 x i1> %vec, i1 1, i32 %sel store <128 x i1> %v, ptr addrspace(1) %out @@ -1910,6 +5682,361 @@ define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) { ; GCN-NEXT: v_cndmask_b32_e64 v30, 1.0, v30, s[58:59] ; GCN-NEXT: v_cndmask_b32_e64 v31, 1.0, v31, s[60:61] ; GCN-NEXT: ; return to shader part epilog +; +; GCN-O0-LABEL: float32_inselt_vec: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-O0-NEXT: s_mov_b32 s10, -1 +; GCN-O0-NEXT: s_mov_b32 s11, 0xe80000 +; GCN-O0-NEXT: s_add_u32 s8, s8, s0 +; GCN-O0-NEXT: s_addc_u32 s9, s9, 0 +; GCN-O0-NEXT: buffer_store_dword v32, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v32, v31 +; GCN-O0-NEXT: v_mov_b32_e32 v33, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v34, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v35, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v36, v27 +; GCN-O0-NEXT: v_mov_b32_e32 v37, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v38, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v39, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v40, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v41, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v42, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v43, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v44, v19 +; GCN-O0-NEXT: v_mov_b32_e32 v45, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v46, v17 +; GCN-O0-NEXT: v_mov_b32_e32 v47, v16 +; GCN-O0-NEXT: v_mov_b32_e32 v48, v15 +; GCN-O0-NEXT: v_mov_b32_e32 v49, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v50, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v51, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v52, v11 +; GCN-O0-NEXT: v_mov_b32_e32 v53, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v54, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v55, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v56, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v57, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v58, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v59, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v60, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v61, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v62, v1 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v62 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v61 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v60 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v59 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v58 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v57 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v56 +; GCN-O0-NEXT: v_mov_b32_e32 v8, v55 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v54 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v53 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v52 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v51 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v50 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v49 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v48 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v47 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v46 +; 
GCN-O0-NEXT: v_mov_b32_e32 v18, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v43 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v42 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v40 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v39 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v38 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v37 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v36 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v35 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v32 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v32, 1.0 +; GCN-O0-NEXT: buffer_store_dword v32, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[0:1], exec +; 
GCN-O0-NEXT: ; implicit-def: $vgpr64 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v64, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v64, s1, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v64, off, s[8:11], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 +; GCN-O0-NEXT: .LBB22_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v64, off, s[8:11], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v64, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v64, 3 +; GCN-O0-NEXT: buffer_load_dword v0, off, 
s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s2, v33 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v33 +; GCN-O0-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GCN-O0-NEXT: s_mov_b32 m0, s2 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v32 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill +; 
GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: 
buffer_store_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] +; GCN-O0-NEXT: v_writelane_b32 v64, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v64, s3, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v64, off, s[8:11], 0 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_execnz .LBB22_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v64, off, s[8:11], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v64, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v64, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v36, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v40, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:312 ; 4-byte 
Folded Reload +; GCN-O0-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v44, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v48, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v52, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v56, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v60, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(14) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v31 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v32 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v33 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v34 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v35 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v36 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v37 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v38 +; GCN-O0-NEXT: v_mov_b32_e32 v8, v39 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v40 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v41 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v42 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v43 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v44 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v45 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v46 +; GCN-O0-NEXT: v_mov_b32_e32 v16, v47 +; GCN-O0-NEXT: v_mov_b32_e32 v17, v48 +; GCN-O0-NEXT: s_waitcnt vmcnt(13) +; GCN-O0-NEXT: v_mov_b32_e32 v18, v49 +; GCN-O0-NEXT: s_waitcnt vmcnt(12) +; GCN-O0-NEXT: v_mov_b32_e32 v19, v50 +; GCN-O0-NEXT: s_waitcnt vmcnt(11) +; GCN-O0-NEXT: v_mov_b32_e32 v20, v51 +; GCN-O0-NEXT: s_waitcnt vmcnt(10) +; GCN-O0-NEXT: v_mov_b32_e32 v21, v52 +; GCN-O0-NEXT: s_waitcnt vmcnt(9) +; GCN-O0-NEXT: v_mov_b32_e32 v22, v53 +; GCN-O0-NEXT: s_waitcnt vmcnt(8) +; GCN-O0-NEXT: v_mov_b32_e32 v23, v54 +; GCN-O0-NEXT: s_waitcnt vmcnt(7) +; GCN-O0-NEXT: v_mov_b32_e32 v24, v55 +; GCN-O0-NEXT: s_waitcnt vmcnt(6) +; GCN-O0-NEXT: v_mov_b32_e32 v25, v56 +; GCN-O0-NEXT: s_waitcnt vmcnt(5) +; GCN-O0-NEXT: v_mov_b32_e32 v26, v57 +; GCN-O0-NEXT: s_waitcnt vmcnt(4) +; GCN-O0-NEXT: v_mov_b32_e32 v27, v58 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_mov_b32_e32 v28, v59 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_mov_b32_e32 v29, v60 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_mov_b32_e32 v30, v61 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 
v31, v62 +; GCN-O0-NEXT: ; return to shader part epilog entry: %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel ret <32 x float> %v @@ -1945,7 +6072,1843 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { ; GCN-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc ; GCN-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: double8_inselt_vec: +; GCN-O0: ; %bb.0: ; %entry +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: v_mov_b32_e32 v17, v15 +; GCN-O0-NEXT: v_mov_b32_e32 v18, v14 +; GCN-O0-NEXT: v_mov_b32_e32 v19, v13 +; GCN-O0-NEXT: v_mov_b32_e32 v20, v12 +; GCN-O0-NEXT: v_mov_b32_e32 v21, v11 +; GCN-O0-NEXT: v_mov_b32_e32 v22, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v23, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v24, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v25, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v26, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v27, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v28, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v29, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v30, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v31, v1 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v31 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v30 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v29 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v28 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v27 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v26 +; GCN-O0-NEXT: v_mov_b32_e32 v7, v25 +; GCN-O0-NEXT: v_mov_b32_e32 v8, v24 +; GCN-O0-NEXT: v_mov_b32_e32 v9, v23 +; GCN-O0-NEXT: v_mov_b32_e32 v10, v22 +; GCN-O0-NEXT: v_mov_b32_e32 v11, v21 +; GCN-O0-NEXT: v_mov_b32_e32 v12, v20 +; GCN-O0-NEXT: v_mov_b32_e32 v13, v19 +; GCN-O0-NEXT: v_mov_b32_e32 v14, v18 +; GCN-O0-NEXT: v_mov_b32_e32 v15, v17 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, 1 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v16, s4, v16 +; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], 
s32 offset:72 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 +; GCN-O0-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v33, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v33, s5, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s4 +; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: v_writelane_b32 v33, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v33, s5, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB23_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v33, 4 +; GCN-O0-NEXT: v_readlane_b32 s5, v33, 5 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v17 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v17 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v16 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 
offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v33, s6, 4 +; GCN-O0-NEXT: v_writelane_b32 v33, s7, 5 +; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB23_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v33, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v33, 3 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v33, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v33, 1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b32 s4, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v16, s4 +; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: v_writelane_b32 v33, s4, 6 +; GCN-O0-NEXT: v_writelane_b32 v33, s5, 7 +; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] +; GCN-O0-NEXT: s_waitcnt vmcnt(14) +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(14) +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte 
Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(14) +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(14) +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB23_4: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v33, 8 +; GCN-O0-NEXT: v_readlane_b32 s5, v33, 9 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v17 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v17 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movreld_b32_e32 v1, v16 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:284 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v33, s6, 8 +; GCN-O0-NEXT: v_writelane_b32 v33, s7, 9 +; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB23_4 +; GCN-O0-NEXT: ; %bb.5: +; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v33, 6 +; GCN-O0-NEXT: v_readlane_b32 s5, v33, 7 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.6: +; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte 
Folded Reload +; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(14) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v15 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v16 +; GCN-O0-NEXT: s_waitcnt vmcnt(13) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v17 +; GCN-O0-NEXT: s_waitcnt vmcnt(12) +; GCN-O0-NEXT: v_mov_b32_e32 v3, v18 +; GCN-O0-NEXT: s_waitcnt vmcnt(11) +; GCN-O0-NEXT: v_mov_b32_e32 v4, v19 +; GCN-O0-NEXT: s_waitcnt vmcnt(10) +; GCN-O0-NEXT: v_mov_b32_e32 v5, v20 +; GCN-O0-NEXT: s_waitcnt vmcnt(9) +; GCN-O0-NEXT: v_mov_b32_e32 v6, v21 +; GCN-O0-NEXT: s_waitcnt vmcnt(8) +; GCN-O0-NEXT: v_mov_b32_e32 v7, v22 +; GCN-O0-NEXT: s_waitcnt vmcnt(7) +; GCN-O0-NEXT: v_mov_b32_e32 v8, v23 +; GCN-O0-NEXT: s_waitcnt vmcnt(6) +; GCN-O0-NEXT: v_mov_b32_e32 v9, v24 +; GCN-O0-NEXT: s_waitcnt vmcnt(5) +; GCN-O0-NEXT: v_mov_b32_e32 v10, v25 +; GCN-O0-NEXT: s_waitcnt vmcnt(4) +; GCN-O0-NEXT: v_mov_b32_e32 v11, v26 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_mov_b32_e32 v12, v27 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_mov_b32_e32 v13, v28 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_mov_b32_e32 v14, v29 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v15, v30 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] entry: %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel ret <8 x double> %v } + +define <3 x i32> @insert_dyn_i32_3(<3 x i32> inreg %arg, i32 %idx, i32 %val) { +; GCN-LABEL: insert_dyn_i32_3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v4 +; GCN-NEXT: v_mov_b32_e32 v1, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_i32_3: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 
4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b32 s4, s16
+; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6
+; GCN-O0-NEXT: s_mov_b32 s5, s17
+; GCN-O0-NEXT: s_mov_b32 s6, s18
+; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
+; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
+; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
+; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0
+; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1
+; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GCN-O0-NEXT: .LBB24_1: ; =>This Inner Loop Header: Depth=1
+; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
+; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2
+; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
+; GCN-O0-NEXT: v_readfirstlane_b32 s6, v4
+; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v4
+; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GCN-O0-NEXT: s_mov_b32 m0, s6
+; GCN-O0-NEXT: v_movreld_b32_e32 v0, v3
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2
+; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3
+; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GCN-O0-NEXT: s_cbranch_execnz .LBB24_1
+; GCN-O0-NEXT: ; %bb.2:
+; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
+; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
+; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0
+; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1
+; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-O0-NEXT: ; %bb.3:
+; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_waitcnt vmcnt(2)
+; GCN-O0-NEXT: v_mov_b32_e32 v0, v2
+; GCN-O0-NEXT: s_waitcnt vmcnt(1)
+; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
+; GCN-O0-NEXT: v_mov_b32_e32 v2, v4
+; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
+; GCN-O0-NEXT: s_setpc_b64 s[30:31]
+ %x = insertelement <3 x i32> %arg, i32 %val, i32 %idx
+ ret <3 x i32> %x
+}
+
+define <3 x i32> @insert_dyn_inreg_i32_3(<3 x i32> inreg %arg, i32 inreg %idx, i32 %val) {
+; GCN-LABEL: insert_dyn_inreg_i32_3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_cmp_eq_u32 s19, 0
+; GCN-NEXT: v_mov_b32_e32 v1, s16
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s19, 1
+; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, s17
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s19, 2
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-O0-LABEL: insert_dyn_inreg_i32_3:
+; GCN-O0: ; %bb.0:
+; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-O0-NEXT: s_mov_b32 s4, s16
+; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6
+; GCN-O0-NEXT: s_mov_b32 s5, s17
+; GCN-O0-NEXT: s_mov_b32 s6, s18
+; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v3, s5
+; GCN-O0-NEXT: v_mov_b32_e32 v4, s6
+; GCN-O0-NEXT: s_mov_b32 m0, s19
+; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, v2
+; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
+; GCN-O0-NEXT: v_mov_b32_e32 v2, v4
+; GCN-O0-NEXT: s_setpc_b64 s[30:31]
+ %x = insertelement <3 x i32> %arg, i32 %val, i32 %idx
+ ret <3 x i32> %x
+}
+
+define <3 x float> @insert_dyn_float_3(<3 x float> inreg %arg, i32 %idx, float %val) {
+; GCN-LABEL: insert_dyn_float_3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s16
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s17
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-O0-LABEL: insert_dyn_float_3:
+; GCN-O0: ; %bb.0:
+; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b32 s4, s16
+; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def
$sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB26_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v4 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v3 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB26_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; 
GCN-O0-NEXT: v_mov_b32_e32 v0, v2 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_mov_b32_e32 v1, v3 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v4 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <3 x float> %arg, float %val, i32 %idx + ret <3 x float> %x +} + +define <3 x float> @insert_dyn_inreg_float_3(<3 x float> inreg %arg, i32 inreg %idx, float %val) { +; GCN-LABEL: insert_dyn_inreg_float_3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s19, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s19, 1 +; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s19, 2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_inreg_float_3: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 +; GCN-O0-NEXT: s_mov_b32 m0, s19 +; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v4 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <3 x float> %arg, float %val, i32 %idx + ret <3 x float> %x +} + +define <5 x i32> @insert_dyn_i32_5(<5 x i32> inreg %arg, i32 %idx, i32 %val) { +; GCN-LABEL: insert_dyn_i32_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v6 +; GCN-NEXT: v_mov_b32_e32 v1, v5 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_i32_5: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; 
GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v9, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v5 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded 
Spill
+; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
+; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GCN-O0-NEXT: s_cbranch_execnz .LBB28_1
+; GCN-O0-NEXT: ; %bb.2:
+; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
+; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0
+; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1
+; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-O0-NEXT: ; %bb.3:
+; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_waitcnt vmcnt(4)
+; GCN-O0-NEXT: v_mov_b32_e32 v0, v4
+; GCN-O0-NEXT: s_waitcnt vmcnt(3)
+; GCN-O0-NEXT: v_mov_b32_e32 v1, v5
+; GCN-O0-NEXT: s_waitcnt vmcnt(2)
+; GCN-O0-NEXT: v_mov_b32_e32 v2, v6
+; GCN-O0-NEXT: s_waitcnt vmcnt(1)
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v7
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
+; GCN-O0-NEXT: v_mov_b32_e32 v4, v8
+; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
+; GCN-O0-NEXT: s_setpc_b64 s[30:31]
+ %x = insertelement <5 x i32> %arg, i32 %val, i32 %idx
+ ret <5 x i32> %x
+}
+
+define <5 x i32> @insert_dyn_inreg_i32_5(<5 x i32> inreg %arg, i32 inreg %idx, i32 %val) {
+; GCN-LABEL: insert_dyn_inreg_i32_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_cmp_eq_u32 s21, 0
+; GCN-NEXT: v_mov_b32_e32 v1, s16
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s21, 1
+; GCN-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, s17
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s21, 2
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s21, 3
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s21, 4
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v4, s20
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, v5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-O0-LABEL: insert_dyn_inreg_i32_5:
+; GCN-O0: ; %bb.0:
+; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-O0-NEXT: s_mov_b32 s4, s16
+; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
+; GCN-O0-NEXT: s_mov_b32 s5, s17
+; GCN-O0-NEXT: s_mov_b32 s6, s18
+; GCN-O0-NEXT: s_mov_b32 s7, s19
+; GCN-O0-NEXT: s_mov_b32 s8, s20
+; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
+; GCN-O0-NEXT: v_mov_b32_e32 v4, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v5, s5
+; GCN-O0-NEXT: v_mov_b32_e32 v6, s6
+; GCN-O0-NEXT: v_mov_b32_e32 v7, s7
+; GCN-O0-NEXT: v_mov_b32_e32 v8, s8
+; GCN-O0-NEXT: s_mov_b32 m0, s21
+; GCN-O0-NEXT: v_movreld_b32_e32 v4, v0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, v4
+; GCN-O0-NEXT: v_mov_b32_e32 v1, v5
+; GCN-O0-NEXT: v_mov_b32_e32 v2, v6
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v7
+; GCN-O0-NEXT: v_mov_b32_e32 v4, v8
+; GCN-O0-NEXT: s_setpc_b64 s[30:31]
+ %x = insertelement <5 x i32> %arg, i32 %val, i32 %idx
+ ret <5 x i32> %x
+}
+
+define <5 x float> @insert_dyn_float_5(<5 x float> inreg %arg, i32 %idx, float %val) {
+; GCN-LABEL: insert_dyn_float_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s16
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s17
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v4, s20
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, v6
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-O0-LABEL: insert_dyn_float_5:
+; GCN-O0: ; %bb.0:
+; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b32 s4, s16
+; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
+; GCN-O0-NEXT: s_mov_b32 s5, s17
+; GCN-O0-NEXT: s_mov_b32 s6, s18
+; GCN-O0-NEXT: s_mov_b32 s7, s19
+; GCN-O0-NEXT: s_mov_b32 s8, s20
+; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
+; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
+; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
+; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
+; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane
+; GCN-O0-NEXT: v_writelane_b32 v9, s4, 0
+; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1
+; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
+; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GCN-O0-NEXT: .LBB30_1: ; =>This Inner Loop Header: Depth=1
+; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
+; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2
+; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v3, off,
s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v5 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB30_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(4) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_mov_b32_e32 v1, v5 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v6 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_mov_b32_e32 v3, v7 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v4, v8 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <5 x float> %arg, float %val, i32 %idx + ret <5 x float> %x +} + +define <5 x float> @insert_dyn_inreg_float_5(<5 x float> inreg %arg, i32 inreg %idx, float %val) { +; GCN-LABEL: insert_dyn_inreg_float_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 
s21, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s21, 1 +; GCN-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s21, 2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s21, 3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s21, 4 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v5 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_inreg_float_5: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s8 +; GCN-O0-NEXT: s_mov_b32 m0, s21 +; GCN-O0-NEXT: v_movreld_b32_e32 v4, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v8 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <5 x float> %arg, float %val, i32 %idx + ret <5 x float> %x +} + +define <6 x i32> @insert_dyn_i32_6(<6 x i32> inreg %arg, i32 %idx, i32 %val) { +; GCN-LABEL: insert_dyn_i32_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v6 +; GCN-NEXT: v_mov_b32_e32 v1, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_i32_6: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: 
s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr11 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v11, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v11, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB32_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v11, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v11, 3 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v6 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v11, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v11, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB32_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v11, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v11, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(5) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 +; GCN-O0-NEXT: s_waitcnt vmcnt(4) +; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_mov_b32_e32 v3, v8 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <6 x i32> %arg, i32 %val, i32 %idx + ret <6 x i32> %x +} + +define <6 x i32> @insert_dyn_inreg_i32_6(<6 x i32> inreg %arg, i32 inreg %idx, i32 %val) { +; GCN-LABEL: insert_dyn_inreg_i32_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s22, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 1 +; GCN-NEXT: v_cndmask_b32_e32 v6, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 4 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v6 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_inreg_i32_6: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; 
GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: s_mov_b32 m0, s22 +; GCN-O0-NEXT: v_movreld_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <6 x i32> %arg, i32 %val, i32 %idx + ret <6 x i32> %x +} + +define <6 x float> @insert_dyn_float_6(<6 x float> inreg %arg, i32 %idx, float %val) { +; GCN-LABEL: insert_dyn_float_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v6 +; GCN-NEXT: v_mov_b32_e32 v1, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_float_6: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr11 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v11, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v11, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB34_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v11, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v11, 3 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v6 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v11, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v11, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB34_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v11, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, 
v11, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(5) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 +; GCN-O0-NEXT: s_waitcnt vmcnt(4) +; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_mov_b32_e32 v3, v8 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <6 x float> %arg, float %val, i32 %idx + ret <6 x float> %x +} + +define <6 x float> @insert_dyn_inreg_float_6(<6 x float> inreg %arg, i32 inreg %idx, float %val) { +; GCN-LABEL: insert_dyn_inreg_float_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s22, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 1 +; GCN-NEXT: v_cndmask_b32_e32 v6, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 4 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s22, 5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v6 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_inreg_float_6: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 +; GCN-O0-NEXT: s_mov_b32 m0, s22 +; GCN-O0-NEXT: v_movreld_b32_e32 v5, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v8 +; GCN-O0-NEXT: 
v_mov_b32_e32 v4, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <6 x float> %arg, float %val, i32 %idx + ret <6 x float> %x +} + +define <7 x i32> @insert_dyn_i32_7(<7 x i32> inreg %arg, i32 %idx, i32 %val) { +; GCN-LABEL: insert_dyn_i32_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v8, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_mov_b32_e32 v1, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_i32_7: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: s_mov_b32 s10, s22 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v13, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v13, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: 
.LBB36_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v13, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v13, 3 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v8 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v8 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v7 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v13, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v13, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB36_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v13, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v13, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(6) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v6 +; GCN-O0-NEXT: s_waitcnt vmcnt(5) +; GCN-O0-NEXT: v_mov_b32_e32 v1, v7 +; GCN-O0-NEXT: s_waitcnt vmcnt(4) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v8 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_mov_b32_e32 v3, v9 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_mov_b32_e32 v4, v10 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_mov_b32_e32 v5, v11 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <7 x i32> %arg, i32 %val, i32 %idx + ret <7 x i32> %x +} + +define <7 x i32> @insert_dyn_inreg_i32_7(<7 x i32> inreg %arg, i32 inreg %idx, i32 %val) { +; GCN-LABEL: insert_dyn_inreg_i32_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s23, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 1 +; GCN-NEXT: v_cndmask_b32_e32 v7, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 4 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 6 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_inreg_i32_7: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: s_mov_b32 s10, s22 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v8, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 +; GCN-O0-NEXT: s_mov_b32 m0, s23 +; GCN-O0-NEXT: v_movreld_b32_e32 v6, v0 +; GCN-O0-NEXT: v_mov_b32_e32 
v0, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v11 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <7 x i32> %arg, i32 %val, i32 %idx + ret <7 x i32> %x +} + +define <7 x float> @insert_dyn_float_7(<7 x float> inreg %arg, i32 %idx, float %val) { +; GCN-LABEL: insert_dyn_float_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v8, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_mov_b32_e32 v1, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_float_7: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: s_mov_b32 s10, s22 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 +; GCN-O0-NEXT: s_mov_b64 s[4:5], exec +; GCN-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane +; GCN-O0-NEXT: v_writelane_b32 v13, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v13, s5, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-O0-NEXT: .LBB38_1: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v13, 2 +; GCN-O0-NEXT: v_readlane_b32 s5, v13, 3 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s6, v8 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v8 +; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GCN-O0-NEXT: s_mov_b32 m0, s6 +; GCN-O0-NEXT: v_movreld_b32_e32 v0, v7 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-O0-NEXT: v_writelane_b32 v13, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v13, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_execnz .LBB38_1 +; GCN-O0-NEXT: ; %bb.2: +; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v13, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v13, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: +; GCN-O0-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(6) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v6 +; GCN-O0-NEXT: s_waitcnt vmcnt(5) +; GCN-O0-NEXT: v_mov_b32_e32 v1, v7 +; GCN-O0-NEXT: s_waitcnt vmcnt(4) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v8 +; GCN-O0-NEXT: s_waitcnt vmcnt(3) +; GCN-O0-NEXT: v_mov_b32_e32 v3, v9 +; GCN-O0-NEXT: s_waitcnt vmcnt(2) +; GCN-O0-NEXT: v_mov_b32_e32 v4, v10 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_mov_b32_e32 v5, v11 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <7 x float> %arg, float %val, i32 %idx + ret <7 x float> %x +} + +define <7 x float> @insert_dyn_inreg_float_7(<7 x float> inreg %arg, i32 inreg %idx, float %val) { +; GCN-LABEL: insert_dyn_inreg_float_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s23, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 1 +; GCN-NEXT: v_cndmask_b32_e32 v7, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 4 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, 6 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: insert_dyn_inreg_float_7: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, s16 +; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: s_mov_b32 s5, s17 +; GCN-O0-NEXT: s_mov_b32 s6, s18 +; GCN-O0-NEXT: s_mov_b32 s7, s19 +; GCN-O0-NEXT: s_mov_b32 s8, s20 +; GCN-O0-NEXT: s_mov_b32 s9, s21 +; GCN-O0-NEXT: s_mov_b32 s10, s22 +; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; GCN-O0-NEXT: v_mov_b32_e32 v12, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v11, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v10, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v9, s7 +; GCN-O0-NEXT: 
v_mov_b32_e32 v8, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v7, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 +; GCN-O0-NEXT: s_mov_b32 m0, s23 +; GCN-O0-NEXT: v_movreld_b32_e32 v6, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v6 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v7 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, v9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, v10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v11 +; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %x = insertelement <7 x float> %arg, float %val, i32 %idx + ret <7 x float> %x +} From 2c1decb7f76661cf3b77b46b7ac3816401ece3cd Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 11 Dec 2025 03:31:22 -0500 Subject: [PATCH 31/49] [libc++] Don't instantiate __split_buffer with an allocator reference (#171651) Allocators should be extremely cheap, if not free, to copy. Furthermore, we have requirements on allocator types that copies must compare equal, and that move and copy must be the same. Hence, taking an allocator by reference should not provide benefits beyond making a copy of it. However, taking the allocator by reference leads to complexity in __split_buffer, which can be removed if we stop using that pattern. --- libcxx/include/__split_buffer | 45 ++++++++++++++------------------ libcxx/include/__vector/vector.h | 34 ++++++++++++------------ libcxx/include/deque | 16 ++++++------ 3 files changed, 45 insertions(+), 50 deletions(-) diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer index 1e05e4df8ba0f..d6176f8ca2749 100644 --- a/libcxx/include/__split_buffer +++ b/libcxx/include/__split_buffer @@ -33,7 +33,6 @@ #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_destructible.h> #include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/remove_reference.h> #include <__utility/forward.h> #include <__utility/move.h> @@ -54,8 +53,7 @@ class __split_buffer_pointer_layout { protected: using value_type = _Tp; using allocator_type = _Allocator; - using __alloc_rr _LIBCPP_NODEBUG = __libcpp_remove_reference_t; - using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<__alloc_rr>; + using __alloc_traits _LIBCPP_NODEBUG = allocator_traits; using reference = value_type&; using const_reference = const value_type&; using size_type = typename __alloc_traits::size_type; @@ -159,9 +157,9 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { return *(__end_ - 1); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_without_allocator( - __split_buffer_pointer_layout<__split_buffer, + __split_buffer_pointer_layout<__split_buffer, value_type, - __alloc_rr&>& __other) _NOEXCEPT { + allocator_type>& __other) _NOEXCEPT { std::swap(__front_cap_, __other.__front_cap_); std::swap(__begin_, __other.__begin_); std::swap(__back_cap_, __other.__back_cap_); @@ -207,8 +205,7 @@ class __split_buffer_size_layout { protected: using value_type = _Tp; using allocator_type = _Allocator; - using __alloc_rr _LIBCPP_NODEBUG = __libcpp_remove_reference_t; - using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<__alloc_rr>; + using __alloc_traits _LIBCPP_NODEBUG = allocator_traits; using reference = value_type&; using const_reference = const value_type&; using size_type = typename __alloc_traits::size_type; @@ -316,9 +313,9 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_without_allocator( - __split_buffer_pointer_layout<__split_buffer, + __split_buffer_pointer_layout<__split_buffer, value_type, - 
__alloc_rr&>& __other) _NOEXCEPT { + allocator_type>& __other) _NOEXCEPT { std::swap(__front_cap_, __other.__front_cap_); std::swap(__begin_, __other.__begin_); std::swap(__cap_, __other.__cap_); @@ -386,8 +383,7 @@ private: // protected: // using value_type = _Tp; // using allocator_type = _Allocator; -// using __alloc_rr = __libcpp_remove_reference_t; -// using __alloc_traits = allocator_traits<__alloc_rr>; +// using __alloc_traits = allocator_traits; // using reference = value_type&; // using const_reference = const value_type&; // using size_type = typename __alloc_traits::size_type; @@ -462,7 +458,6 @@ public: using __base_type::__set_sentinel; using __base_type::__set_valid_range; - using typename __base_type::__alloc_rr; using typename __base_type::__alloc_traits; using typename __base_type::allocator_type; using typename __base_type::const_iterator; @@ -489,18 +484,18 @@ public: _LIBCPP_HIDE_FROM_ABI __split_buffer() = default; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(__alloc_rr& __a) : __base_type(__a) {} + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(allocator_type& __a) : __base_type(__a) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(const __alloc_rr& __a) + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(const allocator_type& __a) : __base_type(__a) {} _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI - __split_buffer(size_type __cap, size_type __start, __alloc_rr& __a); + __split_buffer(size_type __cap, size_type __start, allocator_type& __a); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c) _NOEXCEPT_(is_nothrow_move_constructible::value); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c, const __alloc_rr& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c, const allocator_type& __a); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer& operator=(__split_buffer&& __c) _NOEXCEPT_((__alloc_traits::propagate_on_container_move_assignment::value && @@ -560,7 +555,7 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last, true_type) _NOEXCEPT; _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void swap(__split_buffer& __x) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__alloc_rr>); + _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __invariants() const { if (__front_cap() == nullptr) { @@ -589,7 +584,7 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __swap_without_allocator(__split_buffer& __other) _NOEXCEPT { + __swap_without_allocator(__split_buffer& __other) _NOEXCEPT { __base_type::__swap_without_allocator(__other); } @@ -653,7 +648,7 @@ template class _Lay template _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::__construct_at_end_with_sentinel(_Iterator __first, _Sentinel __last) { - __alloc_rr& __a = __get_allocator(); + allocator_type& __a = __get_allocator(); for (; __first != __last; ++__first) { if (__back_spare() == 0) { size_type __old_cap = capacity(); @@ -718,7 +713,7 @@ __split_buffer<_Tp, _Allocator, _Layout>::__destruct_at_end(pointer __new_last, template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 -__split_buffer<_Tp, _Allocator, 
_Layout>::__split_buffer(size_type __cap, size_type __start, __alloc_rr& __a) +__split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(size_type __cap, size_type __start, allocator_type& __a) : __base_type(__a) { _LIBCPP_ASSERT_INTERNAL(__cap >= __start, "can't have a start point outside the capacity"); if (__cap > 0) { @@ -748,7 +743,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator, _Layout>::__split_ template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 -__split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(__split_buffer&& __c, const __alloc_rr& __a) +__split_buffer<_Tp, _Allocator, _Layout>::__split_buffer(__split_buffer&& __c, const allocator_type& __a) : __base_type(__a) { if (__a == __c.__get_allocator()) { __set_data(__c.__front_cap()); @@ -781,7 +776,7 @@ __split_buffer<_Tp, _Allocator, _Layout>::operator=(__split_buffer&& __c) template class _Layout> _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::swap(__split_buffer& __x) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__alloc_rr>) { + _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v) { __base_type::swap(__x); } @@ -791,7 +786,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::shr #if _LIBCPP_HAS_EXCEPTIONS try { #endif // _LIBCPP_HAS_EXCEPTIONS - __split_buffer __t(size(), 0, __get_allocator()); + __split_buffer __t(size(), 0, __get_allocator()); if (__t.capacity() < capacity()) { __t.__construct_at_end(move_iterator(begin()), move_iterator(end())); __t.__set_sentinel(size()); @@ -818,7 +813,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::emp __set_valid_range(std::move_backward(begin(), __end, __new_end), __new_end); } else { size_type __c = std::max(2 * capacity(), 1); - __split_buffer __t(__c, (__c + 3) / 4, __get_allocator()); + __split_buffer __t(__c, (__c + 3) / 4, __get_allocator()); __t.__construct_at_end(move_iterator(begin()), move_iterator(__end)); __base_type::__swap_without_allocator(__t); } @@ -840,7 +835,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator, _Layout>::emp __set_valid_range(begin() - __d, __end); } else { size_type __c = std::max(2 * capacity(), 1); - __split_buffer __t(__c, __c / 4, __get_allocator()); + __split_buffer __t(__c, __c / 4, __get_allocator()); __t.__construct_at_end(move_iterator(begin()), move_iterator(__end)); __base_type::__swap_without_allocator(__t); } diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h index 4961a5fcb2067..93358d863492e 100644 --- a/libcxx/include/__vector/vector.h +++ b/libcxx/include/__vector/vector.h @@ -687,9 +687,9 @@ class vector { } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __swap_out_circular_buffer(__split_buffer& __v); + __swap_out_circular_buffer(__split_buffer& __v); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer - __swap_out_circular_buffer(__split_buffer& __v, pointer __p); + __swap_out_circular_buffer(__split_buffer& __v, pointer __p); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_range(pointer __from_s, pointer __from_e, pointer __to); _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign(vector& __c, true_type) @@ -810,7 +810,7 @@ class vector { return __p; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_layouts(__split_buffer<_Tp, allocator_type&>& __sb) { + _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void 
__swap_layouts(__split_buffer<_Tp, allocator_type>& __sb) { auto __vector_begin = __begin_; auto __vector_sentinel = __end_; auto __vector_cap = __cap_; @@ -855,7 +855,7 @@ vector(from_range_t, _Range&&, _Alloc = _Alloc()) -> vector _LIBCPP_CONSTEXPR_SINCE_CXX20 void -vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v) { +vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v) { __annotate_delete(); auto __new_begin = __v.begin() - size(); std::__uninitialized_allocator_relocate( @@ -874,7 +874,7 @@ vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer -vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v, pointer __p) { +vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v, pointer __p) { __annotate_delete(); pointer __ret = __v.begin(); @@ -1074,7 +1074,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::reserve(size_type __ if (__n > capacity()) { if (__n > max_size()) this->__throw_length_error(); - __split_buffer __v(__n, size(), this->__alloc_); + __split_buffer __v(__n, size(), this->__alloc_); __swap_out_circular_buffer(__v); } } @@ -1085,7 +1085,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::shrink_to_fit() _NOE #if _LIBCPP_HAS_EXCEPTIONS try { #endif // _LIBCPP_HAS_EXCEPTIONS - __split_buffer __v(size(), size(), this->__alloc_); + __split_buffer __v(size(), size(), this->__alloc_); // The Standard mandates shrink_to_fit() does not increase the capacity. // With equal capacity keep the existing buffer. This avoids extra work // due to swapping the elements. @@ -1102,7 +1102,7 @@ template template _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args) { - __split_buffer __v(__recommend(size() + 1), size(), this->__alloc_); + __split_buffer __v(__recommend(size() + 1), size(), this->__alloc_); // __v.emplace_back(std::forward<_Args>(__args)...); pointer __end = __v.end(); __alloc_traits::construct(this->__alloc_, std::__to_address(__end), std::forward<_Args>(__args)...); @@ -1205,7 +1205,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, const_reference __x) *__p = *__xr; } } else { - __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); __v.emplace_back(__x); __p = __swap_out_circular_buffer(__v, __p); } @@ -1224,7 +1224,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, value_type&& __x) { *__p = std::move(__x); } } else { - __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); __v.emplace_back(std::move(__x)); __p = __swap_out_circular_buffer(__v, __p); } @@ -1245,7 +1245,7 @@ vector<_Tp, _Allocator>::emplace(const_iterator __position, _Args&&... 
__args) { *__p = std::move(__tmp.get()); } } else { - __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_); __v.emplace_back(std::forward<_Args>(__args)...); __p = __swap_out_circular_buffer(__v, __p); } @@ -1273,7 +1273,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, size_type __n, const_ std::fill_n(__p, __n, *__xr); } } else { - __split_buffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_); __v.__construct_at_end(__n, __x); __p = __swap_out_circular_buffer(__v, __p); } @@ -1294,11 +1294,11 @@ vector<_Tp, _Allocator>::__insert_with_sentinel(const_iterator __position, _Inpu if (__first == __last) (void)std::rotate(__p, __old_last, this->__end_); else { - __split_buffer __v(__alloc_); + __split_buffer __v(__alloc_); auto __guard = std::__make_exception_guard( _AllocatorDestroyRangeReverse(__alloc_, __old_last, this->__end_)); __v.__construct_at_end_with_sentinel(std::move(__first), std::move(__last)); - __split_buffer __merged( + __split_buffer __merged( __recommend(size() + __v.size()), __off, __alloc_); // has `__off` positions available at the front std::__uninitialized_allocator_relocate( __alloc_, std::__to_address(__old_last), std::__to_address(this->__end_), std::__to_address(__merged.end())); @@ -1344,7 +1344,7 @@ vector<_Tp, _Allocator>::__insert_with_size( __insert_assign_n_unchecked<_AlgPolicy>(std::move(__first), __n, __p); } } else { - __split_buffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_); + __split_buffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_); __v.__construct_at_end_with_size(std::move(__first), __n); __p = __swap_out_circular_buffer(__v, __p); } @@ -1359,7 +1359,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __n if (__new_size <= capacity()) { __construct_at_end(__new_size - __current_size); } else { - __split_buffer __v(__recommend(__new_size), __current_size, __alloc_); + __split_buffer __v(__recommend(__new_size), __current_size, __alloc_); __v.__construct_at_end(__new_size - __current_size); __swap_out_circular_buffer(__v); } @@ -1375,7 +1375,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __n if (__new_size <= capacity()) __construct_at_end(__new_size - __current_size, __x); else { - __split_buffer __v(__recommend(__new_size), __current_size, __alloc_); + __split_buffer __v(__recommend(__new_size), __current_size, __alloc_); __v.__construct_at_end(__new_size - __current_size, __x); __swap_out_circular_buffer(__v); } diff --git a/libcxx/include/deque b/libcxx/include/deque index ad2d759e1fcac..befe27bb4282d 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -1785,9 +1785,9 @@ template template _LIBCPP_HIDE_FROM_ABI typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::__insert_with_sentinel(const_iterator __p, _Iterator __f, _Sentinel __l) { - __split_buffer __buf(__alloc()); + __split_buffer __buf(__alloc()); __buf.__construct_at_end_with_sentinel(std::move(__f), std::move(__l)); - typedef typename __split_buffer::iterator __bi; + typedef typename __split_buffer::iterator __bi; return insert(__p, move_iterator<__bi>(__buf.begin()), move_iterator<__bi>(__buf.end())); } @@ -1802,9 +1802,9 @@ template template _LIBCPP_HIDE_FROM_ABI typename deque<_Tp, _Allocator>::iterator deque<_Tp, 
_Allocator>::__insert_with_size(const_iterator __p, _Iterator __f, size_type __n) { - __split_buffer __buf(__n, 0, __alloc()); + __split_buffer __buf(__n, 0, __alloc()); __buf.__construct_at_end_with_size(__f, __n); - typedef typename __split_buffer::iterator __fwd; + typedef typename __split_buffer::iterator __fwd; return insert(__p, move_iterator<__fwd>(__buf.begin()), move_iterator<__fwd>(__buf.end())); } @@ -1982,7 +1982,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity() { } // Else need to allocate 1 buffer, *and* we need to reallocate __map_. else { - __split_buffer __buf( + __split_buffer __buf( std::max(2 * __map_.capacity(), 1), 0, __map_.__get_allocator()); typedef __allocator_destructor<_Allocator> _Dp; @@ -2042,7 +2042,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity(size_type __n) { // Else need to allocate __nb buffers, *and* we need to reallocate __map_. else { size_type __ds = (__nb + __back_capacity) * __block_size - __map_.empty(); - __split_buffer __buf( + __split_buffer __buf( std::max(2 * __map_.capacity(), __nb + __map_.size()), 0, __map_.__get_allocator()); auto __guard = std::__make_exception_guard([&] { __annotate_delete(); @@ -2094,7 +2094,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity() { } // Else need to allocate 1 buffer, *and* we need to reallocate __map_. else { - __split_buffer __buf( + __split_buffer __buf( std::max(2 * __map_.capacity(), 1), __map_.size(), __map_.__get_allocator()); typedef __allocator_destructor<_Allocator> _Dp; @@ -2154,7 +2154,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity(size_type __n) { // Else need to allocate __nb buffers, *and* we need to reallocate __map_. else { size_type __ds = __front_capacity * __block_size; - __split_buffer __buf( + __split_buffer __buf( std::max(2 * __map_.capacity(), __nb + __map_.size()), __map_.size() - __front_capacity, __map_.__get_allocator()); From db06ebbb9cdf2ff32e1f6900e441cefce23d4624 Mon Sep 17 00:00:00 2001 From: Philip Ginsbach-Chen Date: Thu, 11 Dec 2025 08:42:19 +0000 Subject: [PATCH 32/49] [AArch64][NFC] Add isTRNMask improvements to isZIPMask (#171532) Some [ideas for improvement](https://github.com/llvm/llvm-project/pull/169858#pullrequestreview-3525357470) came up during review of recent changes to `isTRNMask`. This PR applies them also to `isZIPMask`, which is implemented almost identically. --- .../Target/AArch64/AArch64ISelLowering.cpp | 24 ++++++------- .../Target/AArch64/AArch64PerfectShuffle.h | 35 ++++++++++--------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3012343386c07..41caa817c11a4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -31797,12 +31797,12 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( unsigned OperandOrder; if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult, OperandOrder) && - WhichResult == 0) - return convertFromScalableVector( - DAG, VT, - DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, - OperandOrder == 0 ? Op1 : Op2, - OperandOrder == 0 ? Op2 : Op1)); + WhichResult == 0) { + SDValue ZIP = DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, + OperandOrder == 0 ? Op1 : Op2, + OperandOrder == 0 ? 
Op2 : Op1); + return convertFromScalableVector(DAG, VT, ZIP); + } if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult, OperandOrder)) { @@ -31852,12 +31852,12 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult, OperandOrder) && - WhichResult != 0) - return convertFromScalableVector( - DAG, VT, - DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, - OperandOrder == 0 ? Op1 : Op2, - OperandOrder == 0 ? Op2 : Op1)); + WhichResult != 0) { + SDValue ZIP = DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, + OperandOrder == 0 ? Op1 : Op2, + OperandOrder == 0 ? Op2 : Op1); + return convertFromScalableVector(DAG, VT, ZIP); + } if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index c7d6b31291197..12a53aad08aa8 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -6631,43 +6631,44 @@ inline bool isZIPMask(ArrayRef M, unsigned NumElts, if (NumElts % 2 != 0) return false; - // "Variant" refers to the distinction bwetween zip1 and zip2, while - // "Order" refers to sequence of input registers (matching vs flipped). - bool Variant0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0 - bool Variant1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0 - bool Variant0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1 - bool Variant1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1 + // "Result" corresponds to "WhichResultOut", selecting between zip1 and zip2. + // "Order" corresponds to "OperandOrderOut", selecting the order of operands + // for the instruction (flipped or not). + bool Result0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0 + bool Result1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0 + bool Result0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1 + bool Result1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1 // Check all elements match. for (unsigned i = 0; i != NumElts; i += 2) { if (M[i] >= 0) { unsigned EvenElt = (unsigned)M[i]; if (EvenElt != i / 2) - Variant0Order0 = false; + Result0Order0 = false; if (EvenElt != NumElts / 2 + i / 2) - Variant1Order0 = false; + Result1Order0 = false; if (EvenElt != NumElts + i / 2) - Variant0Order1 = false; + Result0Order1 = false; if (EvenElt != NumElts + NumElts / 2 + i / 2) - Variant1Order1 = false; + Result1Order1 = false; } if (M[i + 1] >= 0) { unsigned OddElt = (unsigned)M[i + 1]; if (OddElt != NumElts + i / 2) - Variant0Order0 = false; + Result0Order0 = false; if (OddElt != NumElts + NumElts / 2 + i / 2) - Variant1Order0 = false; + Result1Order0 = false; if (OddElt != i / 2) - Variant0Order1 = false; + Result0Order1 = false; if (OddElt != NumElts / 2 + i / 2) - Variant1Order1 = false; + Result1Order1 = false; } } - if (Variant0Order0 + Variant1Order0 + Variant0Order1 + Variant1Order1 != 1) + if (Result0Order0 + Result1Order0 + Result0Order1 + Result1Order1 != 1) return false; - WhichResultOut = (Variant0Order0 || Variant0Order1) ? 0 : 1; - OperandOrderOut = (Variant0Order0 || Variant1Order0) ? 0 : 1; + WhichResultOut = (Result0Order0 || Result0Order1) ? 0 : 1; + OperandOrderOut = (Result0Order0 || Result1Order0) ? 
0 : 1; return true; } From 59b13d6d9cdc5847af0b26732902a7d4af4707d3 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 11 Dec 2025 09:46:37 +0100 Subject: [PATCH 33/49] [libc++] Add `__find_end` optimizations back (#171374) This essentially reverts #100685 and fixes the bidirectional and random access specializations to be actually used. ``` Benchmark old new Difference % Difference ------------------------------------------------------------ -------------- -------------- ------------ -------------- rng::find_end(deque)_(match_near_end)/1000 366.91 47.63 -319.28 -87.02% rng::find_end(deque)_(match_near_end)/1024 3273.31 35.42 -3237.89 -98.92% rng::find_end(deque)_(match_near_end)/8192 171608.41 285.04 -171323.38 -99.83% rng::find_end(deque)_(near_matches)/1000 31808.40 19214.35 -12594.05 -39.59% rng::find_end(deque)_(near_matches)/1024 37428.72 20773.87 -16654.85 -44.50% rng::find_end(deque)_(near_matches)/8192 1719468.34 1213967.45 -505500.89 -29.40% rng::find_end(deque)_(process_all)/1000 275.81 336.29 60.49 21.93% rng::find_end(deque)_(process_all)/1024 258.88 320.36 61.47 23.74% rng::find_end(deque)_(process_all)/1048576 277117.41 327640.37 50522.96 18.23% rng::find_end(deque)_(process_all)/8192 2166.36 2533.52 367.16 16.95% rng::find_end(deque)_(same_length)/1000 1280.06 362.53 -917.53 -71.68% rng::find_end(deque)_(same_length)/1024 1419.99 417.58 -1002.40 -70.59% rng::find_end(deque)_(same_length)/8192 11363.81 2870.63 -8493.18 -74.74% rng::find_end(deque)_(single_element)/1000 277.22 363.52 86.31 31.13% rng::find_end(deque)_(single_element)/1024 257.11 353.94 96.84 37.66% rng::find_end(deque)_(single_element)/8192 2059.02 2762.29 703.27 34.16% rng::find_end(deque,_pred)_(match_near_end)/1000 696.84 70.07 -626.77 -89.94% rng::find_end(deque,_pred)_(match_near_end)/1024 4774.82 70.75 -4704.07 -98.52% rng::find_end(deque,_pred)_(match_near_end)/8192 267492.37 549.57 -266942.81 -99.79% rng::find_end(deque,_pred)_(near_matches)/1000 39414.88 31070.43 -8344.46 -21.17% rng::find_end(deque,_pred)_(near_matches)/1024 38168.52 32362.18 -5806.34 -15.21% rng::find_end(deque,_pred)_(near_matches)/8192 2594717.16 1938056.79 -656660.38 -25.31% rng::find_end(deque,_pred)_(process_all)/1000 600.88 586.92 -13.96 -2.32% rng::find_end(deque,_pred)_(process_all)/1024 613.00 592.66 -20.33 -3.32% rng::find_end(deque,_pred)_(process_all)/1048576 600059.65 603440.98 3381.33 0.56% rng::find_end(deque,_pred)_(process_all)/8192 4850.32 4764.56 -85.76 -1.77% rng::find_end(deque,_pred)_(same_length)/1000 1514.90 700.34 -814.57 -53.77% rng::find_end(deque,_pred)_(same_length)/1024 1561.14 705.80 -855.34 -54.79% rng::find_end(deque,_pred)_(same_length)/8192 12544.84 5024.45 -7520.39 -59.95% rng::find_end(deque,_pred)_(single_element)/1000 603.79 650.63 46.84 7.76% rng::find_end(deque,_pred)_(single_element)/1024 614.93 656.43 41.50 6.75% rng::find_end(deque,_pred)_(single_element)/8192 4885.89 5225.71 339.82 6.96% rng::find_end(forward_list)_(match_near_end)/1000 770.05 769.32 -0.73 -0.09% rng::find_end(forward_list)_(match_near_end)/1024 4833.13 4733.24 -99.90 -2.07% rng::find_end(forward_list)_(match_near_end)/8192 259324.32 261066.84 1742.52 0.67% rng::find_end(forward_list)_(near_matches)/1000 38301.11 38608.61 307.50 0.80% rng::find_end(forward_list)_(near_matches)/1024 39370.54 39878.59 508.05 1.29% rng::find_end(forward_list)_(near_matches)/8192 2527338.50 2527722.47 383.97 0.02% rng::find_end(forward_list)_(process_all)/1000 713.63 720.74 7.11 1.00% rng::find_end(forward_list)_(process_all)/1024 
727.81 731.60 3.79 0.52% rng::find_end(forward_list)_(process_all)/1048576 757728.47 766470.14 8741.67 1.15% rng::find_end(forward_list)_(process_all)/8192 5821.05 5817.80 -3.25 -0.06% rng::find_end(forward_list)_(same_length)/1000 1458.99 1454.50 -4.49 -0.31% rng::find_end(forward_list)_(same_length)/1024 1507.73 1515.78 8.05 0.53% rng::find_end(forward_list)_(same_length)/8192 20432.32 18658.93 -1773.39 -8.68% rng::find_end(forward_list)_(single_element)/1000 712.41 708.41 -4.00 -0.56% rng::find_end(forward_list)_(single_element)/1024 728.05 728.78 0.73 0.10% rng::find_end(forward_list)_(single_element)/8192 5795.48 6332.88 537.40 9.27% rng::find_end(forward_list,_pred)_(match_near_end)/1000 843.67 846.77 3.10 0.37% rng::find_end(forward_list,_pred)_(match_near_end)/1024 5267.90 5343.84 75.94 1.44% rng::find_end(forward_list,_pred)_(match_near_end)/8192 280912.75 286141.10 5228.35 1.86% rng::find_end(forward_list,_pred)_(near_matches)/1000 43386.35 44489.38 1103.03 2.54% rng::find_end(forward_list,_pred)_(near_matches)/1024 44929.84 45608.55 678.71 1.51% rng::find_end(forward_list,_pred)_(near_matches)/8192 2723281.29 2765369.43 42088.14 1.55% rng::find_end(forward_list,_pred)_(process_all)/1000 763.13 763.85 0.72 0.09% rng::find_end(forward_list,_pred)_(process_all)/1024 796.98 773.40 -23.58 -2.96% rng::find_end(forward_list,_pred)_(process_all)/1048576 858071.76 846166.06 -11905.69 -1.39% rng::find_end(forward_list,_pred)_(process_all)/8192 6282.19 6244.95 -37.24 -0.59% rng::find_end(forward_list,_pred)_(same_length)/1000 1560.18 1583.03 22.86 1.47% rng::find_end(forward_list,_pred)_(same_length)/1024 1603.94 1612.22 8.28 0.52% rng::find_end(forward_list,_pred)_(same_length)/8192 16907.98 15638.35 -1269.63 -7.51% rng::find_end(forward_list,_pred)_(single_element)/1000 746.72 754.08 7.36 0.99% rng::find_end(forward_list,_pred)_(single_element)/1024 761.27 771.75 10.48 1.38% rng::find_end(forward_list,_pred)_(single_element)/8192 6166.83 6687.87 521.04 8.45% rng::find_end(list)_(match_near_end)/1000 793.99 67.06 -726.93 -91.55% rng::find_end(list)_(match_near_end)/1024 4682.12 79.82 -4602.31 -98.30% rng::find_end(list)_(match_near_end)/8192 263187.10 582.64 -262604.46 -99.78% rng::find_end(list)_(near_matches)/1000 38066.70 34687.59 -3379.11 -8.88% rng::find_end(list)_(near_matches)/1024 39721.77 36150.04 -3571.73 -8.99% rng::find_end(list)_(near_matches)/8192 2543369.85 2247297.03 -296072.82 -11.64% rng::find_end(list)_(process_all)/1000 716.89 726.65 9.76 1.36% rng::find_end(list)_(process_all)/1024 742.41 744.05 1.64 0.22% rng::find_end(list)_(process_all)/1048576 822449.08 873801.46 51352.38 6.24% rng::find_end(list)_(process_all)/8192 7704.49 9766.50 2062.02 26.76% rng::find_end(list)_(same_length)/1000 1508.19 710.90 -797.28 -52.86% rng::find_end(list)_(same_length)/1024 1540.23 735.35 -804.88 -52.26% rng::find_end(list)_(same_length)/8192 22786.44 10752.45 -12033.98 -52.81% rng::find_end(list)_(single_element)/1000 699.16 734.76 35.60 5.09% rng::find_end(list)_(single_element)/1024 717.09 750.91 33.82 4.72% rng::find_end(list)_(single_element)/8192 9502.45 10289.21 786.76 8.28% rng::find_end(list,_pred)_(match_near_end)/1000 841.98 83.86 -758.12 -90.04% rng::find_end(list,_pred)_(match_near_end)/1024 5463.71 76.95 -5386.76 -98.59% rng::find_end(list,_pred)_(match_near_end)/8192 287070.76 647.14 -286423.62 -99.77% rng::find_end(list,_pred)_(near_matches)/1000 43878.61 38899.00 -4979.61 -11.35% rng::find_end(list,_pred)_(near_matches)/1024 45672.50 40520.68 -5151.82 -11.28% 
rng::find_end(list,_pred)_(near_matches)/8192 2764800.76 2495879.89 -268920.87 -9.73% rng::find_end(list,_pred)_(process_all)/1000 764.46 774.78 10.32 1.35% rng::find_end(list,_pred)_(process_all)/1024 786.81 793.05 6.24 0.79% rng::find_end(list,_pred)_(process_all)/1048576 934166.34 954637.60 20471.26 2.19% rng::find_end(list,_pred)_(process_all)/8192 9509.24 10209.73 700.49 7.37% rng::find_end(list,_pred)_(same_length)/1000 1545.67 782.96 -762.71 -49.34% rng::find_end(list,_pred)_(same_length)/1024 1580.94 796.87 -784.08 -49.60% rng::find_end(list,_pred)_(same_length)/8192 21558.41 13370.92 -8187.49 -37.98% rng::find_end(list,_pred)_(single_element)/1000 766.49 762.81 -3.68 -0.48% rng::find_end(list,_pred)_(single_element)/1024 784.75 781.47 -3.28 -0.42% rng::find_end(list,_pred)_(single_element)/8192 9722.26 10399.11 676.85 6.96% rng::find_end(vector)_(match_near_end)/1000 267.82 25.34 -242.48 -90.54% rng::find_end(vector)_(match_near_end)/1024 2259.46 25.78 -2233.68 -98.86% rng::find_end(vector)_(match_near_end)/8192 119747.92 214.53 -119533.39 -99.82% rng::find_end(vector)_(near_matches)/1000 16913.73 14102.20 -2811.53 -16.62% rng::find_end(vector)_(near_matches)/1024 16097.97 14767.26 -1330.71 -8.27% rng::find_end(vector)_(near_matches)/8192 1102803.07 823463.30 -279339.78 -25.33% rng::find_end(vector)_(process_all)/1000 233.43 380.28 146.85 62.91% rng::find_end(vector)_(process_all)/1024 238.86 389.32 150.46 62.99% rng::find_end(vector)_(process_all)/1048576 269619.36 391698.75 122079.39 45.28% rng::find_end(vector)_(process_all)/8192 2011.46 3061.40 1049.94 52.20% rng::find_end(vector)_(same_length)/1000 632.19 253.50 -378.69 -59.90% rng::find_end(vector)_(same_length)/1024 556.53 254.87 -301.66 -54.20% rng::find_end(vector)_(same_length)/8192 4597.26 2095.57 -2501.68 -54.42% rng::find_end(vector)_(single_element)/1000 231.57 417.64 186.06 80.35% rng::find_end(vector)_(single_element)/1024 236.41 427.03 190.62 80.63% rng::find_end(vector)_(single_element)/8192 1918.95 3367.29 1448.33 75.48% rng::find_end(vector,_pred)_(match_near_end)/1000 581.49 52.67 -528.82 -90.94% rng::find_end(vector,_pred)_(match_near_end)/1024 3545.40 53.74 -3491.65 -98.48% rng::find_end(vector,_pred)_(match_near_end)/8192 190482.78 432.30 -190050.48 -99.77% rng::find_end(vector,_pred)_(near_matches)/1000 28878.24 24723.01 -4155.23 -14.39% rng::find_end(vector,_pred)_(near_matches)/1024 30035.85 25597.45 -4438.40 -14.78% rng::find_end(vector,_pred)_(near_matches)/8192 1858596.45 1584796.11 -273800.34 -14.73% rng::find_end(vector,_pred)_(process_all)/1000 518.92 813.46 294.53 56.76% rng::find_end(vector,_pred)_(process_all)/1024 531.17 710.20 179.03 33.70% rng::find_end(vector,_pred)_(process_all)/1048576 674064.13 905070.15 231006.01 34.27% rng::find_end(vector,_pred)_(process_all)/8192 4254.34 6372.76 2118.43 49.79% rng::find_end(vector,_pred)_(same_length)/1000 1106.96 526.23 -580.73 -52.46% rng::find_end(vector,_pred)_(same_length)/1024 1133.60 539.70 -593.90 -52.39% rng::find_end(vector,_pred)_(same_length)/8192 8988.10 4302.83 -4685.27 -52.13% rng::find_end(vector,_pred)_(single_element)/1000 528.11 523.69 -4.42 -0.84% rng::find_end(vector,_pred)_(single_element)/1024 539.58 838.49 298.91 55.40% rng::find_end(vector,_pred)_(single_element)/8192 4301.43 7313.22 3011.79 70.02% std::find_end(deque)_(match_near_end)/1000 347.82 38.56 -309.26 -88.91% std::find_end(deque)_(match_near_end)/1024 3340.80 34.54 -3306.27 -98.97% std::find_end(deque)_(match_near_end)/8192 171599.83 281.87 -171317.96 -99.84% 
std::find_end(deque)_(near_matches)/1000 29703.68 19712.27 -9991.41 -33.64% std::find_end(deque)_(near_matches)/1024 32312.41 20008.21 -12304.20 -38.08% std::find_end(deque)_(near_matches)/8192 1851286.99 1216112.34 -635174.65 -34.31% std::find_end(deque)_(process_all)/1000 256.69 315.96 59.27 23.09% std::find_end(deque)_(process_all)/1024 260.97 305.42 44.45 17.03% std::find_end(deque)_(process_all)/1048576 273310.08 309499.13 36189.05 13.24% std::find_end(deque)_(process_all)/8192 2071.33 2606.57 535.25 25.84% std::find_end(deque)_(same_length)/1000 1422.58 441.07 -981.51 -68.99% std::find_end(deque)_(same_length)/1024 1844.27 350.75 -1493.52 -80.98% std::find_end(deque)_(same_length)/8192 14681.69 2839.26 -11842.43 -80.66% std::find_end(deque)_(single_element)/1000 291.63 344.82 53.19 18.24% std::find_end(deque)_(single_element)/1024 257.97 330.19 72.21 27.99% std::find_end(deque)_(single_element)/8192 2220.10 2505.02 284.92 12.83% std::find_end(deque,_pred)_(match_near_end)/1000 694.70 69.60 -625.11 -89.98% std::find_end(deque,_pred)_(match_near_end)/1024 4735.45 71.12 -4664.33 -98.50% std::find_end(deque,_pred)_(match_near_end)/8192 267417.02 561.03 -266855.99 -99.79% std::find_end(deque,_pred)_(near_matches)/1000 42199.71 31597.49 -10602.22 -25.12% std::find_end(deque,_pred)_(near_matches)/1024 38007.49 32362.16 -5645.33 -14.85% std::find_end(deque,_pred)_(near_matches)/8192 2607708.49 1935799.88 -671908.60 -25.77% std::find_end(deque,_pred)_(process_all)/1000 599.65 552.71 -46.94 -7.83% std::find_end(deque,_pred)_(process_all)/1024 615.88 554.17 -61.71 -10.02% std::find_end(deque,_pred)_(process_all)/1048576 598471.63 599441.79 970.16 0.16% std::find_end(deque,_pred)_(process_all)/8192 4853.45 4394.20 -459.25 -9.46% std::find_end(deque,_pred)_(same_length)/1000 1511.68 797.64 -714.04 -47.23% std::find_end(deque,_pred)_(same_length)/1024 1568.63 810.85 -757.78 -48.31% std::find_end(deque,_pred)_(same_length)/8192 12609.34 5092.02 -7517.32 -59.62% std::find_end(deque,_pred)_(single_element)/1000 601.22 628.80 27.58 4.59% std::find_end(deque,_pred)_(single_element)/1024 613.25 627.15 13.89 2.27% std::find_end(deque,_pred)_(single_element)/8192 4823.85 4795.25 -28.60 -0.59% std::find_end(forward_list)_(match_near_end)/1000 762.64 769.74 7.10 0.93% std::find_end(forward_list)_(match_near_end)/1024 4767.93 4840.87 72.94 1.53% std::find_end(forward_list)_(match_near_end)/8192 260275.68 260835.21 559.53 0.21% std::find_end(forward_list)_(near_matches)/1000 38020.76 38197.53 176.77 0.46% std::find_end(forward_list)_(near_matches)/1024 39028.86 39333.38 304.51 0.78% std::find_end(forward_list)_(near_matches)/8192 2524921.48 2523470.32 -1451.16 -0.06% std::find_end(forward_list)_(process_all)/1000 699.95 699.93 -0.02 -0.00% std::find_end(forward_list)_(process_all)/1024 715.24 712.07 -3.17 -0.44% std::find_end(forward_list)_(process_all)/1048576 755926.33 756976.31 1049.98 0.14% std::find_end(forward_list)_(process_all)/8192 5696.72 5672.92 -23.81 -0.42% std::find_end(forward_list)_(same_length)/1000 1485.84 1480.19 -5.65 -0.38% std::find_end(forward_list)_(same_length)/1024 1493.62 1516.95 23.33 1.56% std::find_end(forward_list)_(same_length)/8192 16833.75 13551.42 -3282.33 -19.50% std::find_end(forward_list)_(single_element)/1000 688.87 675.02 -13.85 -2.01% std::find_end(forward_list)_(single_element)/1024 688.89 691.59 2.69 0.39% std::find_end(forward_list)_(single_element)/8192 5735.87 6748.85 1012.98 17.66% std::find_end(forward_list,_pred)_(match_near_end)/1000 836.01 853.28 17.27 2.07% 
std::find_end(forward_list,_pred)_(match_near_end)/1024 5259.92 5299.30 39.39 0.75% std::find_end(forward_list,_pred)_(match_near_end)/8192 279479.85 285593.49 6113.65 2.19% std::find_end(forward_list,_pred)_(near_matches)/1000 42577.60 44550.54 1972.94 4.63% std::find_end(forward_list,_pred)_(near_matches)/1024 44374.19 45697.95 1323.76 2.98% std::find_end(forward_list,_pred)_(near_matches)/8192 2711138.03 2742988.33 31850.30 1.17% std::find_end(forward_list,_pred)_(process_all)/1000 752.03 762.75 10.72 1.43% std::find_end(forward_list,_pred)_(process_all)/1024 767.04 781.48 14.44 1.88% std::find_end(forward_list,_pred)_(process_all)/1048576 843453.35 861838.82 18385.47 2.18% std::find_end(forward_list,_pred)_(process_all)/8192 6241.65 6308.05 66.40 1.06% std::find_end(forward_list,_pred)_(same_length)/1000 2384.18 1589.21 -794.97 -33.34% std::find_end(forward_list,_pred)_(same_length)/1024 2428.97 1617.17 -811.80 -33.42% std::find_end(forward_list,_pred)_(same_length)/8192 16961.22 14972.86 -1988.36 -11.72% std::find_end(forward_list,_pred)_(single_element)/1000 743.31 752.77 9.47 1.27% std::find_end(forward_list,_pred)_(single_element)/1024 763.62 768.70 5.08 0.67% std::find_end(forward_list,_pred)_(single_element)/8192 6189.73 6934.04 744.31 12.02% std::find_end(list)_(match_near_end)/1000 773.76 76.41 -697.35 -90.12% std::find_end(list)_(match_near_end)/1024 4715.36 69.09 -4646.27 -98.53% std::find_end(list)_(match_near_end)/8192 264864.51 584.19 -264280.32 -99.78% std::find_end(list)_(near_matches)/1000 37650.69 35233.45 -2417.24 -6.42% std::find_end(list)_(near_matches)/1024 39239.25 36699.13 -2540.13 -6.47% std::find_end(list)_(near_matches)/8192 2543446.71 2252625.27 -290821.44 -11.43% std::find_end(list)_(process_all)/1000 718.00 724.59 6.59 0.92% std::find_end(list)_(process_all)/1024 735.14 746.70 11.57 1.57% std::find_end(list)_(process_all)/1048576 812620.48 869606.78 56986.30 7.01% std::find_end(list)_(process_all)/8192 8217.98 8462.53 244.55 2.98% std::find_end(list)_(same_length)/1000 1500.85 716.45 -784.39 -52.26% std::find_end(list)_(same_length)/1024 1534.13 736.62 -797.51 -51.98% std::find_end(list)_(same_length)/8192 20274.06 10621.82 -9652.24 -47.61% std::find_end(list)_(single_element)/1000 717.05 725.64 8.60 1.20% std::find_end(list)_(single_element)/1024 732.87 742.44 9.57 1.31% std::find_end(list)_(single_element)/8192 9835.11 11896.39 2061.28 20.96% std::find_end(list,_pred)_(match_near_end)/1000 845.46 75.09 -770.37 -91.12% std::find_end(list,_pred)_(match_near_end)/1024 5301.60 77.14 -5224.46 -98.54% std::find_end(list,_pred)_(match_near_end)/8192 281976.13 648.87 -281327.25 -99.77% std::find_end(list,_pred)_(near_matches)/1000 44076.98 39576.32 -4500.67 -10.21% std::find_end(list,_pred)_(near_matches)/1024 45531.64 41020.11 -4511.54 -9.91% std::find_end(list,_pred)_(near_matches)/8192 2756383.66 2503085.29 -253298.37 -9.19% std::find_end(list,_pred)_(process_all)/1000 766.06 764.48 -1.58 -0.21% std::find_end(list,_pred)_(process_all)/1024 780.35 799.51 19.15 2.45% std::find_end(list,_pred)_(process_all)/1048576 894643.71 898947.94 4304.24 0.48% std::find_end(list,_pred)_(process_all)/8192 8436.41 9977.74 1541.33 18.27% std::find_end(list,_pred)_(same_length)/1000 1545.22 784.29 -760.92 -49.24% std::find_end(list,_pred)_(same_length)/1024 1583.27 808.52 -774.74 -48.93% std::find_end(list,_pred)_(same_length)/8192 21850.99 10896.50 -10954.48 -50.13% std::find_end(list,_pred)_(single_element)/1000 752.03 755.00 2.97 0.39% 
std::find_end(list,_pred)_(single_element)/1024 774.22 784.14 9.92 1.28% std::find_end(list,_pred)_(single_element)/8192 10219.43 10396.49 177.05 1.73% std::find_end(vector)_(match_near_end)/1000 277.37 28.45 -248.91 -89.74% std::find_end(vector)_(match_near_end)/1024 2247.56 25.80 -2221.76 -98.85% std::find_end(vector)_(match_near_end)/8192 119785.10 212.44 -119572.66 -99.82% std::find_end(vector)_(near_matches)/1000 16351.34 14073.13 -2278.21 -13.93% std::find_end(vector)_(near_matches)/1024 16656.33 14654.36 -2001.97 -12.02% std::find_end(vector)_(near_matches)/8192 1181392.88 828918.96 -352473.91 -29.84% std::find_end(vector)_(process_all)/1000 231.14 235.80 4.66 2.01% std::find_end(vector)_(process_all)/1024 235.87 232.06 -3.81 -1.61% std::find_end(vector)_(process_all)/1048576 239922.25 238229.38 -1692.87 -0.71% std::find_end(vector)_(process_all)/8192 1837.43 1802.25 -35.19 -1.91% std::find_end(vector)_(same_length)/1000 632.59 252.80 -379.79 -60.04% std::find_end(vector)_(same_length)/1024 524.51 257.58 -266.94 -50.89% std::find_end(vector)_(same_length)/8192 5159.01 2090.12 -3068.89 -59.49% std::find_end(vector)_(single_element)/1000 229.56 250.47 20.91 9.11% std::find_end(vector)_(single_element)/1024 234.86 252.18 17.32 7.37% std::find_end(vector)_(single_element)/8192 1825.74 1981.90 156.16 8.55% std::find_end(vector,_pred)_(match_near_end)/1000 574.17 52.98 -521.19 -90.77% std::find_end(vector,_pred)_(match_near_end)/1024 3525.35 54.03 -3471.32 -98.47% std::find_end(vector,_pred)_(match_near_end)/8192 190155.81 423.41 -189732.40 -99.78% std::find_end(vector,_pred)_(near_matches)/1000 28541.98 24598.37 -3943.61 -13.82% std::find_end(vector,_pred)_(near_matches)/1024 29696.55 25675.27 -4021.28 -13.54% std::find_end(vector,_pred)_(near_matches)/8192 1846970.41 1596191.84 -250778.57 -13.58% std::find_end(vector,_pred)_(process_all)/1000 519.71 592.14 72.43 13.94% std::find_end(vector,_pred)_(process_all)/1024 529.74 491.07 -38.67 -7.30% std::find_end(vector,_pred)_(process_all)/1048576 631923.41 643729.57 11806.16 1.87% std::find_end(vector,_pred)_(process_all)/8192 4215.05 3909.30 -305.75 -7.25% std::find_end(vector,_pred)_(same_length)/1000 1095.46 524.99 -570.47 -52.08% std::find_end(vector,_pred)_(same_length)/1024 1117.95 537.65 -580.31 -51.91% std::find_end(vector,_pred)_(same_length)/8192 8923.95 4307.13 -4616.83 -51.74% std::find_end(vector,_pred)_(single_element)/1000 516.52 656.32 139.80 27.07% std::find_end(vector,_pred)_(single_element)/1024 528.82 673.72 144.90 27.40% std::find_end(vector,_pred)_(single_element)/8192 4210.37 5529.52 1319.15 31.33% Geomean 6995.43 3440.97 -3554.46 -50.81% ``` --- libcxx/include/__algorithm/find_end.h | 105 ++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/libcxx/include/__algorithm/find_end.h b/libcxx/include/__algorithm/find_end.h index 86b4a3e2e3689..84b43e31a3a59 100644 --- a/libcxx/include/__algorithm/find_end.h +++ b/libcxx/include/__algorithm/find_end.h @@ -76,6 +76,111 @@ _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> } } +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter1> __find_end_impl( + _Iter1 __first1, + _Sent1 __sent1, + _Iter2 __first2, + _Sent2 __sent2, + _Pred& __pred, + _Proj1& __proj1, + _Proj2& __proj2, + bidirectional_iterator_tag, + bidirectional_iterator_tag) { + auto __last1 = _IterOps<_AlgPolicy>::next(__first1, __sent1); + auto __last2 = _IterOps<_AlgPolicy>::next(__first2, __sent2); + // modeled after search algorithm (in 
reverse) + if (__first2 == __last2) + return std::make_pair(__last1, __last1); // Everything matches an empty sequence + _Iter1 __l1 = __last1; + _Iter2 __l2 = __last2; + --__l2; + while (true) { + // Find last element in sequence 1 that matches *(__last2-1), with a minimum of loop checks + while (true) { + if (__first1 == __l1) // return __last1 if no element matches *__first2 + return std::make_pair(__last1, __last1); + if (std::__invoke(__pred, std::__invoke(__proj1, *--__l1), std::__invoke(__proj2, *__l2))) + break; + } + // *__l1 matches *__l2, now match elements before here + _Iter1 __match_last = __l1; + _Iter1 __m1 = __l1; + _Iter2 __m2 = __l2; + while (true) { + if (__m2 == __first2) // If pattern exhausted, __m1 is the answer (works for 1 element pattern) + return std::make_pair(__m1, ++__match_last); + if (__m1 == __first1) // Otherwise if source exhausted, pattern not found + return std::make_pair(__last1, __last1); + + // if there is a mismatch, restart with a new __l1 + if (!std::__invoke(__pred, std::__invoke(__proj1, *--__m1), std::__invoke(__proj2, *--__m2))) { + break; + } // else there is a match, check next elements + } + } +} + +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> __find_end_impl( + _Iter1 __first1, + _Sent1 __sent1, + _Iter2 __first2, + _Sent2 __sent2, + _Pred& __pred, + _Proj1& __proj1, + _Proj2& __proj2, + random_access_iterator_tag, + random_access_iterator_tag) { + typedef typename iterator_traits<_Iter1>::difference_type _D1; + auto __last1 = _IterOps<_AlgPolicy>::next(__first1, __sent1); + auto __last2 = _IterOps<_AlgPolicy>::next(__first2, __sent2); + // Take advantage of knowing source and pattern lengths. Stop short when source is smaller than pattern + auto __len2 = __last2 - __first2; + if (__len2 == 0) + return std::make_pair(__last1, __last1); + auto __len1 = __last1 - __first1; + if (__len1 < __len2) + return std::make_pair(__last1, __last1); + const _Iter1 __s = __first1 + _D1(__len2 - 1); // End of pattern match can't go before here + _Iter1 __l1 = __last1; + _Iter2 __l2 = __last2; + --__l2; + while (true) { + while (true) { + if (__s == __l1) + return std::make_pair(__last1, __last1); + if (std::__invoke(__pred, std::__invoke(__proj1, *--__l1), std::__invoke(__proj2, *__l2))) + break; + } + _Iter1 __last_match = __l1; + _Iter1 __m1 = __l1; + _Iter2 __m2 = __l2; + while (true) { + if (__m2 == __first2) + return std::make_pair(__m1, ++__last_match); + // no need to check range on __m1 because __s guarantees we have enough source + if (!std::__invoke(__pred, std::__invoke(__proj1, *--__m1), std::__invoke(__proj2, *--__m2))) { + break; + } + } + } +} + template [[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator1 __find_end_classic( _ForwardIterator1 __first1, From 6573f62a88319e8f43ea369ff248cd9d54deaa14 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 11 Dec 2025 09:03:05 +0000 Subject: [PATCH 34/49] [X86] LowerATOMIC_STORE - on 32-bit targets see if i64 values were originally legal f64 values that we can store directly. 
(#171602) Based off feedback from #171478 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +- llvm/test/CodeGen/X86/atomic-fp.ll | 668 ++++++++---------------- 2 files changed, 212 insertions(+), 465 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3b3b20edbbe84..ec746843f8ea8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33171,7 +33171,14 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE // is enabled. if (VT == MVT::i64) { - if (Subtarget.hasSSE1()) { + SDValue BCValue = peekThroughBitcasts(Node->getVal()); + if (BCValue.getValueType() == MVT::f64 && + (Subtarget.hasX87() || Subtarget.hasSSE2())) { + // If the i64 was bitcast from a f64 then we can do the f64 atomic store + // directly with FSTPL/MOVSD. + Chain = DAG.getStore(Node->getChain(), dl, BCValue, Node->getBasePtr(), + Node->getMemOperand()); + } else if (Subtarget.hasSSE1()) { SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal()); MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32; diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll index fe79dfe39f645..2dee1d12e7255 100644 --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -80,23 +80,17 @@ define dso_local void @fadd_64r(ptr %loc, double %val) nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: movl 8(%ebp), %eax ; X86-NOSSE-NEXT: fildll (%eax) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: faddl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %ecx, (%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: faddl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -109,16 +103,13 @@ define dso_local void @fadd_64r(ptr %loc, double %val) nounwind { ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: movl 8(%ebp), %eax ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: fstpl (%eax) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -132,9 +123,7 @@ define dso_local void @fadd_64r(ptr %loc, double %val) nounwind { ; X86-SSE2-NEXT: 
movl 8(%ebp), %eax ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: movsd %xmm0, (%eax) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -148,9 +137,7 @@ define dso_local void @fadd_64r(ptr %loc, double %val) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %eax ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: vmovsd %xmm0, (%eax) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -246,22 +233,16 @@ define dso_local void @fadd_64g() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll glob64 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fld1 -; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll glob64 +; X86-NOSSE-NEXT: fld1 +; X86-NOSSE-NEXT: faddl (%esp) +; X86-NOSSE-NEXT: fstpl glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -273,16 +254,13 @@ define dso_local void @fadd_64g() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: faddl (%esp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, glob64 +; X86-SSE1-NEXT: fstpl glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -295,9 +273,7 @@ define dso_local void @fadd_64g() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, glob64 +; X86-SSE2-NEXT: movsd %xmm0, glob64 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -310,9 +286,7 @@ define dso_local void @fadd_64g() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, glob64 +; X86-AVX-NEXT: 
vmovsd %xmm0, glob64 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -409,22 +383,16 @@ define dso_local void @fadd_64imm() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll -559038737 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fld1 -; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll -559038737 +; X86-NOSSE-NEXT: fld1 +; X86-NOSSE-NEXT: faddl (%esp) +; X86-NOSSE-NEXT: fstpl -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -436,16 +404,13 @@ define dso_local void @fadd_64imm() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: faddl (%esp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; X86-SSE1-NEXT: fstpl -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -458,9 +423,7 @@ define dso_local void @fadd_64imm() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, -559038737 +; X86-SSE2-NEXT: movsd %xmm0, -559038737 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -473,9 +436,7 @@ define dso_local void @fadd_64imm() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX-NEXT: vmovsd %xmm0, -559038737 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -577,22 +538,16 @@ define dso_local void @fadd_64stack() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 -; 
X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: faddl (%esp) ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -604,16 +559,13 @@ define dso_local void @fadd_64stack() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: faddl (%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -627,8 +579,6 @@ define dso_local void @fadd_64stack() nounwind { ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -642,8 +592,6 @@ define dso_local void @fadd_64stack() nounwind { ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -677,7 +625,7 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: movl 20(%ebp), %eax ; X86-NOSSE-NEXT: movl 8(%ebp), %ecx ; X86-NOSSE-NEXT: fildll (%ecx,%eax,8) @@ -685,16 +633,10 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: faddl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %edx, (%esp) -; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: faddl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %ebp @@ -709,16 +651,13 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: movl 20(%ebp), %eax ; X86-SSE1-NEXT: movl 8(%ebp), %ecx ; 
X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE1-NEXT: fstpl (%ecx,%eax,8) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -733,9 +672,7 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %ecx ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE2-NEXT: movsd %xmm0, (%ecx,%eax,8) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -750,9 +687,7 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %ecx ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX-NEXT: vmovsd %xmm0, (%ecx,%eax,8) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -852,23 +787,17 @@ define dso_local void @fsub_64r(ptr %loc, double %val) nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: movl 8(%ebp), %eax ; X86-NOSSE-NEXT: fildll (%eax) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fsubl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %ecx, (%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fsubl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -881,16 +810,13 @@ define dso_local void @fsub_64r(ptr %loc, double %val) nounwind { ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: movl 8(%ebp), %eax ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: 
fsubl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: fstpl (%eax) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -904,9 +830,7 @@ define dso_local void @fsub_64r(ptr %loc, double %val) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: subsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: movsd %xmm0, (%eax) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -920,9 +844,7 @@ define dso_local void @fsub_64r(ptr %loc, double %val) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %eax ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vsubsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: vmovsd %xmm0, (%eax) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1018,23 +940,17 @@ define dso_local void @fsub_64g() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll glob64 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 ; X86-NOSSE-NEXT: fchs -; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll glob64 +; X86-NOSSE-NEXT: faddl (%esp) +; X86-NOSSE-NEXT: fstpl glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1046,17 +962,14 @@ define dso_local void @fsub_64g() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fchs ; X86-SSE1-NEXT: faddl (%esp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, glob64 +; X86-SSE1-NEXT: fstpl glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1069,9 +982,7 @@ define dso_local void @fsub_64g() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, glob64 +; X86-SSE2-NEXT: movsd %xmm0, glob64 ; 
X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1084,9 +995,7 @@ define dso_local void @fsub_64g() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, glob64 +; X86-AVX-NEXT: vmovsd %xmm0, glob64 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1184,23 +1093,17 @@ define dso_local void @fsub_64imm() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll -559038737 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 ; X86-NOSSE-NEXT: fchs -; X86-NOSSE-NEXT: faddl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll -559038737 +; X86-NOSSE-NEXT: faddl (%esp) +; X86-NOSSE-NEXT: fstpl -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1212,17 +1115,14 @@ define dso_local void @fsub_64imm() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fchs ; X86-SSE1-NEXT: faddl (%esp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; X86-SSE1-NEXT: fstpl -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1235,9 +1135,7 @@ define dso_local void @fsub_64imm() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, -559038737 +; X86-SSE2-NEXT: movsd %xmm0, -559038737 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1250,9 +1148,7 @@ define dso_local void @fsub_64imm() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX-NEXT: vmovsd %xmm0, -559038737 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1354,22 +1250,16 @@ define dso_local void @fsub_64stack() nounwind { ; 
X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 -; X86-NOSSE-NEXT: fsubl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fsubl (%esp) ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1381,16 +1271,13 @@ define dso_local void @fsub_64stack() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fsubl (%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1405,8 +1292,6 @@ define dso_local void @fsub_64stack() nounwind { ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; X86-SSE2-NEXT: subsd %xmm0, %xmm1 ; X86-SSE2-NEXT: movsd %xmm1, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1421,8 +1306,6 @@ define dso_local void @fsub_64stack() nounwind { ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; X86-AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1456,7 +1339,7 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: movl 20(%ebp), %eax ; X86-NOSSE-NEXT: movl 8(%ebp), %ecx ; X86-NOSSE-NEXT: fildll (%ecx,%eax,8) @@ -1464,16 +1347,10 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fsubl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %edx, (%esp) -; X86-NOSSE-NEXT: movl 
%esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fsubl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %ebp @@ -1488,16 +1365,13 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: movl 20(%ebp), %eax ; X86-SSE1-NEXT: movl 8(%ebp), %ecx ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fsubl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE1-NEXT: fstpl (%ecx,%eax,8) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1512,9 +1386,7 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %ecx ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: subsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE2-NEXT: movsd %xmm0, (%ecx,%eax,8) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1529,9 +1401,7 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %ecx ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vsubsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX-NEXT: vmovsd %xmm0, (%ecx,%eax,8) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1631,23 +1501,17 @@ define dso_local void @fmul_64r(ptr %loc, double %val) nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: movl 8(%ebp), %eax ; X86-NOSSE-NEXT: fildll (%eax) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fmull 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %ecx, (%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fmull 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1660,16 +1524,13 @@ define dso_local void @fmul_64r(ptr %loc, double %val) nounwind { ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: movl 8(%ebp), %eax ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; 
X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmull 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: fstpl (%eax) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1683,9 +1544,7 @@ define dso_local void @fmul_64r(ptr %loc, double %val) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: movsd %xmm0, (%eax) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1699,9 +1558,7 @@ define dso_local void @fmul_64r(ptr %loc, double %val) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %eax ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: vmovsd %xmm0, (%eax) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1794,22 +1651,16 @@ define dso_local void @fmul_64g() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll glob64 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll glob64 +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NOSSE-NEXT: fstpl glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1821,16 +1672,13 @@ define dso_local void @fmul_64g() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: 
movlps %xmm0, glob64 +; X86-SSE1-NEXT: fstpl glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1843,9 +1691,7 @@ define dso_local void @fmul_64g() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, glob64 +; X86-SSE2-NEXT: movsd %xmm0, glob64 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -1858,9 +1704,7 @@ define dso_local void @fmul_64g() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, glob64 +; X86-AVX-NEXT: vmovsd %xmm0, glob64 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -1957,22 +1801,16 @@ define dso_local void @fmul_64imm() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll -559038737 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll -559038737 +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NOSSE-NEXT: fstpl -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -1984,16 +1822,13 @@ define dso_local void @fmul_64imm() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; X86-SSE1-NEXT: fstpl -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2006,9 +1841,7 @@ define dso_local void @fmul_64imm() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, -559038737 +; X86-SSE2-NEXT: movsd %xmm0, -559038737 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2021,9 +1854,7 
@@ define dso_local void @fmul_64imm() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX-NEXT: vmovsd %xmm0, -559038737 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2125,22 +1956,16 @@ define dso_local void @fmul_64stack() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: fldl (%esp) ; X86-NOSSE-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2152,16 +1977,13 @@ define dso_local void @fmul_64stack() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2175,8 +1997,6 @@ define dso_local void @fmul_64stack() nounwind { ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2190,8 +2010,6 @@ define dso_local void @fmul_64stack() nounwind { ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2225,7 +2043,7 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: movl 20(%ebp), %eax ; X86-NOSSE-NEXT: movl 8(%ebp), %ecx ; 
X86-NOSSE-NEXT: fildll (%ecx,%eax,8) @@ -2233,16 +2051,10 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fmull 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %edx, (%esp) -; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fmull 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %ebp @@ -2257,16 +2069,13 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: movl 20(%ebp), %eax ; X86-SSE1-NEXT: movl 8(%ebp), %ecx ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fmull 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE1-NEXT: fstpl (%ecx,%eax,8) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2281,9 +2090,7 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %ecx ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: mulsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE2-NEXT: movsd %xmm0, (%ecx,%eax,8) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2298,9 +2105,7 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %ecx ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vmulsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX-NEXT: vmovsd %xmm0, (%ecx,%eax,8) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2400,23 +2205,17 @@ define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: movl 8(%ebp), %eax ; X86-NOSSE-NEXT: fildll (%eax) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fdivl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl %ecx, (%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%eax) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fdivl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2429,16 +2228,13 @@ define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind { ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: movl 8(%ebp), %eax ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fdivl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%eax) +; X86-SSE1-NEXT: fstpl (%eax) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2452,9 +2248,7 @@ define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: divsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%eax) +; X86-SSE2-NEXT: movsd %xmm0, (%eax) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2468,9 +2262,7 @@ define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %eax ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vdivsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: vmovsd %xmm0, (%eax) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2565,22 +2357,16 @@ define dso_local void @fdiv_64g() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll glob64 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll glob64 +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NOSSE-NEXT: fstpl glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2592,16 +2378,13 @@ define dso_local void @fdiv_64g() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; 
X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, glob64 +; X86-SSE1-NEXT: fstpl glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2614,9 +2397,7 @@ define dso_local void @fdiv_64g() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, glob64 +; X86-SSE2-NEXT: movsd %xmm0, glob64 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2629,9 +2410,7 @@ define dso_local void @fdiv_64g() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, glob64 +; X86-AVX-NEXT: vmovsd %xmm0, glob64 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2728,22 +2507,16 @@ define dso_local void @fdiv_64imm() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $32, %esp +; X86-NOSSE-NEXT: subl $24, %esp ; X86-NOSSE-NEXT: fildll -559038737 ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll -559038737 +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NOSSE-NEXT: fstpl -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2755,16 +2528,13 @@ define dso_local void @fdiv_64imm() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fdivs {{\.?LCPI[0-9]+_[0-9]+}} -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; 
X86-SSE1-NEXT: fstpl -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2777,9 +2547,7 @@ define dso_local void @fdiv_64imm() nounwind { ; X86-SSE2-NEXT: subl $8, %esp ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, -559038737 +; X86-SSE2-NEXT: movsd %xmm0, -559038737 ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2792,9 +2560,7 @@ define dso_local void @fdiv_64imm() nounwind { ; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX-NEXT: vmovsd %xmm0, -559038737 ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2896,22 +2662,16 @@ define dso_local void @fdiv_64stack() nounwind { ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fld1 -; X86-NOSSE-NEXT: fdivl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fdivl (%esp) ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl @@ -2923,16 +2683,13 @@ define dso_local void @fdiv_64stack() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $24, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fdivl (%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -2947,8 +2704,6 @@ define dso_local void @fdiv_64stack() nounwind { ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; X86-SSE2-NEXT: divsd %xmm0, %xmm1 ; X86-SSE2-NEXT: movsd %xmm1, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -2963,8 +2718,6 @@ define dso_local void @fdiv_64stack() nounwind { ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; X86-AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; 
X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl @@ -2998,7 +2751,7 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-8, %esp -; X86-NOSSE-NEXT: subl $40, %esp +; X86-NOSSE-NEXT: subl $32, %esp ; X86-NOSSE-NEXT: movl 20(%ebp), %eax ; X86-NOSSE-NEXT: movl 8(%ebp), %ecx ; X86-NOSSE-NEXT: fildll (%ecx,%eax,8) @@ -3006,16 +2759,10 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fdivl 12(%ebp) -; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOSSE-NEXT: movl %edx, (%esp) -; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: fildll (%esp) -; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) +; X86-NOSSE-NEXT: fldl (%esp) +; X86-NOSSE-NEXT: fdivl 12(%ebp) +; X86-NOSSE-NEXT: fstpl (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %ebp @@ -3030,16 +2777,13 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: movl 20(%ebp), %eax ; X86-SSE1-NEXT: movl 8(%ebp), %ecx ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: fdivl 12(%ebp) -; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE1-NEXT: fstpl (%ecx,%eax,8) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -3054,9 +2798,7 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE2-NEXT: movl 8(%ebp), %ecx ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-NEXT: divsd 12(%ebp), %xmm0 -; X86-SSE2-NEXT: movsd %xmm0, (%esp) -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movlps %xmm0, (%ecx,%eax,8) +; X86-SSE2-NEXT: movsd %xmm0, (%ecx,%eax,8) ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl @@ -3071,9 +2813,7 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind { ; X86-AVX-NEXT: movl 8(%ebp), %ecx ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX-NEXT: vdivsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX-NEXT: vmovsd %xmm0, (%ecx,%eax,8) ; X86-AVX-NEXT: movl %ebp, %esp ; X86-AVX-NEXT: popl %ebp ; X86-AVX-NEXT: retl From 4d335cb5821b02bdc589b8483386a9ef91bbc070 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Thu, 11 Dec 2025 09:21:08 +0000 Subject: [PATCH 35/49] [AArch64] Fix scheduling 
info for Armv8.4-a LDAPUR* instructions (#171637) They were using the wrong scheduler resource. They're also missing from the optimisation guides, but WriteLD should be closer at least. --- .../lib/Target/AArch64/AArch64InstrFormats.td | 2 +- .../Neoverse/N2-rcpc-immo-instructions.s | 38 +++++++++---------- .../Neoverse/N3-rcpc-immo-instructions.s | 38 +++++++++---------- .../Neoverse/V1-rcpc-immo-instructions.s | 38 +++++++++---------- .../Neoverse/V2-rcpc-immo-instructions.s | 38 +++++++++---------- .../Neoverse/V3-rcpc-immo-instructions.s | 38 +++++++++---------- .../Neoverse/V3AE-rcpc-immo-instructions.s | 38 +++++++++---------- 7 files changed, 115 insertions(+), 115 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 4d2e740779961..892b8ee1ed3cb 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -4386,7 +4386,7 @@ multiclass BaseLoadUnscaleV84 sz, bits<2> opc, DAGOperand regtype > { def i : BaseLoadStoreUnscale, - Sched<[WriteST]> { + Sched<[WriteLD]> { let Inst{29} = 0; let Inst{24} = 1; } diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-rcpc-immo-instructions.s index cd3d7e0bf1b57..d9943f342b827 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -41,19 +41,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] -# CHECK-NEXT: - - 6.50 6.50 - 6.50 6.50 - - - - - - +# CHECK-NEXT: - - 2.00 2.00 3.00 5.00 5.00 - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] Instructions: -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapur w7, [x24] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapur x20, [x13] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - 
ldapursw x3, [x18] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlur w3, [x27] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlur x23, [x25] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlurb w30, [x17] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-rcpc-immo-instructions.s index 6faa5e1f4db1b..d5302e96edf4e 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -41,19 +41,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] -# CHECK-NEXT: - - 6.50 6.50 - 6.50 6.50 - - - - - - +# CHECK-NEXT: - - 2.00 2.00 3.00 5.00 5.00 - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] Instructions: -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapur w7, [x24] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapur x20, [x13] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - 
0.33 0.33 0.33 - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - 0.33 0.33 0.33 - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlur w3, [x27] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlur x23, [x25] # CHECK-NEXT: - - 0.50 0.50 - 0.50 0.50 - - - - - - stlurb w30, [x17] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-rcpc-immo-instructions.s index 5c9b43a0e5121..dcea382de5fa9 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -46,19 +46,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7.0] [7.1] [8] [9] [10] [11] -# CHECK-NEXT: - - 6.50 6.50 - - - - 6.50 6.50 - - - - - - - - +# CHECK-NEXT: - - 2.00 2.00 - - - 3.00 5.00 5.00 - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7.0] [7.1] [8] [9] [10] [11] Instructions: -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapur w7, [x24] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapur x20, [x13] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - 
- - ldapursb x29, [x7] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - stlur w3, [x27] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - stlur x23, [x25] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - stlurb w30, [x17] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-rcpc-immo-instructions.s index 71fd689522215..dfcc202192392 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -48,19 +48,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] -# CHECK-NEXT: - - 6.50 6.50 - - - - 6.50 6.50 - - - - - - - - - - +# CHECK-NEXT: - - 2.00 2.00 - - - 3.00 5.00 5.00 - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] Instructions: -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapur w7, [x24] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapur x20, [x13] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursb w7, [x8] +# 
CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - stlur w3, [x27] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - stlur x23, [x25] # CHECK-NEXT: - - 0.50 0.50 - - - - 0.50 0.50 - - - - - - - - - - stlurb w30, [x17] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3-rcpc-immo-instructions.s index a48978ce8b94d..4fff7670058bb 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -53,19 +53,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [0.2] [1.0] [1.1] [2.0] [2.1] [2.2] [2.3] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] [16] [17] -# CHECK-NEXT: - - - 6.50 6.50 - - - - - - 6.50 - - - - - - - - 6.50 - - - - +# CHECK-NEXT: - - - 2.00 2.00 - - - - 3.00 3.00 5.00 - - - - - - - - 2.00 - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [0.2] [1.0] [1.1] [2.0] [2.1] [2.2] [2.3] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] [16] [17] Instructions: -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapur w7, [x24] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapur x20, [x13] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapurb w13, [x17] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapurh w3, [x22] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursb w7, [x8] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursb x29, [x7] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursh w17, [x19] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursh x3, [x3] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - 
ldapur x20, [x13] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - stlur w3, [x27] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - stlur x23, [x25] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - - - stlurb w30, [x17] diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3AE-rcpc-immo-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3AE-rcpc-immo-instructions.s index f801a18bc7a06..dc064d6ea3f3f 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3AE-rcpc-immo-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V3AE-rcpc-immo-instructions.s @@ -10,15 +10,15 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 0.50 * ldapur w7, [x24] -# CHECK-NEXT: 2 1 0.50 * ldapur x20, [x13] -# CHECK-NEXT: 2 1 0.50 * ldapurb w13, [x17] -# CHECK-NEXT: 2 1 0.50 * ldapurh w3, [x22] -# CHECK-NEXT: 2 1 0.50 U ldapursb w7, [x8] -# CHECK-NEXT: 2 1 0.50 U ldapursb x29, [x7] -# CHECK-NEXT: 2 1 0.50 U ldapursh w17, [x19] -# CHECK-NEXT: 2 1 0.50 U ldapursh x3, [x3] -# CHECK-NEXT: 2 1 0.50 U ldapursw x3, [x18] +# CHECK-NEXT: 1 4 0.33 * ldapur w7, [x24] +# CHECK-NEXT: 1 4 0.33 * ldapur x20, [x13] +# CHECK-NEXT: 1 4 0.33 * ldapurb w13, [x17] +# CHECK-NEXT: 1 4 0.33 * ldapurh w3, [x22] +# CHECK-NEXT: 1 4 0.33 U ldapursb w7, [x8] +# CHECK-NEXT: 1 4 0.33 U ldapursb x29, [x7] +# CHECK-NEXT: 1 4 0.33 U ldapursh w17, [x19] +# CHECK-NEXT: 1 4 0.33 U ldapursh x3, [x3] +# CHECK-NEXT: 1 4 0.33 U ldapursw x3, [x18] # CHECK-NEXT: 2 1 0.50 * stlur w3, [x27] # CHECK-NEXT: 2 1 0.50 * stlur x23, [x25] # CHECK-NEXT: 2 1 0.50 * stlurb w30, [x17] @@ -51,19 +51,19 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [0.2] [1.0] [1.1] [2.0] [2.1] [2.2] [2.3] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] -# CHECK-NEXT: - - - 6.50 6.50 - - - - - - 6.50 - - - - - - - - 6.50 - - +# CHECK-NEXT: - - - 2.00 2.00 - - - - 3.00 3.00 5.00 - - - - - - - - 2.00 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [0.2] [1.0] [1.1] [2.0] [2.1] [2.2] [2.3] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] Instructions: -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapur w7, [x24] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapur x20, [x13] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapurb w13, [x17] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapurh w3, [x22] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapursb w7, [x8] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapursb x29, [x7] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapursh w17, [x19] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - 
- 0.50 - - - - - - - - 0.50 - - ldapursh x3, [x3] -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - ldapursw x3, [x18] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapur w7, [x24] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapur x20, [x13] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapurb w13, [x17] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapurh w3, [x22] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursb w7, [x8] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursb x29, [x7] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursh w17, [x19] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursh x3, [x3] +# CHECK-NEXT: - - - - - - - - - 0.33 0.33 0.33 - - - - - - - - - - - ldapursw x3, [x18] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - stlur w3, [x27] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - stlur x23, [x25] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 - - - - - - - - 0.50 - - stlurb w30, [x17] From df6c27e752e17bc68c1c962f159a8fde796071f3 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 11 Dec 2025 10:27:43 +0100 Subject: [PATCH 36/49] [libc++] Make std::allocator always trivially default constructible (#169914) This is technically ABI breaking, since `is_trivial` and `is_trivially_default_constructible` now return different results. However, I don't think that's a significant issue, since `allocator` is almost always used in classes which own memory, making them non-trivial anyways. --- libcxx/docs/ReleaseNotes/22.rst | 4 +++ libcxx/include/__memory/allocator.h | 33 ++++++------------- .../allocator_triviality.compile.pass.cpp | 24 ++++++++++++++ ...triviality.deprecated_abi.compile.pass.cpp | 27 +++++++++++++++ .../allocator_void.trivial.compile.pass.cpp | 26 --------------- .../make_optional_explicit.pass.cpp | 3 -- ...ptional_explicit_initializer_list.pass.cpp | 3 -- 7 files changed, 65 insertions(+), 55 deletions(-) create mode 100644 libcxx/test/libcxx/memory/allocator_triviality.compile.pass.cpp create mode 100644 libcxx/test/libcxx/memory/allocator_triviality.deprecated_abi.compile.pass.cpp delete mode 100644 libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 56eb0e588d81d..f1912668e4013 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -122,5 +122,9 @@ ABI Affecting Changes - ``ranges::iota_view`` is now aware of ``__int128``. This causes ``iota_view::difference_type`` to change from ``long long`` to ``__int128`` in some cases. +- ``std::allocator`` is now trivially default constructible. The behaviour can be reverted by defining + ``_LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR``. Please inform the libc++ team if you need this flag, since it will + be removed in LLVM 24 if there is no evidence that it's required. 
+ Build System Changes -------------------- diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h index 52f4122a9bf5f..1c96a2ab64578 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -14,7 +14,6 @@ #include <__cstddef/ptrdiff_t.h> #include <__cstddef/size_t.h> #include <__memory/addressof.h> -#include <__memory/allocate_at_least.h> #include <__memory/allocator_traits.h> #include <__new/allocate.h> #include <__new/exceptions.h> @@ -51,33 +50,21 @@ class allocator { }; #endif // _LIBCPP_STD_VER <= 17 -// This class provides a non-trivial default constructor to the class that derives from it -// if the condition is satisfied. -// -// The second template parameter exists to allow giving a unique type to __non_trivial_if, -// which makes it possible to avoid breaking the ABI when making this a base class of an -// existing class. Without that, imagine we have classes D1 and D2, both of which used to -// have no base classes, but which now derive from __non_trivial_if. The layout of a class -// that inherits from both D1 and D2 will change because the two __non_trivial_if base -// classes are not allowed to share the same address. -// -// By making those __non_trivial_if base classes unique, we work around this problem and -// it is safe to start deriving from __non_trivial_if in existing classes. -template -struct __non_trivial_if {}; +template +struct __non_trivially_default_constructible_if {}; template -struct __non_trivial_if { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __non_trivial_if() _NOEXCEPT {} +struct __non_trivially_default_constructible_if { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __non_trivially_default_constructible_if() {} }; -// allocator -// -// Note: For ABI compatibility between C++20 and previous standards, we make -// allocator trivial in C++20. - template -class allocator : private __non_trivial_if::value, allocator<_Tp> > { +class allocator +// TODO(LLVM 24): Remove the opt-out +#ifdef _LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR + : __non_trivially_default_constructible_if::value, allocator<_Tp> > +#endif +{ static_assert(!is_const<_Tp>::value, "std::allocator does not support const types"); static_assert(!is_volatile<_Tp>::value, "std::allocator does not support volatile types"); diff --git a/libcxx/test/libcxx/memory/allocator_triviality.compile.pass.cpp b/libcxx/test/libcxx/memory/allocator_triviality.compile.pass.cpp new file mode 100644 index 0000000000000..ff298963e074a --- /dev/null +++ b/libcxx/test/libcxx/memory/allocator_triviality.compile.pass.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +// Make sure that std::allocator is trivial. 
+ +// + +#include +#include +#include + +static_assert(std::is_trivially_default_constructible >::value, ""); +static_assert(std::is_trivially_default_constructible >::value, ""); +static_assert(std::is_trivially_default_constructible >::value, ""); + +static_assert(std::is_trivially_copyable >::value, ""); +static_assert(std::is_trivially_copyable >::value, ""); +static_assert(std::is_trivially_copyable >::value, ""); diff --git a/libcxx/test/libcxx/memory/allocator_triviality.deprecated_abi.compile.pass.cpp b/libcxx/test/libcxx/memory/allocator_triviality.deprecated_abi.compile.pass.cpp new file mode 100644 index 0000000000000..be2a1840ec903 --- /dev/null +++ b/libcxx/test/libcxx/memory/allocator_triviality.deprecated_abi.compile.pass.cpp @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +// Make sure that std::allocator is not trivial if _LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR if defined. +// std::allocator _should_ still be trivial, since it has always been trivial. + +// + +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DEPRECATED_ABI_NON_TRIVIAL_ALLOCATOR + +#include +#include +#include + +static_assert(!std::is_trivially_default_constructible >::value, ""); +static_assert(!std::is_trivially_default_constructible >::value, ""); +static_assert(std::is_trivially_default_constructible >::value, ""); + +static_assert(std::is_trivially_copyable >::value, ""); +static_assert(std::is_trivially_copyable >::value, ""); +static_assert(std::is_trivially_copyable >::value, ""); diff --git a/libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp b/libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp deleted file mode 100644 index b7dfc190e8e91..0000000000000 --- a/libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Make sure that std::allocator is trivial. This was the case before C++20 -// with the std::allocator explicit specialization, and this test makes sure -// that we maintain that property across all standards. -// -// This is important since triviality has implications on how the type is passed -// as a function argument in the ABI. 
- -#include -#include - -typedef std::allocator A1; -struct A2 : std::allocator { }; - -static_assert(std::is_trivially_default_constructible::value, ""); -static_assert(std::is_trivially_copyable::value, ""); - -static_assert(std::is_trivially_default_constructible::value, ""); -static_assert(std::is_trivially_copyable::value, ""); diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp index 5dd1d6f0b3380..b08fce2b701e2 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp @@ -12,9 +12,6 @@ // template // constexpr optional make_optional(Args&&... args); -// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577 -// XFAIL: gcc-15 - #include #include #include diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp index 5ddb229ad9268..80371d6333712 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp @@ -12,9 +12,6 @@ // template // constexpr optional make_optional(initializer_list il, Args&&... args); -// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577 -// XFAIL: gcc-15 - #include #include #include From 57cf1ff8d8a1acaf949d866b52b7c8841a49f46d Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 11 Dec 2025 10:28:58 +0100 Subject: [PATCH 37/49] [libc++] Remove initializer_list specific optimization in __tree (#169413) We've seen in quite a few cases while optimizing `__tree`'s copy construction that `_DetachedTreeCache` is actually quite slow and not necessarily an optimization at all. This patch removes the code, since it's now only used by `operator=(initializer_list)`, which should be quite cold code. We might look into actually optimizing it again in the future, but I doubt an optimization will be small enough compared to the likely speedup in real-world code this would give. 
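For reference, the only remaining user of the removed node-recycling machinery is assignment from an initializer list, which is now expressed as clear-and-reinsert. A minimal illustration of that (cold) code path, my example rather than code from the patch:

```cpp
#include <cassert>
#include <set>

int main() {
  std::set<int> s = {1, 2, 3};
  // Previously this tried to reuse the existing nodes via _DetachedTreeCache;
  // after this change it is equivalent to: s.clear(); s.insert(il.begin(), il.end());
  s = {4, 5, 6};
  assert(s.count(4) == 1 && s.count(1) == 0);
  return 0;
}
```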
--- libcxx/include/__tree | 158 ------------------------------------------ libcxx/include/map | 6 +- libcxx/include/set | 6 +- 3 files changed, 8 insertions(+), 162 deletions(-) diff --git a/libcxx/include/__tree b/libcxx/include/__tree index ceae22bb48702..f8064106de075 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -902,8 +902,6 @@ public: _LIBCPP_HIDE_FROM_ABI __tree& operator=(const __tree& __t); template _LIBCPP_HIDE_FROM_ABI void __assign_unique(_ForwardIterator __first, _ForwardIterator __last); - template - _LIBCPP_HIDE_FROM_ABI void __assign_multi(_InputIterator __first, _InputIterator __last); _LIBCPP_HIDE_FROM_ABI __tree(__tree&& __t) _NOEXCEPT_( is_nothrow_move_constructible<__node_allocator>::value&& is_nothrow_move_constructible::value); _LIBCPP_HIDE_FROM_ABI __tree(__tree&& __t, const allocator_type& __a); @@ -1036,11 +1034,6 @@ public: } } - _LIBCPP_HIDE_FROM_ABI pair __node_assign_unique(const value_type& __v, __node_pointer __dest); - - _LIBCPP_HIDE_FROM_ABI iterator __node_insert_multi(__node_pointer __nd); - _LIBCPP_HIDE_FROM_ABI iterator __node_insert_multi(const_iterator __p, __node_pointer __nd); - template _LIBCPP_HIDE_FROM_ABI void __insert_range_unique(_InIter __first, _Sent __last) { if (__first == __last) @@ -1311,43 +1304,6 @@ private: __lhs = std::forward<_From>(__rhs); } - struct _DetachedTreeCache { - _LIBCPP_HIDE_FROM_ABI explicit _DetachedTreeCache(__tree* __t) _NOEXCEPT - : __t_(__t), - __cache_root_(__detach_from_tree(__t)) { - __advance(); - } - - _LIBCPP_HIDE_FROM_ABI __node_pointer __get() const _NOEXCEPT { return __cache_elem_; } - - _LIBCPP_HIDE_FROM_ABI void __advance() _NOEXCEPT { - __cache_elem_ = __cache_root_; - if (__cache_root_) { - __cache_root_ = __detach_next(__cache_root_); - } - } - - _LIBCPP_HIDE_FROM_ABI ~_DetachedTreeCache() { - __t_->destroy(__cache_elem_); - if (__cache_root_) { - while (__cache_root_->__parent_ != nullptr) - __cache_root_ = static_cast<__node_pointer>(__cache_root_->__parent_); - __t_->destroy(__cache_root_); - } - } - - _DetachedTreeCache(_DetachedTreeCache const&) = delete; - _DetachedTreeCache& operator=(_DetachedTreeCache const&) = delete; - - private: - _LIBCPP_HIDE_FROM_ABI static __node_pointer __detach_from_tree(__tree* __t) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI static __node_pointer __detach_next(__node_pointer) _NOEXCEPT; - - __tree* __t_; - __node_pointer __cache_root_; - __node_pointer __cache_elem_; - }; - class __tree_deleter { __node_allocator& __alloc_; @@ -1486,47 +1442,6 @@ private: } }; -// Precondition: __size_ != 0 -template -typename __tree<_Tp, _Compare, _Allocator>::__node_pointer -__tree<_Tp, _Compare, _Allocator>::_DetachedTreeCache::__detach_from_tree(__tree* __t) _NOEXCEPT { - __node_pointer __cache = static_cast<__node_pointer>(__t->__begin_node_); - __t->__begin_node_ = __t->__end_node(); - __t->__end_node()->__left_->__parent_ = nullptr; - __t->__end_node()->__left_ = nullptr; - __t->__size_ = 0; - // __cache->__left_ == nullptr - if (__cache->__right_ != nullptr) - __cache = static_cast<__node_pointer>(__cache->__right_); - // __cache->__left_ == nullptr - // __cache->__right_ == nullptr - return __cache; -} - -// Precondition: __cache != nullptr -// __cache->left_ == nullptr -// __cache->right_ == nullptr -// This is no longer a red-black tree -template -typename __tree<_Tp, _Compare, _Allocator>::__node_pointer -__tree<_Tp, _Compare, _Allocator>::_DetachedTreeCache::__detach_next(__node_pointer __cache) _NOEXCEPT { - if (__cache->__parent_ == nullptr) - return 
nullptr; - if (std::__tree_is_left_child(static_cast<__node_base_pointer>(__cache))) { - __cache->__parent_->__left_ = nullptr; - __cache = static_cast<__node_pointer>(__cache->__parent_); - if (__cache->__right_ == nullptr) - return __cache; - return static_cast<__node_pointer>(std::__tree_leaf(__cache->__right_)); - } - // __cache is right child - __cache->__parent_unsafe()->__right_ = nullptr; - __cache = static_cast<__node_pointer>(__cache->__parent_); - if (__cache->__left_ == nullptr) - return __cache; - return static_cast<__node_pointer>(std::__tree_leaf(__cache->__left_)); -} - template __tree<_Tp, _Compare, _Allocator>& __tree<_Tp, _Compare, _Allocator>::operator=(const __tree& __t) { if (this == std::addressof(__t)) @@ -1549,46 +1464,6 @@ __tree<_Tp, _Compare, _Allocator>& __tree<_Tp, _Compare, _Allocator>::operator=( return *this; } -template -template -void __tree<_Tp, _Compare, _Allocator>::__assign_unique(_ForwardIterator __first, _ForwardIterator __last) { - using _ITraits = iterator_traits<_ForwardIterator>; - using _ItValueType = typename _ITraits::value_type; - static_assert( - is_same<_ItValueType, value_type>::value, "__assign_unique may only be called with the containers value type"); - static_assert( - __has_forward_iterator_category<_ForwardIterator>::value, "__assign_unique requires a forward iterator"); - if (__size_ != 0) { - _DetachedTreeCache __cache(this); - for (; __cache.__get() != nullptr && __first != __last; ++__first) { - if (__node_assign_unique(*__first, __cache.__get()).second) - __cache.__advance(); - } - } - for (; __first != __last; ++__first) - __emplace_unique(*__first); -} - -template -template -void __tree<_Tp, _Compare, _Allocator>::__assign_multi(_InputIterator __first, _InputIterator __last) { - using _ITraits = iterator_traits<_InputIterator>; - using _ItValueType = typename _ITraits::value_type; - static_assert( - is_same<_ItValueType, value_type>::value, "__assign_multi may only be called with the containers value_type"); - if (__size_ != 0) { - _DetachedTreeCache __cache(this); - for (; __cache.__get() && __first != __last; ++__first) { - __assign_value(__cache.__get()->__get_value(), *__first); - __node_insert_multi(__cache.__get()); - __cache.__advance(); - } - } - const_iterator __e = end(); - for (; __first != __last; ++__first) - __emplace_hint_multi(__e, *__first); -} - template __tree<_Tp, _Compare, _Allocator>::__tree(const __tree& __t) : __begin_node_(__end_node()), @@ -1942,39 +1817,6 @@ __tree<_Tp, _Compare, _Allocator>::__emplace_hint_multi(const_iterator __p, _Arg return iterator(static_cast<__node_pointer>(__h.release())); } -template -pair::iterator, bool> -__tree<_Tp, _Compare, _Allocator>::__node_assign_unique(const value_type& __v, __node_pointer __nd) { - auto [__parent, __child] = __find_equal(__v); - __node_pointer __r = static_cast<__node_pointer>(__child); - bool __inserted = false; - if (__child == nullptr) { - __assign_value(__nd->__get_value(), __v); - __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__nd)); - __r = __nd; - __inserted = true; - } - return pair(iterator(__r), __inserted); -} - -template -typename __tree<_Tp, _Compare, _Allocator>::iterator -__tree<_Tp, _Compare, _Allocator>::__node_insert_multi(__node_pointer __nd) { - __end_node_pointer __parent; - __node_base_pointer& __child = __find_leaf_high(__parent, __nd->__get_value()); - __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__nd)); - return iterator(__nd); -} - -template -typename __tree<_Tp, _Compare, 
_Allocator>::iterator -__tree<_Tp, _Compare, _Allocator>::__node_insert_multi(const_iterator __p, __node_pointer __nd) { - __end_node_pointer __parent; - __node_base_pointer& __child = __find_leaf(__p, __parent, __nd->__get_value()); - __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__nd)); - return iterator(__nd); -} - template typename __tree<_Tp, _Compare, _Allocator>::iterator __tree<_Tp, _Compare, _Allocator>::__remove_node_pointer(__node_pointer __ptr) _NOEXCEPT { diff --git a/libcxx/include/map b/libcxx/include/map index 0dca11cabd12e..e67f7cef5861d 100644 --- a/libcxx/include/map +++ b/libcxx/include/map @@ -1015,7 +1015,8 @@ public: # endif _LIBCPP_HIDE_FROM_ABI map& operator=(initializer_list __il) { - __tree_.__assign_unique(__il.begin(), __il.end()); + clear(); + insert(__il.begin(), __il.end()); return *this; } @@ -1689,7 +1690,8 @@ public: # endif _LIBCPP_HIDE_FROM_ABI multimap& operator=(initializer_list __il) { - __tree_.__assign_multi(__il.begin(), __il.end()); + clear(); + insert(__il.begin(), __il.end()); return *this; } diff --git a/libcxx/include/set b/libcxx/include/set index 3d6f571a42a1a..f333d97defac1 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -692,7 +692,8 @@ public: # endif _LIBCPP_HIDE_FROM_ABI set& operator=(initializer_list __il) { - __tree_.__assign_unique(__il.begin(), __il.end()); + clear(); + insert(__il.begin(), __il.end()); return *this; } @@ -1136,7 +1137,8 @@ public: # endif _LIBCPP_HIDE_FROM_ABI multiset& operator=(initializer_list __il) { - __tree_.__assign_multi(__il.begin(), __il.end()); + clear(); + insert(__il.begin(), __il.end()); return *this; } From d15ff5980bf007d2d26258d65fdae4484efd273f Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 11 Dec 2025 10:32:24 +0100 Subject: [PATCH 38/49] [libc++] Merge the segmented iterator code for {copy,move}_backward (#165160) This removes a bit of code duplication and might simplify future segmented iterator optimitations. 
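As a usage sketch (mine, not from the patch): `std::deque` iterators are segmented iterators in libc++, so backward copies and moves between deques go through the specialization touched here, which now walks the segments back to front via the shared helper.

```cpp
#include <algorithm>
#include <deque>

int main() {
  std::deque<int> src(1000, 1);
  std::deque<int> dst(1000, 0);
  // Both calls dispatch on the segmented input iterator and now iterate the
  // segments through __for_each_segment_backward instead of two hand-rolled
  // copies of the same loop.
  std::copy_backward(src.begin(), src.end(), dst.end());
  std::move_backward(src.begin(), src.end(), dst.end());
  return 0;
}
```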
--- libcxx/include/__algorithm/copy_backward.h | 26 ++++--------------- libcxx/include/__algorithm/for_each_segment.h | 26 +++++++++++++++++++ libcxx/include/__algorithm/move_backward.h | 26 ++++--------------- 3 files changed, 36 insertions(+), 42 deletions(-) diff --git a/libcxx/include/__algorithm/copy_backward.h b/libcxx/include/__algorithm/copy_backward.h index 6c9eba672e154..8758d2c9e7b5d 100644 --- a/libcxx/include/__algorithm/copy_backward.h +++ b/libcxx/include/__algorithm/copy_backward.h @@ -11,6 +11,7 @@ #include <__algorithm/copy_move_common.h> #include <__algorithm/copy_n.h> +#include <__algorithm/for_each_segment.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> @@ -173,27 +174,10 @@ struct __copy_backward_impl { template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { - using _Traits = __segmented_iterator_traits<_InIter>; - auto __sfirst = _Traits::__segment(__first); - auto __slast = _Traits::__segment(__last); - if (__sfirst == __slast) { - auto __iters = - std::__copy_backward<_AlgPolicy>(_Traits::__local(__first), _Traits::__local(__last), std::move(__result)); - return std::make_pair(__last, __iters.second); - } - - __result = - std::__copy_backward<_AlgPolicy>(_Traits::__begin(__slast), _Traits::__local(__last), std::move(__result)) - .second; - --__slast; - while (__sfirst != __slast) { - __result = - std::__copy_backward<_AlgPolicy>(_Traits::__begin(__slast), _Traits::__end(__slast), std::move(__result)) - .second; - --__slast; - } - __result = std::__copy_backward<_AlgPolicy>(_Traits::__local(__first), _Traits::__end(__slast), std::move(__result)) - .second; + using __local_iterator = typename __segmented_iterator_traits<_InIter>::__local_iterator; + std::__for_each_segment_backward(__first, __last, [&__result](__local_iterator __lfirst, __local_iterator __llast) { + __result = std::__copy_backward<_AlgPolicy>(std::move(__lfirst), std::move(__llast), std::move(__result)).second; + }); return std::make_pair(__last, std::move(__result)); } diff --git a/libcxx/include/__algorithm/for_each_segment.h b/libcxx/include/__algorithm/for_each_segment.h index 93aa8259b2f7f..c02436c9aa33c 100644 --- a/libcxx/include/__algorithm/for_each_segment.h +++ b/libcxx/include/__algorithm/for_each_segment.h @@ -48,6 +48,32 @@ __for_each_segment(_SegmentedIterator __first, _SegmentedIterator __last, _Funct __func(_Traits::__begin(__sfirst), _Traits::__local(__last)); } +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +__for_each_segment_backward(_SegmentedIterator __first, _SegmentedIterator __last, _Functor __func) { + using _Traits = __segmented_iterator_traits<_SegmentedIterator>; + + auto __sfirst = _Traits::__segment(__first); + auto __slast = _Traits::__segment(__last); + + // We are in a single segment, so we might not be at the beginning or end + if (__sfirst == __slast) { + __func(_Traits::__local(__first), _Traits::__local(__last)); + return; + } + + // We have more than one segment. 
Iterate over the last segment, since we might not start at the end + __func(_Traits::__begin(__slast), _Traits::__local(__last)); + --__slast; + // iterate over the segments which are guaranteed to be completely in the range + while (__sfirst != __slast) { + __func(_Traits::__begin(__slast), _Traits::__end(__slast)); + --__slast; + } + // iterate over the first segment + __func(_Traits::__local(__first), _Traits::__end(__slast)); +} + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___ALGORITHM_FOR_EACH_SEGMENT_H diff --git a/libcxx/include/__algorithm/move_backward.h b/libcxx/include/__algorithm/move_backward.h index a4698327b474d..43b72057a5eca 100644 --- a/libcxx/include/__algorithm/move_backward.h +++ b/libcxx/include/__algorithm/move_backward.h @@ -11,6 +11,7 @@ #include <__algorithm/copy_backward.h> #include <__algorithm/copy_move_common.h> +#include <__algorithm/for_each_segment.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> @@ -54,27 +55,10 @@ struct __move_backward_impl { template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { - using _Traits = __segmented_iterator_traits<_InIter>; - auto __sfirst = _Traits::__segment(__first); - auto __slast = _Traits::__segment(__last); - if (__sfirst == __slast) { - auto __iters = - std::__move_backward<_AlgPolicy>(_Traits::__local(__first), _Traits::__local(__last), std::move(__result)); - return std::make_pair(__last, __iters.second); - } - - __result = - std::__move_backward<_AlgPolicy>(_Traits::__begin(__slast), _Traits::__local(__last), std::move(__result)) - .second; - --__slast; - while (__sfirst != __slast) { - __result = - std::__move_backward<_AlgPolicy>(_Traits::__begin(__slast), _Traits::__end(__slast), std::move(__result)) - .second; - --__slast; - } - __result = std::__move_backward<_AlgPolicy>(_Traits::__local(__first), _Traits::__end(__slast), std::move(__result)) - .second; + using __local_iterator = typename __segmented_iterator_traits<_InIter>::__local_iterator; + std::__for_each_segment_backward(__first, __last, [&__result](__local_iterator __lfirst, __local_iterator __llast) { + __result = std::__move_backward<_AlgPolicy>(std::move(__lfirst), std::move(__llast), std::move(__result)).second; + }); return std::make_pair(__last, std::move(__result)); } From 86f8445293934b1730b8024ffacf14f20387a0fc Mon Sep 17 00:00:00 2001 From: Kashika Akhouri Date: Thu, 11 Dec 2025 15:23:01 +0530 Subject: [PATCH 39/49] [LifetimeSafety] Infer [[clang::lifetimebound]] annotation (#171081) Adding Annotation Inference in Lifetime Analysis. This PR implicitly adds lifetime bound annotations to the AST which is then used by functions which are parsed later to detect UARs etc. Example: ```cpp std::string_view f1(std::string_view a) { return a; } std::string_view f2(std::string_view a) { return f1(a); } std::string_view ff(std::string_view a) { std::string stack = "something on stack"; return f2(stack); // warning: address of stack memory is returned } ``` Note: 1. We only add lifetime bound annotations to the functions being analyzed currently. 2. Currently, both annotation suggestion and inference work simultaneously. This can be modified based on requirements. 3. The current approach works given that functions are already present in the correct order (callee-before-caller). For not so ideal cases, we can create a CallGraph prior to calling the analysis. This can be done in the next PR. 
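Concretely, with the flags used in the new test's RUN line (`-fexperimental-lifetime-safety -fexperimental-lifetime-safety-inference`), the `f1`/`f2` chain above is analyzed as if it had been annotated by hand. A rough hand-written equivalent of the attributes the pass attaches implicitly (illustration only):

```cpp
#include <string_view>

// What the inferred LifetimeBoundAttr amounts to for the example above; the
// pass adds these as implicit attributes rather than requiring the spelling.
std::string_view f1(std::string_view a [[clang::lifetimebound]]) { return a; }
std::string_view f2(std::string_view a [[clang::lifetimebound]]) { return f1(a); }
```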
--- clang/include/clang/Basic/LangOptions.def | 2 + clang/include/clang/Options/Options.td | 8 ++ clang/lib/Analysis/LifetimeSafety/Checker.cpp | 22 ++++- .../Sema/warn-lifetime-safety-suggestions.cpp | 94 ++++++++++++++++++- 4 files changed, 124 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 093d2709e59f9..891c9c05afc87 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -501,6 +501,8 @@ LANGOPT(BoundsSafety, 1, 0, NotCompatible, "Bounds safety extension for C") LANGOPT(EnableLifetimeSafety, 1, 0, NotCompatible, "Experimental lifetime safety analysis for C++") +LANGOPT(EnableLifetimeSafetyInference, 1, 0, NotCompatible, "Experimental lifetime safety inference analysis for C++") + LANGOPT(PreserveVec3Type, 1, 0, NotCompatible, "Preserve 3-component vector type") #undef LANGOPT diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index e704d9e6275ec..24b63438e22fc 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -1964,6 +1964,14 @@ defm lifetime_safety : BoolFOption< BothFlags<[], [CC1Option], " experimental lifetime safety for C++">>; +defm lifetime_safety_inference + : BoolFOption<"experimental-lifetime-safety-inference", + LangOpts<"EnableLifetimeSafetyInference">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], [CC1Option], + " experimental lifetime safety inference for C++">>; + defm addrsig : BoolFOption<"addrsig", CodeGenOpts<"Addrsig">, DefaultFalse, PosFlag, diff --git a/clang/lib/Analysis/LifetimeSafety/Checker.cpp b/clang/lib/Analysis/LifetimeSafety/Checker.cpp index 74792768e2c57..99071d6b46c1e 100644 --- a/clang/lib/Analysis/LifetimeSafety/Checker.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Checker.cpp @@ -55,13 +55,14 @@ class LifetimeChecker { const LiveOriginsAnalysis &LiveOrigins; const FactManager &FactMgr; LifetimeSafetyReporter *Reporter; + ASTContext &AST; public: LifetimeChecker(const LoanPropagationAnalysis &LoanPropagation, const LiveOriginsAnalysis &LiveOrigins, const FactManager &FM, AnalysisDeclContext &ADC, LifetimeSafetyReporter *Reporter) : LoanPropagation(LoanPropagation), LiveOrigins(LiveOrigins), FactMgr(FM), - Reporter(Reporter) { + Reporter(Reporter), AST(ADC.getASTContext()) { for (const CFGBlock *B : *ADC.getAnalysis()) for (const Fact *F : FactMgr.getFacts(B)) if (const auto *EF = F->getAs()) @@ -70,6 +71,11 @@ class LifetimeChecker { checkAnnotations(OEF); issuePendingWarnings(); suggestAnnotations(); + // Annotation inference is currently guarded by a frontend flag. In the + // future, this might be replaced by a design that differentiates between + // explicit and inferred findings with separate warning groups. + if (AST.getLangOpts().EnableLifetimeSafetyInference) + inferAnnotations(); } /// Checks if an escaping origin holds a placeholder loan, indicating a @@ -160,6 +166,20 @@ class LifetimeChecker { for (const auto &[PVD, EscapeExpr] : AnnotationWarningsMap) Reporter->suggestAnnotation(PVD, EscapeExpr); } + + void inferAnnotations() { + // FIXME: To maximise inference propagation, functions should be analyzed in + // post-order of the call graph, allowing inferred annotations to propagate + // through the call chain + // FIXME: Add the inferred attribute to all redeclarations of the function, + // not just the definition being analyzed. 
+ for (const auto &[ConstPVD, EscapeExpr] : AnnotationWarningsMap) { + ParmVarDecl *PVD = const_cast(ConstPVD); + if (!PVD->hasAttr()) + PVD->addAttr( + LifetimeBoundAttr::CreateImplicit(AST, PVD->getLocation())); + } + } }; } // namespace diff --git a/clang/test/Sema/warn-lifetime-safety-suggestions.cpp b/clang/test/Sema/warn-lifetime-safety-suggestions.cpp index c0f675a301d14..9f3ccb7fca770 100644 --- a/clang/test/Sema/warn-lifetime-safety-suggestions.cpp +++ b/clang/test/Sema/warn-lifetime-safety-suggestions.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -fexperimental-lifetime-safety -Wexperimental-lifetime-safety-suggestions -verify %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-lifetime-safety -fexperimental-lifetime-safety-inference -Wexperimental-lifetime-safety-suggestions -Wexperimental-lifetime-safety -verify %s struct MyObj { int id; @@ -89,6 +89,98 @@ void test_getView_on_temporary() { (void)sv; } +//===----------------------------------------------------------------------===// +// Annotation Inference Test Cases +//===----------------------------------------------------------------------===// + +namespace correct_order_inference { +View return_view_by_func (View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return return_view_directly(a); // expected-note {{param returned here}} +} + +MyObj* return_pointer_by_func (MyObj* a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return return_pointer_object(a); // expected-note {{param returned here}} +} +} // namespace correct_order_inference + +namespace incorrect_order_inference_view { +View return_view_callee(View a); + +// FIXME: No lifetime annotation suggestion when functions are not present in the callee-before-caller pattern +View return_view_caller(View a) { + return return_view_callee(a); +} + +View return_view_callee(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return a; // expected-note {{param returned here}} +} +} // namespace incorrect_order_inference_view + +namespace incorrect_order_inference_object { +MyObj* return_object_callee(MyObj* a); + +// FIXME: No lifetime annotation suggestion warning when functions are not present in the callee-before-caller pattern +MyObj* return_object_caller(MyObj* a) { + return return_object_callee(a); +} + +MyObj* return_object_callee(MyObj* a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return a; // expected-note {{param returned here}} +} +} // namespace incorrect_order_inference_object + +namespace simple_annotation_inference { +View inference_callee_return_identity(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return a; // expected-note {{param returned here}} +} + +View inference_caller_forwards_callee(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return inference_callee_return_identity(a); // expected-note {{param returned here}} +} + +View inference_top_level_return_stack_view() { + MyObj local_stack; + return inference_caller_forwards_callee(local_stack); // expected-warning {{address of stack memory is returned later}} + // expected-note@-1 {{returned here}} +} +} // namespace simple_annotation_inference + +namespace inference_in_order_with_redecls { +View inference_callee_return_identity(View a); +View inference_callee_return_identity(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. 
+ return a; // expected-note {{param returned here}} +} + +View inference_caller_forwards_callee(View a); +View inference_caller_forwards_callee(View a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return inference_callee_return_identity(a); // expected-note {{param returned here}} +} + +View inference_top_level_return_stack_view() { + MyObj local_stack; + return inference_caller_forwards_callee(local_stack); // expected-warning {{address of stack memory is returned later}} + // expected-note@-1 {{returned here}} +} +} // namespace inference_in_order_with_redecls + +namespace inference_with_templates { +template +T* template_identity(T* a) { // expected-warning {{param should be marked [[clang::lifetimebound]]}}. + return a; // expected-note {{param returned here}} +} + +template +T* template_caller(T* a) { + return template_identity(a); // expected-note {{in instantiation of function template specialization 'inference_with_templates::template_identity' requested here}} +} + +// FIXME: Fails to detect UAR as template instantiations are deferred to the end of the Translation Unit. +MyObj* test_template_inference_with_stack() { + MyObj local_stack; + return template_caller(&local_stack); // expected-note {{in instantiation of function template specialization 'inference_with_templates::template_caller' requested here}} +} +} // namespace inference_with_templates + //===----------------------------------------------------------------------===// // Negative Test Cases //===----------------------------------------------------------------------===// From 33fcfb37dad3dba7307186d0adeb58e2d825cce8 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 11 Dec 2025 10:03:43 +0000 Subject: [PATCH 40/49] [flang][TBAA] refine TARGET/POINTER encoding (#170908) Depends upon https://github.com/llvm/llvm-project/pull/170900 Re-land https://github.com/llvm/llvm-project/pull/169544 Previously we were less specific for POINTER/TARGET: encoding that they could alias with (almost) anything. In the new system, the "target data" tree is now a sibling of the other trees (e.g. "global data"). POITNTER variables go at the root of the "target data" tree, whereas TARGET variables get their own nodes under that tree. For example, ``` integer, pointer :: ip real, pointer :: rp integer, target :: it integer, target :: it2(:) real, target :: rt integer :: i real :: r ``` - `ip` and `rp` may alias with any variable except `i` and `r`. - `it`, `it2`, and `rt` may alias only with `ip` or `rp`. - `i` and `r` cannot alias with any other variable. Fortran 2023 15.5.2.14 gives restrictions on entities associated with dummy arguments. These do not allow non-target globals to be modified through dummy arguments and therefore I don't think we need to make all globals alias with dummy arguments. I haven't implemented it in this patch, but I wonder whether it is ever possible for `ip` to alias with `rt`. While I was updating the tests I fixed up some tests that still assumed that local alloc tbaa wasn't the default. Cray pointers/pointees are (optionally) modelled as aliasing with all non-descriptor data. This is not enabled by default. I found no functional regressions in the gfortran test suite. 
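To make the new encoding concrete, a small sketch (my example and variable names, not from the patch): accesses through the POINTER dummy are tagged at the "target data" root, the TARGET local gets its own node under that tree, and the plain local sits under "allocated data", so only the TARGET access must still be assumed to alias the pointer.

```fortran
subroutine demo(p, n)
  integer, pointer :: p(:)
  integer :: n, i
  integer, target :: tgt(100)   ! own node under "target data": may alias p
  integer :: plain(100)         ! under "allocated data": disjoint from p
  tgt = 0
  plain = 0
  do i = 1, n
    p(i) = i       ! tagged at the "target data" root
    tgt(i) = i     ! may still alias p(i)
    plain(i) = i   ! provably independent of p(i)
  end do
end subroutine
```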
--- .../flang/Optimizer/Analysis/TBAAForest.h | 24 +++-- flang/lib/Optimizer/Analysis/TBAAForest.cpp | 9 +- .../lib/Optimizer/Transforms/AddAliasTags.cpp | 59 +++++++++--- flang/test/Driver/tco-test-gen.fir | 8 +- flang/test/Fir/tbaa-codegen2.fir | 1 - flang/test/Transforms/tbaa-cray-pointer.fir | 43 +++++++++ .../test/Transforms/tbaa-for-common-vars.fir | 78 ++++++++++++---- .../Transforms/tbaa-for-global-equiv-vars.fir | 6 +- flang/test/Transforms/tbaa-for-local-vars.fir | 32 ++++--- .../test/Transforms/tbaa-with-dummy-scope.fir | 22 +++-- .../Transforms/tbaa-with-dummy-scope2.fir | 32 ++++--- flang/test/Transforms/tbaa2.fir | 20 ++--- flang/test/Transforms/tbaa3.fir | 89 +++++++++---------- flang/test/Transforms/tbaa4.fir | 32 +++---- 14 files changed, 285 insertions(+), 170 deletions(-) create mode 100644 flang/test/Transforms/tbaa-cray-pointer.fir diff --git a/flang/include/flang/Optimizer/Analysis/TBAAForest.h b/flang/include/flang/Optimizer/Analysis/TBAAForest.h index b4932594114a1..0b70778eba3af 100644 --- a/flang/include/flang/Optimizer/Analysis/TBAAForest.h +++ b/flang/include/flang/Optimizer/Analysis/TBAAForest.h @@ -99,11 +99,25 @@ struct TBAATree { // |- "any data access" // | // |- "dummy arg data" - // |- "target data" - // | - // |- "allocated data" - // |- "direct data" - // |- "global data" + // | + // |- + // |- + // |- "target data" <-- Any POINTER variable or TARGET dummy arg + // | + // |- <--- any TARGET variable which isn't a dummy arg + // |- + // |- "allocated data" + // | + // |- + // |- + // |- "direct data" + // | + // |- + // |- + // |- "global data" + // | + // |- + // |- static TBAATree buildTree(mlir::StringAttr functionName); private: diff --git a/flang/lib/Optimizer/Analysis/TBAAForest.cpp b/flang/lib/Optimizer/Analysis/TBAAForest.cpp index 44a0348da3a6f..7154785c62c75 100644 --- a/flang/lib/Optimizer/Analysis/TBAAForest.cpp +++ b/flang/lib/Optimizer/Analysis/TBAAForest.cpp @@ -66,12 +66,9 @@ fir::TBAATree::TBAATree(mlir::LLVM::TBAATypeDescriptorAttr anyAccess, mlir::LLVM::TBAATypeDescriptorAttr dataRoot, mlir::LLVM::TBAATypeDescriptorAttr boxMemberTypeDesc) : targetDataTree(dataRoot.getContext(), "target data", dataRoot), - globalDataTree(dataRoot.getContext(), "global data", - targetDataTree.getRoot()), - allocatedDataTree(dataRoot.getContext(), "allocated data", - targetDataTree.getRoot()), + globalDataTree(dataRoot.getContext(), "global data", dataRoot), + allocatedDataTree(dataRoot.getContext(), "allocated data", dataRoot), dummyArgDataTree(dataRoot.getContext(), "dummy arg data", dataRoot), - directDataTree(dataRoot.getContext(), "direct data", - targetDataTree.getRoot()), + directDataTree(dataRoot.getContext(), "direct data", dataRoot), anyAccessDesc(anyAccess), boxMemberTypeDesc(boxMemberTypeDesc), anyDataTypeDesc(dataRoot) {} diff --git a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp index 0221c7a8184d7..558ffa1a80bcf 100644 --- a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp +++ b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp @@ -60,6 +60,9 @@ static llvm::cl::opt localAllocsThreshold( llvm::cl::desc("If present, stops generating TBAA tags for accesses of " "local allocations after N accesses in a module")); +// Defined in AliasAnalysis.cpp +extern llvm::cl::opt supportCrayPointers; + namespace { // Return the size and alignment (in bytes) for the given type. 
@@ -668,6 +671,7 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, LLVM_DEBUG(llvm::dbgs() << "Analysing " << op << "\n"); const fir::AliasAnalysis::Source &source = state.getSource(memref); + LLVM_DEBUG(llvm::dbgs() << "Got source " << source << "\n"); // Process the scopes, if not processed yet. state.processFunctionScopes(func); @@ -686,14 +690,22 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, } mlir::LLVM::TBAATagAttr tag; - // TBAA for dummy arguments - if (enableDummyArgs && - source.kind == fir::AliasAnalysis::SourceKind::Argument) { + // Cray pointer/pointee is a special case. These might alias with any data. + if (supportCrayPointers && source.isCrayPointerOrPointee()) { + LLVM_DEBUG(llvm::dbgs().indent(2) + << "Found reference to Cray pointer/pointee at " << *op << "\n"); + mlir::LLVM::TBAATypeDescriptorAttr anyDataDesc = + state.getFuncTreeWithScope(func, scopeOp).anyDataTypeDesc; + tag = mlir::LLVM::TBAATagAttr::get(anyDataDesc, anyDataDesc, /*offset=*/0); + // TBAA for dummy arguments + } else if (enableDummyArgs && + source.kind == fir::AliasAnalysis::SourceKind::Argument) { LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to dummy argument at " << *op << "\n"); std::string name = getFuncArgName(llvm::cast(source.origin.u)); - // If it is a TARGET or POINTER, then we do not care about the name, - // because the tag points to the root of the subtree currently. + // POINTERS can alias with any POINTER or TARGET. Assume that TARGET dummy + // arguments might alias with each other (because of the "TARGET" hole for + // dummy arguments). See flang/docs/Aliasing.md. if (source.isTargetOrPointer()) { tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); } else if (!name.empty()) { @@ -715,13 +727,10 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to global " << globalName.str() << " at " << *op << "\n"); - if (source.isPointer()) { - tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); - } else { - // In general, place the tags under the "global data" root. - fir::TBAATree::SubtreeState *subTree = - &state.getMutableFuncTreeWithScope(func, scopeOp).globalDataTree; + // Add a named tag inside the given subtree, disambiguating members of a + // common block + auto addTagUsingStorageDesc = [&](fir::TBAATree::SubtreeState *subTree) { mlir::Operation *instantiationPoint = source.origin.instantiationPoint; auto storageIface = mlir::dyn_cast_or_null( @@ -766,6 +775,19 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, LLVM_DEBUG(llvm::dbgs() << "Tagged under '" << globalName << "' root\n"); } + }; + + if (source.isPointer()) { + // Pointers can alias with any pointer or target. + tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); + } else if (source.isTarget()) { + // Targets could alias with any pointer but not with each other. + addTagUsingStorageDesc( + &state.getMutableFuncTreeWithScope(func, scopeOp).targetDataTree); + } else { + // In general, place the tags under the "global data" root. 
+ addTagUsingStorageDesc( + &state.getMutableFuncTreeWithScope(func, scopeOp).globalDataTree); } // TBAA for global variables with descriptors @@ -776,9 +798,17 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, const char *name = glbl.getRootReference().data(); LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to direct " << name << " at " << *op << "\n"); + // Pointer can alias with any pointer or target so that gets the root. if (source.isPointer()) tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); + // Targets could alias with any pointer but not with each other so they + // get their own node inside of the target data tree. + else if (source.isTarget()) + tag = state.getFuncTreeWithScope(func, scopeOp) + .targetDataTree.getTag(name); else + // Boxes that are not pointers or targets cannot alias with those that + // are. Put them under global data. tag = state.getFuncTreeWithScope(func, scopeOp) .directDataTree.getTag(name); } else { @@ -815,8 +845,13 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, << "\n"); } else if (source.isPointer() && state.attachLocalAllocTag()) { LLVM_DEBUG(llvm::dbgs().indent(2) - << "Found reference to allocation at " << *op << "\n"); + << "Found reference to POINTER allocation at " << *op << "\n"); tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); + } else if (source.isTarget() && state.attachLocalAllocTag()) { + LLVM_DEBUG(llvm::dbgs().indent(2) + << "Found reference to TARGET allocation at " << *op << "\n"); + tag = state.getFuncTreeWithScope(func, scopeOp) + .targetDataTree.getTag(*name); } else if (name && state.attachLocalAllocTag()) { LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to allocation " << name << " at " << *op << "\n"); diff --git a/flang/test/Driver/tco-test-gen.fir b/flang/test/Driver/tco-test-gen.fir index b39295d72918f..438804ce42b76 100644 --- a/flang/test/Driver/tco-test-gen.fir +++ b/flang/test/Driver/tco-test-gen.fir @@ -77,13 +77,13 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: llvm.cond_br %[[VAL_17]], ^bb2, ^bb3 // CHECK: ^bb2: -// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr +// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr // NOAA: llvm.store %[[VAL_15]], %{{.*}} : i32, !llvm.ptr // AA: %[[VAL_18:.*]] = llvm.load %[[ARG0]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_18:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> i32 -// AA: %[[VAL_19:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 +// AA: %[[VAL_19:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_19:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32 // CHECK: %[[VAL_20:.*]] = llvm.add %[[VAL_18]], %[[VAL_19]] : i32 @@ -92,7 +92,7 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: %[[VAL_21:.*]] = llvm.trunc %[[VAL_10]] : i64 to i32 -// AA: %[[VAL_22:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type 
= , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 +// AA: %[[VAL_22:.*]] = llvm.load %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : !llvm.ptr -> i32 // NOAA: %[[VAL_22:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32 // CHECK: %[[VAL_23:.*]] = llvm.add %[[VAL_22]], %[[VAL_21]] overflow : i32 @@ -100,7 +100,7 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK: llvm.br ^bb1(%[[VAL_23]], %[[VAL_24]] : i32, i64) // CHECK: ^bb3: -// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr +// AA: llvm.store %[[VAL_15]], %[[VAL_1]] {tbaa = [#llvm.tbaa_tag, 0>}>, 0>}>, 0>}>, 0>}>, access_type = , 0>}>, 0>}>, 0>}>, 0>}>, offset = 0>]} : i32, !llvm.ptr // NOAA: llvm.store %[[VAL_15]], %{{.*}} : i32, !llvm.ptr // CHECK: llvm.return diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir index 4907aa03ec5a5..071d3ec89394c 100644 --- a/flang/test/Fir/tbaa-codegen2.fir +++ b/flang/test/Fir/tbaa-codegen2.fir @@ -114,4 +114,3 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ // CHECK: ![[TMP_DATA_ACCESS_TAG]] = !{![[TMP_DATA_ACCESS_TYPE:.*]], ![[TMP_DATA_ACCESS_TYPE]], i64 0} // CHECK: ![[TMP_DATA_ACCESS_TYPE]] = !{!"allocated data/", ![[TMP_ACCESS_TYPE:.*]], i64 0} // CHECK: ![[TMP_ACCESS_TYPE]] = !{!"allocated data", ![[TARGET_ACCESS_TAG:.*]], i64 0} -// CHECK: ![[TARGET_ACCESS_TAG]] = !{!"target data", ![[DATA_ACCESS_TYPE]], i64 0} diff --git a/flang/test/Transforms/tbaa-cray-pointer.fir b/flang/test/Transforms/tbaa-cray-pointer.fir new file mode 100644 index 0000000000000..54406271aaa58 --- /dev/null +++ b/flang/test/Transforms/tbaa-cray-pointer.fir @@ -0,0 +1,43 @@ +// RUN: fir-opt -funsafe-cray-pointers --fir-add-alias-tags %s | FileCheck %s + +// Fortran source: +// subroutine test() +// real :: a, b +// pointer(p, a) +// p = loc(b) +// b = 2 +// end subroutine + +// CHECK: #[[TBAA_ROOT:.*]] = #llvm.tbaa_root +// CHECK-NEXT: #[[ANY_ACCESS:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[ANY_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[ANY_DATA_TAG:.*]] = #llvm.tbaa_tag +// CHECK-NEXT: #[[ALLOCATED_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[B:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[B_TAG:.*]] = #llvm.tbaa_tag + +module attributes {dlti.dl_spec = #dlti.dl_spec = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i8 = dense<[8, 32]> : vector<2xi64>, i16 = dense<[16, 32]> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array, "dlti.stack_alignment" = 128 : i64, "dlti.function_pointer_alignment" = #dlti.function_pointer_alignment<32, function_dependent = true>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"} { +// CHECK-LABEL: func.func @_QPtest() + func.func @_QPtest() { + %cst = arith.constant 2.000000e+00 : f32 + %0 = fir.alloca !fir.box> + %1 = fir.dummy_scope : !fir.dscope + %2 = 
fir.alloca i64 {bindc_name = "p", uniq_name = "_QFtestEp"} + %3 = fir.declare %2 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEp"} : (!fir.ref) -> !fir.ref + %4 = fir.alloca f32 {bindc_name = "b", uniq_name = "_QFtestEb"} + %5 = fir.declare %4 {uniq_name = "_QFtestEb"} : (!fir.ref) -> !fir.ref + %6 = fir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEa"} : (!fir.ref>>) -> !fir.ref>> + %7 = fir.zero_bits !fir.ptr + %8 = fir.embox %7 : (!fir.ptr) -> !fir.box> + fir.store %8 to %6 : !fir.ref>> +// Descriptor tagged in codegen +// CHECK: fir.store %{{.*}} to %{{.*}} : !fir.ref> + %9 = fir.convert %5 : (!fir.ref) -> i64 + fir.store %9 to %3 : !fir.ref +// CHECK: fir.store {{.*}} to {{.*}} {tbaa = [#[[ANY_DATA_TAG]]]} : !fir.ref + fir.store %cst to %5 : !fir.ref +// CHECK: fir.store {{.*}} to {{.*}} {tbaa = [#[[B_TAG]]]} : !fir.ref + return + } +} + diff --git a/flang/test/Transforms/tbaa-for-common-vars.fir b/flang/test/Transforms/tbaa-for-common-vars.fir index a8dd86bff72ed..992658ee2387f 100644 --- a/flang/test/Transforms/tbaa-for-common-vars.fir +++ b/flang/test/Transforms/tbaa-for-common-vars.fir @@ -28,8 +28,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> @@ -66,8 +65,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_11:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_12:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_13:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_15:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_16:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_18:.+]] = #llvm.tbaa_tag @@ -118,14 +116,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ANYACC3INNER:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA3INNER:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYARG3INNER:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA3COMMON3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYD:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DUMMYDTAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DUMMYCTAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[GLOBALDATA3COMMON3:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALB:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOBALBTAG:.+]] = #llvm.tbaa_tag @@ -180,10 +177,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[INNER4ANYACC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4ANYDATA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[INNER4ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST4TARGET:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[INNER4TARGET:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST4GLOBAL:.+]] = 
#llvm.tbaa_type_desc}> -// CHECK: #[[INNER4GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST4GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[INNER4GLOBAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[INNER4COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST4B:.+]] = #llvm.tbaa_type_desc}> @@ -229,8 +224,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[TEST5ROOT:.+]] = #llvm.tbaa_root // CHECK: #[[TEST5ANYACC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST5TARGET:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TEST5GLOBAL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[TEST5GLOBAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5COMMON5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TEST5COMMON5TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[TEST5A:.+]] = #llvm.tbaa_type_desc}> @@ -288,8 +282,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag @@ -354,8 +347,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_74:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_75:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_76:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_78:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_77:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_78:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_79:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_80:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_81:.+]] = #llvm.tbaa_tag @@ -425,12 +418,61 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[$ATTR_82:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_83:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_84:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_87:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_85:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_86:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_87:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_88:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_86:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest8() { // CHECK: fir.load %{{[0-9]+}} : !fir.ref>> // CHECK: fir.load %{{[0-9]+}} {tbaa = [#[[$ATTR_86]]]} : !fir.ptr // CHECK: fir.load %{{[0-9]+}} : !fir.ref // CHECK: fir.store %{{[0-9]+}} to %{{[0-9]+}} : !fir.ref + +// ----- + +// Fortran source: +// subroutine target_comon_tbaa() +// real :: a +// real, target :: b, c +// common /common1/ a,b,c +// a = b +// end subroutine +// +// Test generation of tbaa tags where some members of a common block are TARGET +module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { fir.global common @block_(dense<0> : vector<44xi8>) {alignment = 4 : i64} : !fir.array<44xi8> + fir.global common @common1_(dense<0> : vector<12xi8>) {alignment = 4 : 
i64} : !fir.array<12xi8> + func.func @_QPtarget_common_tbaa() { + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.address_of(@common1_) : !fir.ref> + %2 = fir.coordinate_of %1, %c0 : (!fir.ref>, index) -> !fir.ref + %3 = fir.convert %2 : (!fir.ref) -> !fir.ref + %4 = fir.declare %3 storage(%1[0]) {uniq_name = "_QFtarget_comon_tbaaEa"} : (!fir.ref, !fir.ref>) -> !fir.ref + %5 = fir.coordinate_of %1, %c4 : (!fir.ref>, index) -> !fir.ref + %6 = fir.convert %5 : (!fir.ref) -> !fir.ref + %7 = fir.declare %6 storage(%1[4]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtarget_comon_tbaaEb"} : (!fir.ref, !fir.ref>) -> !fir.ref + %8 = fir.coordinate_of %1, %c8 : (!fir.ref>, index) -> !fir.ref + %9 = fir.convert %8 : (!fir.ref) -> !fir.ref + %10 = fir.declare %9 storage(%1[8]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtarget_comon_tbaaEc"} : (!fir.ref, !fir.ref>) -> !fir.ref + %11 = fir.load %7 : !fir.ref + fir.store %11 to %4 : !fir.ref + return + } +} +// CHECK: #[[TBAA_FUNC_ROOT:.*]] = #llvm.tbaa_root +// CHECK-NEXT: #[[ANY_ACCESS:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[ANY_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[TARGET_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[GLOBAL_DATA:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[TARGET_COMMON:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[GLOBAL_COMMON:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[B:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[A:.*]] = #llvm.tbaa_type_desc}> +// CHECK-NEXT: #[[B_TAG:.*]] = #llvm.tbaa_tag +// CHECK-NEXT: #[[A_TAG:.*]] = #llvm.tbaa_tag + +// CHECK-LABEL: func.func @_QPtarget_common_tbaa() +// CHECK: %[[LOAD:.*]] = fir.load %{{.*}} {tbaa = [#[[B_TAG]]]} +// CHECK: fir.store %[[LOAD]] to %{{.*}} {tbaa = [#[[A_TAG]]]} diff --git a/flang/test/Transforms/tbaa-for-global-equiv-vars.fir b/flang/test/Transforms/tbaa-for-global-equiv-vars.fir index dbefa3f8e3f5f..0d082c7504024 100644 --- a/flang/test/Transforms/tbaa-for-global-equiv-vars.fir +++ b/flang/test/Transforms/tbaa-for-global-equiv-vars.fir @@ -30,8 +30,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT1:.+]] = #llvm.tbaa_root // CHECK: #[[ANYACC1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[TAG:.+]] = #llvm.tbaa_tag @@ -74,8 +73,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT2:.+]] = #llvm.tbaa_root // CHECK: #[[ANYACC2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANYDATA2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[GLOBALDATA2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[GLOBALDATA2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1COMMON:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB1GLOB2:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLOB3:.+]] = #llvm.tbaa_type_desc}> diff --git a/flang/test/Transforms/tbaa-for-local-vars.fir b/flang/test/Transforms/tbaa-for-local-vars.fir index 4eb6b2ecf31c4..fde5c400c75ed 100644 --- a/flang/test/Transforms/tbaa-for-local-vars.fir +++ b/flang/test/Transforms/tbaa-for-local-vars.fir @@ -35,18 +35,22 @@ // scope's TBAA tree. 
// RUN: fir-opt --fir-add-alias-tags %s | FileCheck %s -// CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root -// CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_root -// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_12:.+]] = #llvm.tbaa_tag -// CHECK: #[[$ATTR_13:.+]] = #llvm.tbaa_tag +// CHECK: #[[$SCOPE_2:.+]] = #llvm.tbaa_root +// CHECK: #[[$SCOPE_1:.+]] = #llvm.tbaa_root +// CHECK: #[[$ANY_ACCESS2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_ACCESS1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$DUMMY_ARG2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ALLOCATED_DATA1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$DUMMY_ARG1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ALLOCATED_DATA1_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$BAR_THIS2:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$TEST_VAR1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$TEST_ARG1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$BAR_THIS2_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$TEST_VAR1_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[$TEST_ARG2_TAG:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QMmPtest( // CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "arg"}) { @@ -61,10 +65,10 @@ // CHECK: %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope // CHECK: %[[VAL_11:.*]] = fir.declare %[[VAL_9]] dummy_scope %[[VAL_10]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QMmFbarEthis"} : (!fir.class>, !fir.dscope) -> !fir.class> // CHECK: %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_11]], x : (!fir.class>) -> !fir.ref -// CHECK: fir.store %[[VAL_0]] to %[[VAL_12]] {tbaa = [#[[$ATTR_12]]]} : !fir.ref +// CHECK: fir.store %[[VAL_0]] to %[[VAL_12]] {tbaa = [#[[$BAR_THIS2_TAG]]]} : !fir.ref // CHECK: %[[VAL_13:.*]] = fir.declare %[[VAL_1]] {uniq_name = ".tmp.func_result"} : (!fir.ref>) -> !fir.ref> // CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_13]], x : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_14]] {tbaa = [#[[$ATTR_13]]]} : !fir.ref +// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_14]] {tbaa = [#[[$ALLOCATED_DATA1_TAG]]]} : !fir.ref module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { func.func @_QMmPtest(%arg0: !fir.ref {fir.bindc_name = "arg"}) { %cst = arith.constant 1.000000e+00 : f32 diff --git a/flang/test/Transforms/tbaa-with-dummy-scope.fir b/flang/test/Transforms/tbaa-with-dummy-scope.fir index 4ae2b8efe2581..d7f33776150ae 100644 --- a/flang/test/Transforms/tbaa-with-dummy-scope.fir +++ b/flang/test/Transforms/tbaa-with-dummy-scope.fir @@ -24,7 +24,7 @@ // CHECK: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETTAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[TARGETDATA_TAG:.+]] = #llvm.tbaa_tag // CHECK: 
#[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_type_desc}> @@ -34,8 +34,8 @@ // CHECK: #[[$ATTR_14:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_15:.+]] = #llvm.tbaa_tag // CHECK: func.func @test1( -// CHECK: %[[VAL_5:.*]] = fir.load %{{.*}} {tbaa = [#[[TARGETTAG]]]} : !fir.ref -// CHECK: fir.store %{{.*}} {tbaa = [#[[TARGETTAG]]]} : !fir.ref +// CHECK: %[[VAL_5:.*]] = fir.load %{{.*}} {tbaa = [#[[TARGETDATA_TAG]]]} : !fir.ref +// CHECK: fir.store %{{.*}} {tbaa = [#[[TARGETDATA_TAG]]]} : !fir.ref // CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope // CHECK: %[[VAL_9:.*]] = fir.load %{{.*}} {tbaa = [#[[$ATTR_12]]]} : !fir.ref // CHECK: fir.store %{{.*}} {tbaa = [#[[$ATTR_13]]]} : !fir.ref @@ -83,23 +83,21 @@ func.func @test1(%arg0: !fir.ref {fir.bindc_name = "x", fir.target}, %arg1: // CHECK: #[[$ATTR_33:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_34:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_35:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_36:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_37:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[CALLERTARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[CALLEETARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_40:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_38:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_39:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_45:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_50:.+]] = #llvm.tbaa_tag +// CHECK: #[[$CALLERANYDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$CALLEEANYDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_38:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_39:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_40:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_41:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_42:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_43:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_44:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_45:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_46:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_47:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_48:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_49:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_50:.+]] = #llvm.tbaa_tag // CHECK: func.func @_QMtestPcaller( // CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "z"}) { // CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope diff --git a/flang/test/Transforms/tbaa-with-dummy-scope2.fir b/flang/test/Transforms/tbaa-with-dummy-scope2.fir index 54902ca7d41e1..6f5ed69fbc9c6 100644 --- a/flang/test/Transforms/tbaa-with-dummy-scope2.fir +++ b/flang/test/Transforms/tbaa-with-dummy-scope2.fir @@ -44,16 +44,15 @@ func.func @_QPtest1() attributes {noinline} { } // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> // 
CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_2:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest1() attributes {noinline} { // CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtest1FinnerEy"} @@ -90,19 +89,18 @@ func.func @_QPtest2() attributes {noinline} { } // CHECK: #[[$ATTR_0:.+]] = #llvm.tbaa_root // CHECK: #[[$ATTR_1:.+]] = #llvm.tbaa_root -// CHECK: #[[$ATTR_2:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_3:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_4:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_5:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$TARGETDATA_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ANY_ACCESS_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_ACCESS_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ANY_DATA_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$LOCAL_ATTR_0:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_6:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_7:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_1:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[$ATTR_8:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$ATTR_9:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[$LOCAL_ATTR_2:.+]] = #llvm.tbaa_tag +// CHECK: #[[$ATTR_10:.+]] = #llvm.tbaa_tag // CHECK: #[[$ATTR_11:.+]] = #llvm.tbaa_tag // CHECK-LABEL: func.func @_QPtest2() attributes {noinline} { // CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtest2FinnerEy"} diff --git a/flang/test/Transforms/tbaa2.fir b/flang/test/Transforms/tbaa2.fir index a594e6b32fdac..9b5307ba69d17 100644 --- a/flang/test/Transforms/tbaa2.fir +++ b/flang/test/Transforms/tbaa2.fir @@ -48,18 +48,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[ROOT:.+]] = #llvm.tbaa_root // CHECK: #[[ANY_ACCESS:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANY_DATA:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ANY_GLBL:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ANY_LOCAL:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[ANY_ARG:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_GLBL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_LOCAL:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_LOW:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ANY_DIRECT:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_Z:.+]] = #llvm.tbaa_type_desc}> -// CHECK: #[[ARG_Y:.+]] = #llvm.tbaa_type_desc}> - -// CHECK: #[[ARG_LOW_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[ARG_Z_TAG:.+]] = #llvm.tbaa_tag -// CHECK: #[[ARG_Y_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ANY_DIRECT:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTART:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTOP:.+]] = #llvm.tbaa_type_desc}> @@ -69,10 +61,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[LOCAL2_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_XSTART:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL3_ALLOC:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_LOW:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL4_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DIRECT_A:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[DIRECT_B:.+]] = 
#llvm.tbaa_type_desc}> +// CHECK: #[[ARG_Z:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_DYINV:.+]] = #llvm.tbaa_type_desc}> +// CHECK: #[[ARG_Y:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[LOCAL5_ALLOC:.+]] = #llvm.tbaa_type_desc}> // CHECK: #[[GLBL_ZSTART_TAG:.+]] = #llvm.tbaa_tag @@ -83,10 +78,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // CHECK: #[[LOCAL2_ALLOC_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[GLBL_XSTART_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL3_ALLOC_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_LOW_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL4_ALLOC_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DIRECT_A_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[DIRECT_B_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_Z_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[GLBL_DYINV_TAG:.+]] = #llvm.tbaa_tag +// CHECK: #[[ARG_Y_TAG:.+]] = #llvm.tbaa_tag // CHECK: #[[LOCAL5_ALLOC_TAG:.+]] = #llvm.tbaa_tag func.func @_QMmodPcallee(%arg0: !fir.box> {fir.bindc_name = "z"}, %arg1: !fir.box> {fir.bindc_name = "y"}, %arg2: !fir.ref>>> {fir.bindc_name = "low"}) { diff --git a/flang/test/Transforms/tbaa3.fir b/flang/test/Transforms/tbaa3.fir index abcb7e000bac1..79f79cb6ca26b 100644 --- a/flang/test/Transforms/tbaa3.fir +++ b/flang/test/Transforms/tbaa3.fir @@ -1,5 +1,4 @@ -// RUN: fir-opt --fir-add-alias-tags %s | FileCheck --check-prefixes=ALL,DEFAULT %s -// RUN: fir-opt --fir-add-alias-tags --local-alloc-tbaa %s | FileCheck --check-prefixes=ALL,LOCAL %s +// RUN: fir-opt --fir-add-alias-tags %s | FileCheck --check-prefixes=ALL %s // Test AddAliasTagsPass creating sub-tree for TARGET/POINTER variables. @@ -56,56 +55,57 @@ // | |- "dummy arg data/_QFtest1Edummyas" // | |- "dummy arg data/_QFtest1Edummya" // | -// |- "target data" <- all pointers and taget dummys -// | -// |- "global data" -// | | -// | |- "global data/_QMdataEglob" -// | |- "global data/_QMdataEglobt" -// | -// |- "direct data" -// | | -// | |- "direct data/_QMdataEgloba" -// | |- "direct data/_QMdataEglobat" +// |- "target data" <--- all pointers and target dummy arguments go here +// | |- "target data/_QMdataEglobt" +// | |- "target data/_QMdataEglobat" +// | |- "target data/_QFtest1Elocalt" +// | |- "target data/_QFtest1Elocalat" +// | +// |- "global data" +// | | +// | |- "global data/_QMdataEglob" +// | +// |- "direct data" +// | | +// | |- "direct data/_QMdataEgloba" +// | +// |- "allocated data" // | -// |- "allocated data" -// | -// |- "allocated data/_QFtest1Elocal" -// |- "allocated data/_QFtest1Elocalt" -// |- "allocated data/_QFtest1Elocala" -// |- "allocated data/_QFtest1Elocalat" +// |- "allocated data/_QFtest1Elocal" +// |- "allocated data/_QFtest1Elocala" // ALL: #[[FUNCROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANYACCESS:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[DIRECTDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TARGETTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[DIRECTDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBTVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBAVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBATVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYFVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYASVAR:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[DUMMYAVAR:.+]] = 
#llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[DUMMYFTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[DUMMYASTAG:.+]] = #llvm.tbaa_tag -// ALL: #[[DUMMYATAG:.+]] = #llvm.tbaa_tag -// ALL: #[[GLOBVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBTVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBAVAR:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBATVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALTVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALAVAR:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[LOCALATVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALTVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALAVAR:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[LOCALATVAR:.+]] = #llvm.tbaa_type_desc}> + // ALL: #[[GLOBTAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBTTAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBATAG:.+]] = #llvm.tbaa_tag // ALL: #[[GLOBATTAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALTAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALTTAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALATAG:.+]] = #llvm.tbaa_tag -// LOCAL: #[[LOCALATTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYFTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYASTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[DUMMYATAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALTTAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALATAG:.+]] = #llvm.tbaa_tag +// ALL: #[[LOCALATTAG:.+]] = #llvm.tbaa_tag module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { fir.global @_QMdataEglob : !fir.array<10xf32> { @@ -263,13 +263,11 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 fir.store %cst to %67 : !fir.ref %68 = fir.array_coor %20(%5) %c1 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref // real :: local(10) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALTAG]]]} : !fir.ref fir.store %cst to %68 : !fir.ref %69 = fir.array_coor %33(%5) %c1 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref // real, target :: localt(10) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALTTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALTTAG]]]} : !fir.ref fir.store %cst to %69 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %70 = fir.load %25 : !fir.ref>>> @@ -278,8 +276,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %73 = fir.shape_shift %72#0, %72#1 : (index, index) -> !fir.shapeshift<1> %74 = fir.array_coor %71(%73) %c1 : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // real, allocatable :: locala(:) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALATAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALATAG]]]} : !fir.ref fir.store %cst to %74 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %75 = fir.load %27 : !fir.ref>>> @@ -288,8 +285,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %78 = fir.shape_shift %77#0, %77#1 : (index, index) -> !fir.shapeshift<1> %79 = fir.array_coor %76(%78) %c1 : (!fir.heap>, !fir.shapeshift<1>, index) -> !fir.ref // real, allocatable, target :: localat(:) -// DEFAULT: fir.store{{.*}}tbaa 
-// LOCAL: fir.store{{.*}}{tbaa = [#[[LOCALATTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[LOCALATTAG]]]} : !fir.ref fir.store %cst to %79 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %80 = fir.load %31 : !fir.ref>>> @@ -297,8 +293,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 %82 = fir.shift %81#0 : (index) -> !fir.shift<1> %83 = fir.array_coor %80(%82) %c1 : (!fir.box>>, !fir.shift<1>, index) -> !fir.ref // real, pointer :: localp(:) -// DEFAULT: fir.store{{.*}}tbaa -// LOCAL: fir.store{{.*}}{tbaa = [#[[TARGETTAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[TARGETTAG]]]} : !fir.ref fir.store %cst to %83 : !fir.ref // ALL-NOT: fir.load{{.*}}tbaa %84 = fir.load %27 : !fir.ref>>> diff --git a/flang/test/Transforms/tbaa4.fir b/flang/test/Transforms/tbaa4.fir index c368a3d06c2ba..5e29014af8935 100644 --- a/flang/test/Transforms/tbaa4.fir +++ b/flang/test/Transforms/tbaa4.fir @@ -1,12 +1,10 @@ // Test TBAA tags for common and equivalence. -// RUN: fir-opt --fir-add-alias-tags --split-input-file %s | FileCheck --check-prefixes=ALL,DEFAULT %s -// RUN: fir-opt --fir-add-alias-tags --local-alloc-tbaa --split-input-file %s | FileCheck --check-prefixes=ALL,LOCAL %s +// RUN: fir-opt --fir-add-alias-tags --split-input-file %s | FileCheck --check-prefixes=ALL %s // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK_A:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[BLK_C:.+]] = #llvm.tbaa_type_desc}> @@ -54,19 +52,17 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // ----- -// LOCAL: #[[ROOT:.+]] = #llvm.tbaa_root -// LOCAL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[ALLOCATEDDATA:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> -// LOCAL: #[[TAG:.+]] = #llvm.tbaa_tag +// ALL: #[[ROOT:.+]] = #llvm.tbaa_root +// ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[ALLOCATEDDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[TAG:.+]] = #llvm.tbaa_tag // ALL-LABEL: func.func @_QPtest_local_equiv() { -// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr -// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ref -// LOCAL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr -// DEFAULT-NOT: fir.store{{.}}tbaa +// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr +// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ref +// ALL: fir.store{{.*}}{tbaa = [#[[TAG]]]} : !fir.ptr module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { func.func @_QPtest_local_equiv() { %c1 = arith.constant 1 : index @@ -98,8 +94,7 @@ func.func @_QPtest_local_equiv() { // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TARGETDATA:.+]] = 
#llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TAG:.+]] = #llvm.tbaa_tag module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i64 = dense<[32, 64]> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little">, llvm.data_layout = ""} { @@ -143,8 +138,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<4 // ALL: #[[ROOT:.+]] = #llvm.tbaa_root // ALL: #[[ANY:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[ANYDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[TARGETDATA:.+]] = #llvm.tbaa_type_desc}> -// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> +// ALL: #[[GLOBALDATA:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[EQUIV:.+]] = #llvm.tbaa_type_desc}> // ALL: #[[TAG:.+]] = #llvm.tbaa_tag From c02978867e85a992df70aedd2fa41d7a41932375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Thu, 11 Dec 2025 11:08:20 +0100 Subject: [PATCH 41/49] Revert "[AMDGPU][SDAG] Add missing cases for SI_INDIRECT_SRC/DST (#170323) (#171787) ``` Step 7 (test-check-all) failure: Test just built components: check-all completed (failure) ******************** TEST 'LLVM :: CodeGen/AMDGPU/insert_vector_dynelt.ll' FAILED ******************** Exit Code: 1 Command Output (stdout): -- # RUN: at line 2 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -mtriple=amdgcn -mcpu=fiji < /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll | /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -enable-var-scope -check-prefixes=GCN /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll # executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -mtriple=amdgcn -mcpu=fiji # executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -enable-var-scope -check-prefixes=GCN /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll # RUN: at line 3 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -O0 -mtriple=amdgcn -mcpu=fiji < /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll | /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck --check-prefixes=GCN-O0 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll # executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -O0 -mtriple=amdgcn -mcpu=fiji # .---command stderr------------ # | # | # After Instruction Selection # | # Machine code for function insert_dyn_i32_6: IsSSA, TracksLiveness # | Function Live Ins: $sgpr16 in %8, $sgpr17 in %9, $sgpr18 in %10, $sgpr19 in %11, $sgpr20 in %12, $sgpr21 in %13, $vgpr0 in %14, $vgpr1 in %15 # | # | bb.0 (%ir-block.0): # | successors: %bb.1(0x80000000); %bb.1(100.00%) # | liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $vgpr0, $vgpr1 # | %15:vgpr_32 = COPY $vgpr1 # | %14:vgpr_32 = COPY $vgpr0 # | 
%13:sgpr_32 = COPY $sgpr21 # | %12:sgpr_32 = COPY $sgpr20 # | %11:sgpr_32 = COPY $sgpr19 # | %10:sgpr_32 = COPY $sgpr18 # | %9:sgpr_32 = COPY $sgpr17 # | %8:sgpr_32 = COPY $sgpr16 # | %17:sgpr_192 = REG_SEQUENCE %8:sgpr_32, %subreg.sub0, %9:sgpr_32, %subreg.sub1, %10:sgpr_32, %subreg.sub2, %11:sgpr_32, %subreg.sub3, %12:sgpr_32, %subreg.sub4, %13:sgpr_32, %subreg.sub5 # | %16:sgpr_192 = COPY %17:sgpr_192 # | %19:vreg_192 = COPY %17:sgpr_192 # | %28:sreg_64_xexec = IMPLICIT_DEF # | %27:sreg_64_xexec = S_MOV_B64 $exec # | # | bb.1: # | ; predecessors: %bb.1, %bb.0 # | successors: %bb.1(0x40000000), %bb.3(0x40000000); %bb.1(50.00%), %bb.3(50.00%) # | # | %26:vreg_192 = PHI %19:vreg_192, %bb.0, %18:vreg_192, %bb.1 # | %29:sreg_64 = PHI %28:sreg_64_xexec, %bb.0, %30:sreg_64, %bb.1 # | %31:sreg_32_xm0 = V_READFIRSTLANE_B32 %14:vgpr_32, implicit $exec # | %32:sreg_64 = V_CMP_EQ_U32_e64 %31:sreg_32_xm0, %14:vgpr_32, implicit $exec # | %30:sreg_64 = S_AND_SAVEEXEC_B64 killed %32:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec # | $m0 = COPY killed %31:sreg_32_xm0 # | %18:vreg_192 = V_INDIRECT_REG_WRITE_MOVREL_B32_V8 %26:vreg_192(tied-def 0), %15:vgpr_32, 3, implicit $m0, implicit $exec # | $exec = S_XOR_B64_term $exec, %30:sreg_64, implicit-def $scc # | S_CBRANCH_EXECNZ %bb.1, implicit $exec # | # | bb.3: ``` This reverts commit 15df9e701f1f1194a25e6123612cc735ad392ae4. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 - llvm/lib/Target/AMDGPU/SIInstructions.td | 16 - .../CodeGen/AMDGPU/extract_vector_dynelt.ll | 3310 --------- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 5963 ----------------- 4 files changed, 9297 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 76bbb30b85a78..4651d7d9d3adf 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6304,11 +6304,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case AMDGPU::SI_INDIRECT_SRC_V1: case AMDGPU::SI_INDIRECT_SRC_V2: - case AMDGPU::SI_INDIRECT_SRC_V3: case AMDGPU::SI_INDIRECT_SRC_V4: - case AMDGPU::SI_INDIRECT_SRC_V5: - case AMDGPU::SI_INDIRECT_SRC_V6: - case AMDGPU::SI_INDIRECT_SRC_V7: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V9: case AMDGPU::SI_INDIRECT_SRC_V10: @@ -6319,11 +6315,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitIndirectSrc(MI, *BB, *getSubtarget()); case AMDGPU::SI_INDIRECT_DST_V1: case AMDGPU::SI_INDIRECT_DST_V2: - case AMDGPU::SI_INDIRECT_DST_V3: case AMDGPU::SI_INDIRECT_DST_V4: - case AMDGPU::SI_INDIRECT_DST_V5: - case AMDGPU::SI_INDIRECT_DST_V6: - case AMDGPU::SI_INDIRECT_DST_V7: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V9: case AMDGPU::SI_INDIRECT_DST_V10: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 643b2463344e5..984d1a4db4cd6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -969,11 +969,7 @@ class SI_INDIRECT_DST : VPseudoInstSI < def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC; -def SI_INDIRECT_SRC_V3 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC; -def SI_INDIRECT_SRC_V5 : SI_INDIRECT_SRC; -def SI_INDIRECT_SRC_V6 : SI_INDIRECT_SRC; -def SI_INDIRECT_SRC_V7 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC; @@ -984,11 +980,7 @@ def 
SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC; def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V3 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V5 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V6 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V7 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST; @@ -2787,11 +2779,7 @@ multiclass SI_INDIRECT_Pattern { } defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern; defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern; -defm : SI_INDIRECT_Pattern; -defm : SI_INDIRECT_Pattern; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; @@ -2801,11 +2789,7 @@ defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern; defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern; -defm : SI_INDIRECT_Pattern; -defm : SI_INDIRECT_Pattern; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index 4b340f308d5f6..c69b0cce3d208 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN %s -; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN-O0 %s define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float4_extelt: @@ -21,30 +20,6 @@ define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float4_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 4.0 -; GCN-O0-NEXT: s_mov_b32 s4, 2.0 -; GCN-O0-NEXT: s_mov_b32 s5, 1.0 -; GCN-O0-NEXT: s_mov_b32 s6, 0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s3 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -68,30 +43,6 @@ define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: int4_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 4 -; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: s_mov_b32 s5, 1 -; GCN-O0-NEXT: s_mov_b32 s6, 0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 -; 
GCN-O0-NEXT: v_mov_b32_e32 v4, s3 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x i32> , i32 %sel store i32 %ext, ptr addrspace(1) %out @@ -121,72 +72,6 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double4_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 0x40100a3d -; GCN-O0-NEXT: s_mov_b32 s4, 0x70a3d70a -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s3 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s12, s4 -; GCN-O0-NEXT: s_mov_b32 s6, 0x4000147a -; GCN-O0-NEXT: s_mov_b32 s4, 0xe147ae14 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s13, s5 -; GCN-O0-NEXT: s_mov_b32 s14, s4 -; GCN-O0-NEXT: s_mov_b32 s6, 0x3ff028f5 -; GCN-O0-NEXT: s_mov_b32 s4, 0xc28f5c29 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s15, s5 -; GCN-O0-NEXT: s_mov_b32 s16, s4 -; GCN-O0-NEXT: s_mov_b32 s6, 0x3f847ae1 -; GCN-O0-NEXT: s_mov_b32 s4, 0x47ae147b -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s17, s5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s16 -; GCN-O0-NEXT: s_mov_b32 s7, s15 -; GCN-O0-NEXT: s_mov_b32 s8, s14 -; GCN-O0-NEXT: s_mov_b32 s9, s13 -; GCN-O0-NEXT: s_mov_b32 s10, s12 -; GCN-O0-NEXT: s_mov_b32 s11, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s11 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -224,113 +109,6 @@ define 
amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double5_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 0x40140a3d -; GCN-O0-NEXT: s_mov_b32 s4, 0x70a3d70a -; GCN-O0-NEXT: s_mov_b32 s6, s4 -; GCN-O0-NEXT: s_mov_b32 s7, s3 -; GCN-O0-NEXT: s_mov_b32 s25, s7 -; GCN-O0-NEXT: s_mov_b32 s26, s6 -; GCN-O0-NEXT: s_mov_b32 s3, 0x40100a3d -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s3 -; GCN-O0-NEXT: s_mov_b32 s27, s5 -; GCN-O0-NEXT: s_mov_b32 s28, s4 -; GCN-O0-NEXT: s_mov_b32 s3, 0x4000147a -; GCN-O0-NEXT: s_mov_b32 s4, 0xe147ae14 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s3 -; GCN-O0-NEXT: s_mov_b32 s29, s5 -; GCN-O0-NEXT: s_mov_b32 s30, s4 -; GCN-O0-NEXT: s_mov_b32 s3, 0x3ff028f5 -; GCN-O0-NEXT: s_mov_b32 s4, 0xc28f5c29 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s3 -; GCN-O0-NEXT: s_mov_b32 s31, s5 -; GCN-O0-NEXT: s_mov_b32 s33, s4 -; GCN-O0-NEXT: s_mov_b32 s3, 0x3f847ae1 -; GCN-O0-NEXT: s_mov_b32 s4, 0x47ae147b -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s3 -; GCN-O0-NEXT: s_mov_b32 s34, s5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr24 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr23 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr22 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr21 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr20 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 -; GCN-O0-NEXT: s_mov_b32 s5, s34 -; GCN-O0-NEXT: s_mov_b32 s6, s33 -; GCN-O0-NEXT: s_mov_b32 s7, s31 -; GCN-O0-NEXT: s_mov_b32 s8, s30 -; GCN-O0-NEXT: s_mov_b32 s9, s29 -; GCN-O0-NEXT: s_mov_b32 s10, s28 -; GCN-O0-NEXT: s_mov_b32 s11, s27 -; GCN-O0-NEXT: s_mov_b32 s12, s26 -; GCN-O0-NEXT: s_mov_b32 s13, s25 -; GCN-O0-NEXT: s_mov_b32 s14, s24 -; GCN-O0-NEXT: s_mov_b32 s15, s23 -; GCN-O0-NEXT: s_mov_b32 s16, s22 -; GCN-O0-NEXT: s_mov_b32 s17, s21 -; GCN-O0-NEXT: s_mov_b32 s18, s20 -; GCN-O0-NEXT: s_mov_b32 s19, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s19 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 
-; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s19 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <5 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -352,25 +130,6 @@ define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: half4_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s5, 0x44004200 -; GCN-O0-NEXT: s_mov_b32 s0, 0x40003c00 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s5, 4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 -; GCN-O0-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_store_short v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x half> , i32 %sel store half %ext, ptr addrspace(1) %out @@ -390,24 +149,6 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float2_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 1.0 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <2 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -431,48 +172,6 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double2_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 0x3ff028f5 -; GCN-O0-NEXT: s_mov_b32 s4, 0xc28f5c29 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s3 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s8, s4 
-; GCN-O0-NEXT: s_mov_b32 s6, 0x3f847ae1 -; GCN-O0-NEXT: s_mov_b32 s4, 0x47ae147b -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s9, s5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s5, s9 -; GCN-O0-NEXT: s_mov_b32 s6, s8 -; GCN-O0-NEXT: s_mov_b32 s7, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <2 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -518,60 +217,6 @@ define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: half8_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 -; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-O0-NEXT: s_mov_b32 s3, 0x4000 -; GCN-O0-NEXT: s_mov_b32 s6, 0x3c00 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GCN-O0-NEXT: s_mov_b32 s3, 2 -; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 -; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-O0-NEXT: s_mov_b32 s3, 0x4200 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GCN-O0-NEXT: s_mov_b32 s3, 3 -; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 -; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-O0-NEXT: s_mov_b32 s3, 0x4400 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GCN-O0-NEXT: s_mov_b32 s3, 4 -; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 -; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-O0-NEXT: s_mov_b32 s3, 0x4500 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GCN-O0-NEXT: s_mov_b32 s3, 5 -; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 -; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-O0-NEXT: s_mov_b32 s3, 0x4600 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GCN-O0-NEXT: s_mov_b32 s3, 6 -; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 -; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-O0-NEXT: s_mov_b32 s3, 0x4700 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GCN-O0-NEXT: s_mov_b32 s3, 7 -; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3 -; GCN-O0-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-O0-NEXT: s_mov_b32 s4, 0x4800 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[2:3] -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; 
GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_short v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x half> , i32 %sel store half %ext, ptr addrspace(1) %out @@ -603,39 +248,6 @@ define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: short8_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s1, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_cmp_eq_u32 s0, s1 -; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 -; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 3 -; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 -; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 4 -; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 -; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 5 -; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 -; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 6 -; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 -; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 7 -; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1 -; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s0, 8 -; GCN-O0-NEXT: s_cselect_b32 s0, s0, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_store_short v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x i16> , i32 %sel store i16 %ext, ptr addrspace(1) %out @@ -662,42 +274,6 @@ define amdgpu_kernel void @float8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float8_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 0x41000000 -; GCN-O0-NEXT: s_mov_b32 s4, 0x40e00000 -; GCN-O0-NEXT: s_mov_b32 s5, 0x40c00000 -; GCN-O0-NEXT: s_mov_b32 s6, 0x40a00000 -; GCN-O0-NEXT: s_mov_b32 s7, 4.0 -; GCN-O0-NEXT: s_mov_b32 s8, 0x40400000 -; GCN-O0-NEXT: s_mov_b32 s9, 2.0 -; GCN-O0-NEXT: s_mov_b32 s10, 1.0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s3 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v11 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v8 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -749,101 +325,6 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[15:16] ; GCN-NEXT: s_endpgm -; -; 
GCN-O0-LABEL: double8_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x40200000 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s20, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x401c0000 -; GCN-O0-NEXT: s_mov_b32 s21, s5 -; GCN-O0-NEXT: s_mov_b32 s22, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x40180000 -; GCN-O0-NEXT: s_mov_b32 s23, s5 -; GCN-O0-NEXT: s_mov_b32 s24, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x40140000 -; GCN-O0-NEXT: s_mov_b32 s25, s5 -; GCN-O0-NEXT: s_mov_b32 s26, s4 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 4.0 -; GCN-O0-NEXT: s_mov_b32 s27, s5 -; GCN-O0-NEXT: s_mov_b32 s28, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x40080000 -; GCN-O0-NEXT: s_mov_b32 s29, s5 -; GCN-O0-NEXT: s_mov_b32 s30, s4 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 2.0 -; GCN-O0-NEXT: s_mov_b32 s31, s5 -; GCN-O0-NEXT: s_mov_b32 s33, s4 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s34, s5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 -; GCN-O0-NEXT: s_mov_b32 s5, s34 -; GCN-O0-NEXT: s_mov_b32 s6, s33 -; GCN-O0-NEXT: s_mov_b32 s7, s31 -; GCN-O0-NEXT: s_mov_b32 s8, s30 -; GCN-O0-NEXT: s_mov_b32 s9, s29 -; GCN-O0-NEXT: s_mov_b32 s10, s28 -; GCN-O0-NEXT: s_mov_b32 s11, s27 -; GCN-O0-NEXT: s_mov_b32 s12, s26 -; GCN-O0-NEXT: s_mov_b32 s13, s25 -; GCN-O0-NEXT: s_mov_b32 s14, s24 -; GCN-O0-NEXT: s_mov_b32 s15, s23 -; GCN-O0-NEXT: s_mov_b32 s16, s22 -; GCN-O0-NEXT: s_mov_b32 s17, s21 -; GCN-O0-NEXT: s_mov_b32 s18, s20 -; GCN-O0-NEXT: s_mov_b32 s19, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s19 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s19 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 
-; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -893,101 +374,6 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[15:16] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double7_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x401c0000 -; GCN-O0-NEXT: s_mov_b32 s21, s5 -; GCN-O0-NEXT: s_mov_b32 s22, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x40180000 -; GCN-O0-NEXT: s_mov_b32 s23, s5 -; GCN-O0-NEXT: s_mov_b32 s24, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x40140000 -; GCN-O0-NEXT: s_mov_b32 s25, s5 -; GCN-O0-NEXT: s_mov_b32 s26, s4 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 4.0 -; GCN-O0-NEXT: s_mov_b32 s27, s5 -; GCN-O0-NEXT: s_mov_b32 s28, s4 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x40080000 -; GCN-O0-NEXT: s_mov_b32 s29, s5 -; GCN-O0-NEXT: s_mov_b32 s30, s4 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 2.0 -; GCN-O0-NEXT: s_mov_b32 s31, s5 -; GCN-O0-NEXT: s_mov_b32 s33, s4 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s34, s5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr20 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 -; GCN-O0-NEXT: s_mov_b32 s5, s34 -; GCN-O0-NEXT: s_mov_b32 s6, s33 -; GCN-O0-NEXT: s_mov_b32 s7, s31 -; GCN-O0-NEXT: s_mov_b32 s8, s30 -; GCN-O0-NEXT: s_mov_b32 s9, s29 -; GCN-O0-NEXT: s_mov_b32 s10, s28 -; GCN-O0-NEXT: s_mov_b32 s11, s27 -; GCN-O0-NEXT: s_mov_b32 s12, s26 -; GCN-O0-NEXT: s_mov_b32 s13, s25 -; GCN-O0-NEXT: s_mov_b32 s14, s24 -; GCN-O0-NEXT: s_mov_b32 s15, s23 -; GCN-O0-NEXT: s_mov_b32 s16, s22 -; GCN-O0-NEXT: s_mov_b32 s17, s21 -; GCN-O0-NEXT: s_mov_b32 s18, s20 -; GCN-O0-NEXT: s_mov_b32 s19, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s19 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s15 -; GCN-O0-NEXT: 
v_mov_b32_e32 v13, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s19 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <7 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -1022,66 +408,6 @@ define amdgpu_kernel void @float16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float16_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 0x41800000 -; GCN-O0-NEXT: s_mov_b32 s4, 0x41700000 -; GCN-O0-NEXT: s_mov_b32 s5, 0x41600000 -; GCN-O0-NEXT: s_mov_b32 s6, 0x41500000 -; GCN-O0-NEXT: s_mov_b32 s7, 0x41400000 -; GCN-O0-NEXT: s_mov_b32 s8, 0x41300000 -; GCN-O0-NEXT: s_mov_b32 s9, 0x41200000 -; GCN-O0-NEXT: s_mov_b32 s10, 0x41100000 -; GCN-O0-NEXT: s_mov_b32 s11, 0x41000000 -; GCN-O0-NEXT: s_mov_b32 s12, 0x40e00000 -; GCN-O0-NEXT: s_mov_b32 s13, 0x40c00000 -; GCN-O0-NEXT: s_mov_b32 s14, 0x40a00000 -; GCN-O0-NEXT: s_mov_b32 s15, 4.0 -; GCN-O0-NEXT: s_mov_b32 s16, 0x40400000 -; GCN-O0-NEXT: s_mov_b32 s17, 2.0 -; GCN-O0-NEXT: s_mov_b32 s18, 1.0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s3 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v27 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v8, v23 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v19 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v16 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <16 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -1163,181 +489,6 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double15_extelt: -; GCN-O0: ; %bb.0: ; %entry -; 
GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s6, 0 -; GCN-O0-NEXT: s_mov_b32 s7, 0x402e0000 -; GCN-O0-NEXT: s_mov_b32 s5, s7 -; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s8, 0 -; GCN-O0-NEXT: s_mov_b32 s9, 0x402c0000 -; GCN-O0-NEXT: s_mov_b32 s7, s9 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s10, 0 -; GCN-O0-NEXT: s_mov_b32 s11, 0x402a0000 -; GCN-O0-NEXT: s_mov_b32 s9, s11 -; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 killed $sgpr10_sgpr11 -; GCN-O0-NEXT: s_mov_b32 s12, 0 -; GCN-O0-NEXT: s_mov_b32 s13, 0x40280000 -; GCN-O0-NEXT: s_mov_b32 s11, s13 -; GCN-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GCN-O0-NEXT: s_mov_b32 s14, 0 -; GCN-O0-NEXT: s_mov_b32 s15, 0x40260000 -; GCN-O0-NEXT: s_mov_b32 s13, s15 -; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 killed $sgpr14_sgpr15 -; GCN-O0-NEXT: s_mov_b32 s16, 0 -; GCN-O0-NEXT: s_mov_b32 s17, 0x40240000 -; GCN-O0-NEXT: s_mov_b32 s15, s17 -; GCN-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 -; GCN-O0-NEXT: s_mov_b32 s18, 0 -; GCN-O0-NEXT: s_mov_b32 s19, 0x40220000 -; GCN-O0-NEXT: s_mov_b32 s17, s19 -; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 killed $sgpr18_sgpr19 -; GCN-O0-NEXT: s_mov_b32 s20, 0 -; GCN-O0-NEXT: s_mov_b32 s21, 0x40200000 -; GCN-O0-NEXT: s_mov_b32 s19, s21 -; GCN-O0-NEXT: ; kill: def $sgpr20 killed $sgpr20 killed $sgpr20_sgpr21 -; GCN-O0-NEXT: s_mov_b32 s22, 0 -; GCN-O0-NEXT: s_mov_b32 s23, 0x401c0000 -; GCN-O0-NEXT: s_mov_b32 s21, s23 -; GCN-O0-NEXT: ; kill: def $sgpr22 killed $sgpr22 killed $sgpr22_sgpr23 -; GCN-O0-NEXT: s_mov_b32 s24, 0 -; GCN-O0-NEXT: s_mov_b32 s25, 0x40180000 -; GCN-O0-NEXT: s_mov_b32 s23, s25 -; GCN-O0-NEXT: ; kill: def $sgpr24 killed $sgpr24 killed $sgpr24_sgpr25 -; GCN-O0-NEXT: s_mov_b32 s26, 0 -; GCN-O0-NEXT: s_mov_b32 s27, 0x40140000 -; GCN-O0-NEXT: s_mov_b32 s25, s27 -; GCN-O0-NEXT: ; kill: def $sgpr26 killed $sgpr26 killed $sgpr26_sgpr27 -; GCN-O0-NEXT: s_mov_b64 s[28:29], 4.0 -; GCN-O0-NEXT: s_mov_b32 s27, s29 -; GCN-O0-NEXT: ; kill: def $sgpr28 killed $sgpr28 killed $sgpr28_sgpr29 -; GCN-O0-NEXT: s_mov_b32 s30, 0 -; GCN-O0-NEXT: s_mov_b32 s31, 0x40080000 -; GCN-O0-NEXT: s_mov_b32 s29, s31 -; GCN-O0-NEXT: ; kill: def $sgpr30 killed $sgpr30 killed $sgpr30_sgpr31 -; GCN-O0-NEXT: s_mov_b64 s[34:35], 2.0 -; GCN-O0-NEXT: s_mov_b32 s31, s35 -; GCN-O0-NEXT: s_mov_b32 s33, s34 -; GCN-O0-NEXT: s_mov_b64 s[36:37], 1.0 -; GCN-O0-NEXT: s_mov_b32 s34, s37 -; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37 -; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr3 -; GCN-O0-NEXT: ; implicit-def: $sgpr35 -; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GCN-O0-NEXT: s_mov_b32 s37, s34 -; GCN-O0-NEXT: s_mov_b32 s38, s33 -; GCN-O0-NEXT: s_mov_b32 s39, s31 -; GCN-O0-NEXT: s_mov_b32 s40, s30 -; GCN-O0-NEXT: s_mov_b32 s41, s29 -; GCN-O0-NEXT: s_mov_b32 s42, s28 -; GCN-O0-NEXT: s_mov_b32 s43, s27 -; GCN-O0-NEXT: s_mov_b32 s44, s26 -; GCN-O0-NEXT: s_mov_b32 s45, s25 -; GCN-O0-NEXT: s_mov_b32 s46, s24 -; GCN-O0-NEXT: s_mov_b32 s47, s23 -; GCN-O0-NEXT: s_mov_b32 s48, s22 -; GCN-O0-NEXT: 
s_mov_b32 s49, s21 -; GCN-O0-NEXT: s_mov_b32 s50, s20 -; GCN-O0-NEXT: s_mov_b32 s51, s19 -; GCN-O0-NEXT: s_mov_b32 s52, s18 -; GCN-O0-NEXT: s_mov_b32 s53, s17 -; GCN-O0-NEXT: s_mov_b32 s54, s16 -; GCN-O0-NEXT: s_mov_b32 s55, s15 -; GCN-O0-NEXT: s_mov_b32 s56, s14 -; GCN-O0-NEXT: s_mov_b32 s57, s13 -; GCN-O0-NEXT: s_mov_b32 s58, s12 -; GCN-O0-NEXT: s_mov_b32 s59, s11 -; GCN-O0-NEXT: s_mov_b32 s60, s10 -; GCN-O0-NEXT: s_mov_b32 s61, s9 -; GCN-O0-NEXT: s_mov_b32 s62, s8 -; GCN-O0-NEXT: s_mov_b32 s63, s7 -; GCN-O0-NEXT: s_mov_b32 s64, s6 -; GCN-O0-NEXT: s_mov_b32 s65, s5 -; GCN-O0-NEXT: s_mov_b32 s66, s4 -; GCN-O0-NEXT: s_mov_b32 s67, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s39 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s40 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s41 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s42 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s43 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s44 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s45 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s46 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s47 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s48 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s49 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s50 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s51 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s52 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s53 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s54 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s55 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s56 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s57 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s58 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s59 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s60 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s61 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s62 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s63 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s64 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s65 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 -; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s37 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s38 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s39 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s40 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s41 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s42 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s43 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s44 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s45 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s46 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s47 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s48 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s49 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s50 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s51 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s52 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s53 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s54 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s55 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s56 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s57 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s58 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s59 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s60 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s61 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s62 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s63 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s64 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s65 -; GCN-O0-NEXT: v_mov_b32_e32 v31, s66 -; GCN-O0-NEXT: v_mov_b32_e32 v32, s67 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: 
v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <15 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -1421,181 +572,6 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[31:32] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double16_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s5, 0x40300000 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s6, 0 -; GCN-O0-NEXT: s_mov_b32 s7, 0x402e0000 -; GCN-O0-NEXT: s_mov_b32 s5, s7 -; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s8, 0 -; GCN-O0-NEXT: s_mov_b32 s9, 0x402c0000 -; GCN-O0-NEXT: s_mov_b32 s7, s9 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s10, 0 -; GCN-O0-NEXT: s_mov_b32 s11, 0x402a0000 -; GCN-O0-NEXT: s_mov_b32 s9, s11 -; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 killed $sgpr10_sgpr11 -; GCN-O0-NEXT: s_mov_b32 s12, 0 -; GCN-O0-NEXT: s_mov_b32 s13, 0x40280000 -; GCN-O0-NEXT: s_mov_b32 s11, s13 -; GCN-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GCN-O0-NEXT: s_mov_b32 s14, 0 -; GCN-O0-NEXT: s_mov_b32 s15, 0x40260000 -; GCN-O0-NEXT: s_mov_b32 s13, s15 -; GCN-O0-NEXT: ; kill: def $sgpr14 killed $sgpr14 killed $sgpr14_sgpr15 -; GCN-O0-NEXT: s_mov_b32 s16, 0 -; GCN-O0-NEXT: s_mov_b32 s17, 0x40240000 -; GCN-O0-NEXT: s_mov_b32 s15, s17 -; GCN-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 -; GCN-O0-NEXT: s_mov_b32 s18, 0 -; GCN-O0-NEXT: s_mov_b32 s19, 0x40220000 -; GCN-O0-NEXT: s_mov_b32 s17, s19 -; GCN-O0-NEXT: ; kill: def $sgpr18 killed $sgpr18 killed $sgpr18_sgpr19 -; GCN-O0-NEXT: s_mov_b32 s20, 0 -; GCN-O0-NEXT: s_mov_b32 s21, 0x40200000 -; GCN-O0-NEXT: s_mov_b32 s19, s21 -; GCN-O0-NEXT: ; kill: def $sgpr20 killed $sgpr20 killed $sgpr20_sgpr21 -; GCN-O0-NEXT: s_mov_b32 s22, 0 -; GCN-O0-NEXT: s_mov_b32 s23, 0x401c0000 -; GCN-O0-NEXT: s_mov_b32 s21, s23 -; GCN-O0-NEXT: ; kill: def $sgpr22 killed $sgpr22 killed $sgpr22_sgpr23 -; GCN-O0-NEXT: s_mov_b32 s24, 0 -; GCN-O0-NEXT: s_mov_b32 s25, 0x40180000 -; GCN-O0-NEXT: s_mov_b32 s23, s25 -; GCN-O0-NEXT: ; kill: def $sgpr24 killed $sgpr24 killed $sgpr24_sgpr25 -; GCN-O0-NEXT: s_mov_b32 s26, 0 -; GCN-O0-NEXT: s_mov_b32 s27, 0x40140000 -; GCN-O0-NEXT: s_mov_b32 s25, s27 -; GCN-O0-NEXT: ; kill: def $sgpr26 killed $sgpr26 killed $sgpr26_sgpr27 -; GCN-O0-NEXT: s_mov_b64 s[28:29], 4.0 -; GCN-O0-NEXT: s_mov_b32 s27, s29 -; GCN-O0-NEXT: ; kill: def $sgpr28 killed $sgpr28 killed $sgpr28_sgpr29 -; GCN-O0-NEXT: s_mov_b32 s30, 0 -; GCN-O0-NEXT: s_mov_b32 s31, 0x40080000 -; GCN-O0-NEXT: s_mov_b32 s29, s31 -; GCN-O0-NEXT: ; kill: def $sgpr30 killed $sgpr30 killed $sgpr30_sgpr31 -; GCN-O0-NEXT: s_mov_b64 s[34:35], 2.0 -; GCN-O0-NEXT: s_mov_b32 s31, s35 -; GCN-O0-NEXT: s_mov_b32 s33, s34 -; GCN-O0-NEXT: s_mov_b64 s[36:37], 1.0 -; GCN-O0-NEXT: s_mov_b32 s34, s37 -; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37 -; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def 
$sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GCN-O0-NEXT: s_mov_b32 s37, s34 -; GCN-O0-NEXT: s_mov_b32 s38, s33 -; GCN-O0-NEXT: s_mov_b32 s39, s31 -; GCN-O0-NEXT: s_mov_b32 s40, s30 -; GCN-O0-NEXT: s_mov_b32 s41, s29 -; GCN-O0-NEXT: s_mov_b32 s42, s28 -; GCN-O0-NEXT: s_mov_b32 s43, s27 -; GCN-O0-NEXT: s_mov_b32 s44, s26 -; GCN-O0-NEXT: s_mov_b32 s45, s25 -; GCN-O0-NEXT: s_mov_b32 s46, s24 -; GCN-O0-NEXT: s_mov_b32 s47, s23 -; GCN-O0-NEXT: s_mov_b32 s48, s22 -; GCN-O0-NEXT: s_mov_b32 s49, s21 -; GCN-O0-NEXT: s_mov_b32 s50, s20 -; GCN-O0-NEXT: s_mov_b32 s51, s19 -; GCN-O0-NEXT: s_mov_b32 s52, s18 -; GCN-O0-NEXT: s_mov_b32 s53, s17 -; GCN-O0-NEXT: s_mov_b32 s54, s16 -; GCN-O0-NEXT: s_mov_b32 s55, s15 -; GCN-O0-NEXT: s_mov_b32 s56, s14 -; GCN-O0-NEXT: s_mov_b32 s57, s13 -; GCN-O0-NEXT: s_mov_b32 s58, s12 -; GCN-O0-NEXT: s_mov_b32 s59, s11 -; GCN-O0-NEXT: s_mov_b32 s60, s10 -; GCN-O0-NEXT: s_mov_b32 s61, s9 -; GCN-O0-NEXT: s_mov_b32 s62, s8 -; GCN-O0-NEXT: s_mov_b32 s63, s7 -; GCN-O0-NEXT: s_mov_b32 s64, s6 -; GCN-O0-NEXT: s_mov_b32 s65, s5 -; GCN-O0-NEXT: s_mov_b32 s66, s4 -; GCN-O0-NEXT: s_mov_b32 s67, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s39 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s40 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s41 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s42 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s43 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s44 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s45 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s46 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s47 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s48 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s49 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s50 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s51 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s52 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s53 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s54 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s55 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s56 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s57 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s58 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s59 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s60 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s61 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s62 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s63 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s64 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s65 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 -; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s37 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s38 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s39 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s40 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s41 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s42 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s43 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s44 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s45 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s46 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s47 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s48 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s49 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s50 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s51 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s52 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s53 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s54 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s55 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s56 -; GCN-O0-NEXT: 
v_mov_b32_e32 v22, s57 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s58 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s59 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s60 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s61 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s62 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s63 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s64 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s65 -; GCN-O0-NEXT: v_mov_b32_e32 v31, s66 -; GCN-O0-NEXT: v_mov_b32_e32 v32, s67 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v1 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <16 x double> , i32 %sel store double %ext, ptr addrspace(1) %out @@ -1646,114 +622,6 @@ define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float32_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 0x42000000 -; GCN-O0-NEXT: s_mov_b32 s4, 0x41f80000 -; GCN-O0-NEXT: s_mov_b32 s5, 0x41f00000 -; GCN-O0-NEXT: s_mov_b32 s6, 0x41e80000 -; GCN-O0-NEXT: s_mov_b32 s7, 0x41e00000 -; GCN-O0-NEXT: s_mov_b32 s8, 0x41d80000 -; GCN-O0-NEXT: s_mov_b32 s9, 0x41d00000 -; GCN-O0-NEXT: s_mov_b32 s10, 0x41c80000 -; GCN-O0-NEXT: s_mov_b32 s11, 0x41c00000 -; GCN-O0-NEXT: s_mov_b32 s12, 0x41b80000 -; GCN-O0-NEXT: s_mov_b32 s13, 0x41b00000 -; GCN-O0-NEXT: s_mov_b32 s14, 0x41a80000 -; GCN-O0-NEXT: s_mov_b32 s15, 0x41a00000 -; GCN-O0-NEXT: s_mov_b32 s16, 0x41980000 -; GCN-O0-NEXT: s_mov_b32 s17, 0x41900000 -; GCN-O0-NEXT: s_mov_b32 s18, 0x41880000 -; GCN-O0-NEXT: s_mov_b32 s19, 0x41800000 -; GCN-O0-NEXT: s_mov_b32 s20, 0x41700000 -; GCN-O0-NEXT: s_mov_b32 s21, 0x41600000 -; GCN-O0-NEXT: s_mov_b32 s22, 0x41500000 -; GCN-O0-NEXT: s_mov_b32 s23, 0x41400000 -; GCN-O0-NEXT: s_mov_b32 s24, 0x41300000 -; GCN-O0-NEXT: s_mov_b32 s25, 0x41200000 -; GCN-O0-NEXT: s_mov_b32 s26, 0x41100000 -; GCN-O0-NEXT: s_mov_b32 s27, 0x41000000 -; GCN-O0-NEXT: s_mov_b32 s28, 0x40e00000 -; GCN-O0-NEXT: s_mov_b32 s29, 0x40c00000 -; GCN-O0-NEXT: s_mov_b32 s30, 0x40a00000 -; GCN-O0-NEXT: s_mov_b32 s31, 4.0 -; GCN-O0-NEXT: s_mov_b32 s33, 0x40400000 -; GCN-O0-NEXT: s_mov_b32 s34, 2.0 -; GCN-O0-NEXT: s_mov_b32 s35, 1.0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s35 -; GCN-O0-NEXT: v_mov_b32_e32 v62, s34 -; GCN-O0-NEXT: v_mov_b32_e32 v61, s33 -; GCN-O0-NEXT: v_mov_b32_e32 v60, s31 -; GCN-O0-NEXT: v_mov_b32_e32 v59, s30 -; GCN-O0-NEXT: v_mov_b32_e32 v58, s29 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s28 -; GCN-O0-NEXT: v_mov_b32_e32 v56, s27 -; GCN-O0-NEXT: v_mov_b32_e32 v55, s26 -; GCN-O0-NEXT: v_mov_b32_e32 v54, s25 -; GCN-O0-NEXT: v_mov_b32_e32 v53, s24 -; GCN-O0-NEXT: v_mov_b32_e32 v52, s23 -; GCN-O0-NEXT: v_mov_b32_e32 v51, s22 -; GCN-O0-NEXT: v_mov_b32_e32 v50, s21 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s20 -; GCN-O0-NEXT: v_mov_b32_e32 v48, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v47, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v46, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v45, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v44, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v43, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v42, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v41, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v40, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v39, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v38, s9 -; GCN-O0-NEXT: 
v_mov_b32_e32 v37, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v36, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v35, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v34, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v33, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v32, s3 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v62 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v61 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v60 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v59 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v58 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v57 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v56 -; GCN-O0-NEXT: v_mov_b32_e32 v8, v55 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v54 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v53 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v52 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v51 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v50 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v49 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v48 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v47 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v46 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v42 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v41 -; GCN-O0-NEXT: v_mov_b32_e32 v23, v40 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v39 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v38 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v37 -; GCN-O0-NEXT: v_mov_b32_e32 v27, v36 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v35 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v31, v32 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movrels_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <32 x float> , i32 %sel store float %ext, ptr addrspace(1) %out @@ -1775,25 +643,6 @@ define amdgpu_kernel void @byte8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: byte8_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s1, 3 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s4, s0, s1 -; GCN-O0-NEXT: s_mov_b32 s5, 0x8070605 -; GCN-O0-NEXT: s_mov_b32 s0, 0x4030201 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <8 x i8> , i32 %sel store i8 %ext, ptr addrspace(1) %out @@ -1841,61 +690,6 @@ define amdgpu_kernel void @byte16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: byte16_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s14, -1 -; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s11 -; GCN-O0-NEXT: s_addc_u32 
s13, s13, 0 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 15 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_and_b32 s3, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_or_b32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 16 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:15 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 15 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:14 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 14 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:13 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 13 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 12 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:11 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 11 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 10 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 9 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 8 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:7 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 7 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 6 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 5 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 3 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: buffer_load_ubyte v2, v0, s[12:15], 0 offen -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <16 x i8> , i32 %sel store i8 %ext, ptr addrspace(1) %out @@ -1916,23 +710,6 @@ define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: bit4_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s1, 3 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s1, s0, s1 -; GCN-O0-NEXT: s_mov_b32 s0, 0x1000100 -; GCN-O0-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-O0-NEXT: s_mov_b32 s1, 1 -; GCN-O0-NEXT: s_and_b32 s0, s0, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <4 x i1> , i32 %sel %zext = zext i1 %ext to i32 @@ -2208,161 +985,6 @@ define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: bit128_extelt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s14, -1 -; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 -; 
GCN-O0-NEXT: s_add_u32 s12, s12, s11 -; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: s_mov_b32 s3, 0x7f -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_and_b32 s3, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_add_i32 s2, s2, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:127 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:126 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:125 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:124 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:123 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:122 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:121 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:120 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:119 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:118 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:117 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:116 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:115 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:114 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:113 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:112 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:111 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:110 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:109 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:108 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:107 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:106 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:105 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:104 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:103 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:102 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:101 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:100 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:99 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:98 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:97 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:96 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:95 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:94 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:93 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:92 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:91 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:90 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:89 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:88 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:87 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:86 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:85 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:84 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:83 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:82 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:81 -; GCN-O0-NEXT: buffer_store_byte v0, off, 
s[12:15], 0 offset:80 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:79 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:78 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:77 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:76 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:75 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:74 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:73 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:72 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:71 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:70 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:69 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:68 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:67 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:66 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:65 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:64 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:63 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:62 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:61 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:60 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:59 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:58 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:57 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:56 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:55 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:54 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:53 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:52 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:51 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:50 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:49 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:48 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:47 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:46 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:45 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:44 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:43 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:42 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:41 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:40 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:39 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:38 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:37 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:36 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:35 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:34 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:33 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:32 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:31 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:30 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:29 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:28 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:27 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:26 -; GCN-O0-NEXT: 
buffer_store_byte v1, off, s[12:15], 0 offset:25 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:24 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:23 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:22 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:21 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:20 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:19 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:18 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:17 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:16 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:15 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:14 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:13 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:11 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:10 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:9 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:7 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:6 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:5 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: buffer_load_ubyte v0, v0, s[12:15], 0 offen -; GCN-O0-NEXT: s_mov_b32 s2, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v2, v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %ext = extractelement <128 x i1> , i32 %sel %zext = zext i1 %ext to i32 @@ -2466,253 +1088,6 @@ define float @float32_extelt_vec(i32 %sel) { ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 31, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: float32_extelt_vec: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; 
GCN-O0-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, 0x42000000 -; GCN-O0-NEXT: s_mov_b32 s5, 0x41f80000 -; GCN-O0-NEXT: s_mov_b32 s6, 0x41f00000 -; GCN-O0-NEXT: s_mov_b32 s7, 0x41e80000 -; GCN-O0-NEXT: s_mov_b32 s8, 0x41e00000 -; GCN-O0-NEXT: s_mov_b32 s9, 0x41d80000 -; GCN-O0-NEXT: s_mov_b32 s10, 0x41d00000 -; GCN-O0-NEXT: s_mov_b32 s11, 0x41c80000 -; GCN-O0-NEXT: s_mov_b32 s12, 0x41c00000 -; GCN-O0-NEXT: s_mov_b32 s13, 0x41b80000 -; GCN-O0-NEXT: s_mov_b32 s14, 0x41b00000 -; GCN-O0-NEXT: s_mov_b32 s15, 0x41a80000 -; GCN-O0-NEXT: s_mov_b32 s16, 0x41a00000 -; GCN-O0-NEXT: s_mov_b32 s17, 0x41980000 -; GCN-O0-NEXT: s_mov_b32 s18, 0x41900000 -; GCN-O0-NEXT: s_mov_b32 s19, 0x41880000 -; GCN-O0-NEXT: s_mov_b32 s20, 0x41800000 -; GCN-O0-NEXT: s_mov_b32 s21, 0x41700000 -; GCN-O0-NEXT: s_mov_b32 s22, 0x41600000 -; GCN-O0-NEXT: s_mov_b32 s23, 0x41500000 -; GCN-O0-NEXT: s_mov_b32 s24, 0x41400000 -; GCN-O0-NEXT: s_mov_b32 s25, 0x41300000 -; GCN-O0-NEXT: s_mov_b32 s26, 0x41200000 -; GCN-O0-NEXT: s_mov_b32 s27, 0x41100000 -; GCN-O0-NEXT: s_mov_b32 s28, 0x41000000 -; GCN-O0-NEXT: s_mov_b32 s29, 0x40e00000 -; GCN-O0-NEXT: s_mov_b32 s40, 0x40c00000 -; GCN-O0-NEXT: s_mov_b32 s41, 0x40a00000 -; GCN-O0-NEXT: s_mov_b32 s42, 4.0 -; GCN-O0-NEXT: s_mov_b32 s43, 0x40400000 -; GCN-O0-NEXT: s_mov_b32 s44, 2.0 -; GCN-O0-NEXT: s_mov_b32 s45, 1.0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s45 -; GCN-O0-NEXT: v_mov_b32_e32 v62, s44 -; GCN-O0-NEXT: v_mov_b32_e32 v61, s43 -; GCN-O0-NEXT: v_mov_b32_e32 v60, s42 -; GCN-O0-NEXT: v_mov_b32_e32 v59, s41 -; GCN-O0-NEXT: v_mov_b32_e32 v58, s40 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s29 -; GCN-O0-NEXT: v_mov_b32_e32 v56, s28 -; GCN-O0-NEXT: v_mov_b32_e32 v47, s27 -; GCN-O0-NEXT: v_mov_b32_e32 v46, s26 -; GCN-O0-NEXT: v_mov_b32_e32 v45, s25 -; GCN-O0-NEXT: v_mov_b32_e32 v44, s24 -; GCN-O0-NEXT: v_mov_b32_e32 v43, s23 -; GCN-O0-NEXT: v_mov_b32_e32 v42, s22 -; GCN-O0-NEXT: v_mov_b32_e32 v41, s21 -; GCN-O0-NEXT: v_mov_b32_e32 v40, s20 -; GCN-O0-NEXT: v_mov_b32_e32 v55, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v54, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v53, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v52, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v51, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v50, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v48, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v39, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v38, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v37, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v36, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v35, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v34, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v33, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v32, s4 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v62 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v61 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v60 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v59 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v58 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v57 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v56 -; GCN-O0-NEXT: 
v_mov_b32_e32 v8, v47 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v46 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v42 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v41 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v40 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v55 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v54 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v53 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v52 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v51 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v50 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v49 -; GCN-O0-NEXT: v_mov_b32_e32 v23, v48 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v39 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v38 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v37 -; GCN-O0-NEXT: v_mov_b32_e32 v27, v36 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v35 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v31, v32 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v29, off, 
s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr63 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v63, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v63, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GCN-O0-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[46:47] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB20_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GCN-O0-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[46:47] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v63, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v63, 3 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-O0-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v32 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v32 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v63, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v63, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GCN-O0-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[46:47] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB20_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GCN-O0-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[46:47] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v63, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v63, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <32 x float> , i32 %sel ret float %ext @@ -2788,1692 +1163,7 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 
v1, 0x40301999 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: double16_extelt_vec: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v34, s36, 0 -; GCN-O0-NEXT: v_writelane_b32 v34, s37, 1 -; GCN-O0-NEXT: v_writelane_b32 v34, s38, 2 -; GCN-O0-NEXT: v_writelane_b32 v34, s39, 3 -; GCN-O0-NEXT: v_writelane_b32 v34, s48, 4 -; GCN-O0-NEXT: v_writelane_b32 v34, s49, 5 -; GCN-O0-NEXT: v_writelane_b32 v34, s50, 6 -; GCN-O0-NEXT: v_writelane_b32 v34, s51, 7 -; GCN-O0-NEXT: v_writelane_b32 v34, s52, 8 -; GCN-O0-NEXT: v_writelane_b32 v34, s53, 9 -; GCN-O0-NEXT: v_writelane_b32 v34, s54, 10 -; GCN-O0-NEXT: v_writelane_b32 v34, s55, 11 -; GCN-O0-NEXT: v_writelane_b32 v34, s64, 12 -; GCN-O0-NEXT: v_writelane_b32 v34, s65, 13 -; GCN-O0-NEXT: v_writelane_b32 v34, s66, 14 -; GCN-O0-NEXT: v_writelane_b32 v34, s67, 15 -; GCN-O0-NEXT: s_mov_b32 s4, 0x40301999 -; GCN-O0-NEXT: s_mov_b32 s40, 0x9999999a -; GCN-O0-NEXT: s_mov_b32 s6, s40 -; GCN-O0-NEXT: s_mov_b32 s7, s4 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s6, 0x402e3333 -; GCN-O0-NEXT: s_mov_b32 s22, 0x33333333 -; GCN-O0-NEXT: ; implicit-def: $vgpr35 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v35, s22, 0 -; GCN-O0-NEXT: s_mov_b32 s8, s22 -; GCN-O0-NEXT: s_mov_b32 s9, s6 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s8, 0x402c3333 -; GCN-O0-NEXT: s_mov_b32 s10, s22 -; GCN-O0-NEXT: s_mov_b32 s11, s8 -; GCN-O0-NEXT: s_mov_b32 s8, s11 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: s_mov_b32 s10, 0x402a3333 -; GCN-O0-NEXT: s_mov_b32 s12, s22 -; GCN-O0-NEXT: s_mov_b32 s13, s10 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s12, 0x40283333 -; GCN-O0-NEXT: s_mov_b32 s14, s22 -; GCN-O0-NEXT: s_mov_b32 s15, s12 -; GCN-O0-NEXT: s_mov_b32 s12, s15 -; GCN-O0-NEXT: s_mov_b32 s13, s14 -; GCN-O0-NEXT: s_mov_b32 s14, 0x40263333 -; GCN-O0-NEXT: s_mov_b32 s16, s22 -; GCN-O0-NEXT: s_mov_b32 s17, s14 -; GCN-O0-NEXT: s_mov_b32 s14, s17 -; GCN-O0-NEXT: s_mov_b32 s15, s16 -; GCN-O0-NEXT: s_mov_b32 s16, 0x40243333 -; GCN-O0-NEXT: s_mov_b32 s18, s22 -; GCN-O0-NEXT: s_mov_b32 s19, s16 -; GCN-O0-NEXT: s_mov_b32 s16, s19 -; GCN-O0-NEXT: s_mov_b32 s17, s18 -; GCN-O0-NEXT: s_mov_b32 s18, 0x40223333 -; GCN-O0-NEXT: s_mov_b32 s20, s22 -; GCN-O0-NEXT: s_mov_b32 s21, s18 -; GCN-O0-NEXT: s_mov_b32 s18, s21 -; GCN-O0-NEXT: s_mov_b32 s19, s20 -; GCN-O0-NEXT: s_mov_b32 s20, 0x40203333 -; GCN-O0-NEXT: ; kill: def $sgpr22 killed $sgpr22 def $sgpr22_sgpr23 -; GCN-O0-NEXT: s_mov_b32 s23, s20 -; GCN-O0-NEXT: s_mov_b32 s20, s23 -; GCN-O0-NEXT: s_mov_b32 s21, s22 -; GCN-O0-NEXT: s_mov_b32 s22, 0x401c6666 -; GCN-O0-NEXT: s_mov_b32 s42, 0x66666666 -; GCN-O0-NEXT: s_mov_b32 s24, s42 -; GCN-O0-NEXT: s_mov_b32 s25, s22 -; GCN-O0-NEXT: s_mov_b32 s22, s25 -; GCN-O0-NEXT: s_mov_b32 s23, s24 -; GCN-O0-NEXT: s_mov_b32 s24, 0x40186666 -; GCN-O0-NEXT: s_mov_b32 s26, s42 -; GCN-O0-NEXT: s_mov_b32 s27, s24 -; GCN-O0-NEXT: s_mov_b32 s24, s27 -; GCN-O0-NEXT: s_mov_b32 s25, s26 -; GCN-O0-NEXT: s_mov_b32 s26, 0x40146666 -; GCN-O0-NEXT: s_mov_b32 s28, s42 -; GCN-O0-NEXT: s_mov_b32 
s29, s26 -; GCN-O0-NEXT: s_mov_b32 s26, s29 -; GCN-O0-NEXT: s_mov_b32 s27, s28 -; GCN-O0-NEXT: s_mov_b32 s28, 0x40106666 -; GCN-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 -; GCN-O0-NEXT: s_mov_b32 s43, s28 -; GCN-O0-NEXT: s_mov_b32 s28, s43 -; GCN-O0-NEXT: s_mov_b32 s29, s42 -; GCN-O0-NEXT: s_mov_b32 s41, 0x4008cccc -; GCN-O0-NEXT: s_mov_b32 s42, 0xcccccccd -; GCN-O0-NEXT: s_mov_b32 s44, s42 -; GCN-O0-NEXT: s_mov_b32 s45, s41 -; GCN-O0-NEXT: s_mov_b32 s72, s45 -; GCN-O0-NEXT: s_mov_b32 s73, s44 -; GCN-O0-NEXT: s_mov_b32 s41, 0x4000cccc -; GCN-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 -; GCN-O0-NEXT: s_mov_b32 s43, s41 -; GCN-O0-NEXT: s_mov_b32 s74, s43 -; GCN-O0-NEXT: s_mov_b32 s75, s42 -; GCN-O0-NEXT: s_mov_b32 s42, 0x3ff19999 -; GCN-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GCN-O0-NEXT: s_mov_b32 s41, s42 -; GCN-O0-NEXT: s_mov_b32 s76, s41 -; GCN-O0-NEXT: s_mov_b32 s36, s40 -; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GCN-O0-NEXT: s_mov_b32 s37, s76 -; GCN-O0-NEXT: s_mov_b32 s38, s75 -; GCN-O0-NEXT: s_mov_b32 s39, s74 -; GCN-O0-NEXT: s_mov_b32 s40, s73 -; GCN-O0-NEXT: s_mov_b32 s41, s72 -; GCN-O0-NEXT: s_mov_b32 s42, s29 -; GCN-O0-NEXT: s_mov_b32 s43, s28 -; GCN-O0-NEXT: s_mov_b32 s44, s27 -; GCN-O0-NEXT: s_mov_b32 s45, s26 -; GCN-O0-NEXT: s_mov_b32 s46, s25 -; GCN-O0-NEXT: s_mov_b32 s47, s24 -; GCN-O0-NEXT: s_mov_b32 s48, s23 -; GCN-O0-NEXT: s_mov_b32 s49, s22 -; GCN-O0-NEXT: s_mov_b32 s50, s21 -; GCN-O0-NEXT: s_mov_b32 s51, s20 -; GCN-O0-NEXT: s_mov_b32 s52, s19 -; GCN-O0-NEXT: s_mov_b32 s53, s18 -; GCN-O0-NEXT: s_mov_b32 s54, s17 -; GCN-O0-NEXT: s_mov_b32 s55, s16 -; GCN-O0-NEXT: s_mov_b32 s56, s15 -; GCN-O0-NEXT: s_mov_b32 s57, s14 -; GCN-O0-NEXT: s_mov_b32 s58, s13 -; GCN-O0-NEXT: s_mov_b32 s59, s12 -; GCN-O0-NEXT: s_mov_b32 s60, s11 -; GCN-O0-NEXT: s_mov_b32 s61, s10 -; GCN-O0-NEXT: s_mov_b32 s62, s9 -; GCN-O0-NEXT: s_mov_b32 s63, s8 -; GCN-O0-NEXT: s_mov_b32 s64, s7 -; GCN-O0-NEXT: s_mov_b32 s65, s6 -; GCN-O0-NEXT: s_mov_b32 s66, s5 -; GCN-O0-NEXT: s_mov_b32 s67, s4 -; GCN-O0-NEXT: v_writelane_b32 v35, s36, 1 -; GCN-O0-NEXT: v_writelane_b32 v35, s37, 2 -; GCN-O0-NEXT: v_writelane_b32 v35, s38, 3 -; GCN-O0-NEXT: v_writelane_b32 v35, s39, 4 -; GCN-O0-NEXT: v_writelane_b32 v35, s40, 5 -; GCN-O0-NEXT: v_writelane_b32 v35, s41, 6 -; GCN-O0-NEXT: v_writelane_b32 v35, s42, 7 -; GCN-O0-NEXT: v_writelane_b32 v35, s43, 8 -; GCN-O0-NEXT: v_writelane_b32 v35, s44, 9 -; GCN-O0-NEXT: v_writelane_b32 v35, s45, 10 -; GCN-O0-NEXT: v_writelane_b32 v35, s46, 11 -; GCN-O0-NEXT: v_writelane_b32 v35, s47, 12 -; GCN-O0-NEXT: v_writelane_b32 v35, s48, 13 -; GCN-O0-NEXT: v_writelane_b32 v35, s49, 14 -; GCN-O0-NEXT: v_writelane_b32 v35, s50, 15 -; GCN-O0-NEXT: v_writelane_b32 v35, s51, 16 -; GCN-O0-NEXT: v_writelane_b32 v35, s52, 17 -; GCN-O0-NEXT: v_writelane_b32 v35, s53, 18 -; GCN-O0-NEXT: v_writelane_b32 v35, s54, 19 -; GCN-O0-NEXT: v_writelane_b32 v35, s55, 20 -; GCN-O0-NEXT: v_writelane_b32 v35, s56, 21 -; GCN-O0-NEXT: v_writelane_b32 v35, s57, 22 -; GCN-O0-NEXT: v_writelane_b32 v35, s58, 23 -; GCN-O0-NEXT: v_writelane_b32 v35, s59, 24 -; GCN-O0-NEXT: v_writelane_b32 v35, s60, 25 -; GCN-O0-NEXT: v_writelane_b32 v35, s61, 26 -; GCN-O0-NEXT: v_writelane_b32 v35, s62, 27 -; GCN-O0-NEXT: v_writelane_b32 
v35, s63, 28 -; GCN-O0-NEXT: v_writelane_b32 v35, s64, 29 -; GCN-O0-NEXT: v_writelane_b32 v35, s65, 30 -; GCN-O0-NEXT: v_writelane_b32 v35, s66, 31 -; GCN-O0-NEXT: v_writelane_b32 v35, s67, 32 -; GCN-O0-NEXT: s_mov_b32 s4, 1 -; GCN-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s39 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s40 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s41 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s42 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s43 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s44 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s45 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s46 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s47 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s48 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s49 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s50 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s51 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s52 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s53 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s54 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s55 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s56 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s57 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s58 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s59 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s60 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s61 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s62 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s63 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s64 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s65 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 -; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; 
GCN-O0-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v35, s4, 33 -; GCN-O0-NEXT: v_writelane_b32 v35, s5, 34 -; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 -; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 -; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v35, 35 -; GCN-O0-NEXT: v_readlane_b32 s5, v35, 36 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 
4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v32 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v32 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v35, s6, 35 -; GCN-O0-NEXT: v_writelane_b32 v35, s7, 36 -; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 -; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB21_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 -; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v35, 33 -; GCN-O0-NEXT: v_readlane_b32 s5, v35, 34 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 -; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s36, v35, 1 -; GCN-O0-NEXT: v_readlane_b32 s37, v35, 2 -; GCN-O0-NEXT: v_readlane_b32 s38, v35, 3 -; GCN-O0-NEXT: v_readlane_b32 s39, v35, 4 -; GCN-O0-NEXT: v_readlane_b32 s40, v35, 5 -; GCN-O0-NEXT: v_readlane_b32 s41, v35, 6 -; GCN-O0-NEXT: v_readlane_b32 s42, v35, 7 -; GCN-O0-NEXT: v_readlane_b32 s43, v35, 8 -; GCN-O0-NEXT: v_readlane_b32 s44, v35, 9 -; GCN-O0-NEXT: v_readlane_b32 s45, v35, 10 -; GCN-O0-NEXT: v_readlane_b32 s46, v35, 11 -; GCN-O0-NEXT: v_readlane_b32 s47, v35, 12 -; GCN-O0-NEXT: v_readlane_b32 s48, v35, 13 -; GCN-O0-NEXT: v_readlane_b32 s49, v35, 14 -; GCN-O0-NEXT: v_readlane_b32 s50, v35, 15 -; GCN-O0-NEXT: v_readlane_b32 s51, v35, 16 -; GCN-O0-NEXT: v_readlane_b32 s52, v35, 17 -; GCN-O0-NEXT: v_readlane_b32 s53, v35, 18 -; GCN-O0-NEXT: v_readlane_b32 s54, v35, 19 -; GCN-O0-NEXT: v_readlane_b32 s55, v35, 20 -; GCN-O0-NEXT: v_readlane_b32 s56, v35, 21 -; GCN-O0-NEXT: v_readlane_b32 s57, v35, 22 -; GCN-O0-NEXT: v_readlane_b32 s58, v35, 23 -; GCN-O0-NEXT: v_readlane_b32 s59, v35, 24 -; GCN-O0-NEXT: 
v_readlane_b32 s60, v35, 25 -; GCN-O0-NEXT: v_readlane_b32 s61, v35, 26 -; GCN-O0-NEXT: v_readlane_b32 s62, v35, 27 -; GCN-O0-NEXT: v_readlane_b32 s63, v35, 28 -; GCN-O0-NEXT: v_readlane_b32 s64, v35, 29 -; GCN-O0-NEXT: v_readlane_b32 s65, v35, 30 -; GCN-O0-NEXT: v_readlane_b32 s66, v35, 31 -; GCN-O0-NEXT: v_readlane_b32 s67, v35, 32 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s37 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s38 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s39 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s40 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s41 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s42 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s43 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s44 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s45 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s46 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s47 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s48 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s49 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s50 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s51 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s52 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s53 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s54 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s55 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s56 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s57 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s58 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s59 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s60 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s61 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s62 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s63 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s64 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s65 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s66 -; GCN-O0-NEXT: v_mov_b32_e32 v31, s67 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; 
GCN-O0-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v35, s4, 37 -; GCN-O0-NEXT: v_writelane_b32 v35, s5, 38 -; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 -; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB21_4: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 -; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v35, 39 -; GCN-O0-NEXT: v_readlane_b32 s5, v35, 40 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v20, off, 
s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v32 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v32 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v35, s6, 39 -; GCN-O0-NEXT: v_writelane_b32 v35, s7, 40 -; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 -; GCN-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-O0-NEXT: ; %bb.5: -; GCN-O0-NEXT: s_or_saveexec_b64 s[78:79], -1 -; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[78:79] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v35, 37 -; GCN-O0-NEXT: v_readlane_b32 s5, v35, 38 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.6: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 -; GCN-O0-NEXT: s_mov_b32 s4, 32 -; GCN-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec -; GCN-O0-NEXT: v_readlane_b32 s67, v34, 15 -; GCN-O0-NEXT: v_readlane_b32 s66, v34, 14 -; GCN-O0-NEXT: v_readlane_b32 s65, v34, 13 -; GCN-O0-NEXT: v_readlane_b32 s64, v34, 12 -; GCN-O0-NEXT: v_readlane_b32 s55, v34, 11 -; GCN-O0-NEXT: v_readlane_b32 s54, v34, 10 -; GCN-O0-NEXT: v_readlane_b32 s53, v34, 9 -; GCN-O0-NEXT: v_readlane_b32 s52, v34, 8 -; GCN-O0-NEXT: v_readlane_b32 s51, v34, 7 -; GCN-O0-NEXT: v_readlane_b32 s50, v34, 6 -; GCN-O0-NEXT: v_readlane_b32 s49, v34, 5 -; GCN-O0-NEXT: v_readlane_b32 s48, v34, 4 -; GCN-O0-NEXT: v_readlane_b32 s39, v34, 3 -; GCN-O0-NEXT: v_readlane_b32 s38, v34, 2 -; GCN-O0-NEXT: v_readlane_b32 s37, v34, 1 -; GCN-O0-NEXT: v_readlane_b32 s36, v34, 0 -; GCN-O0-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <16 x double> , i32 %sel ret double %ext } - -define i32 @extract_dyn_i32_3(<3 x i32> inreg %arg, i32 %idx) { -; GCN-LABEL: extract_dyn_i32_3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_i32_3: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB22_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3 -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v3 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v3 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 
s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB22_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <3 x i32> %arg, i32 %idx - ret i32 %x -} - -define i32 @extract_dyn_inreg_i32_3(<3 x i32> inreg %arg, i32 inreg %idx) { -; GCN-LABEL: extract_dyn_inreg_i32_3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s19, 1 -; GCN-NEXT: s_cselect_b32 s4, s17, s16 -; GCN-NEXT: s_cmp_eq_u32 s19, 2 -; GCN-NEXT: s_cselect_b32 s4, s18, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_inreg_i32_3: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: s_mov_b32 m0, s19 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <3 x i32> %arg, i32 %idx - ret i32 %x -} - -define float @extract_dyn_float_3(<3 x float> inreg %arg, i32 %idx) { -; GCN-LABEL: extract_dyn_float_3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_float_3: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB24_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3 -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v3 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v3 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB24_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <3 x float> %arg, i32 %idx - ret float %x -} - -define float @extract_dyn_inreg_float_3(<3 x float> inreg %arg, i32 inreg %idx) { -; GCN-LABEL: extract_dyn_inreg_float_3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s19, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s19, 2 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_inreg_float_3: -; GCN-O0: ; %bb.0: 
-; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: s_mov_b32 m0, s19 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <3 x float> %arg, i32 %idx - ret float %x -} - -define i32 @extract_dyn_i32_5(<5 x i32> inreg %arg, i32 %idx) { -; GCN-LABEL: extract_dyn_i32_5: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_i32_5: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB26_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v7, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v7, 3 -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; 
GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v5 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v5 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB26_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v7, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v7, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <5 x i32> %arg, i32 %idx - ret i32 %x -} - -define i32 @extract_dyn_inreg_i32_5(<5 x i32> inreg %arg, i32 inreg %idx) { -; GCN-LABEL: extract_dyn_inreg_i32_5: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s21, 1 -; GCN-NEXT: s_cselect_b32 s4, s17, s16 -; GCN-NEXT: s_cmp_eq_u32 s21, 2 -; GCN-NEXT: s_cselect_b32 s4, s18, s4 -; GCN-NEXT: s_cmp_eq_u32 s21, 3 -; GCN-NEXT: s_cselect_b32 s4, s19, s4 -; GCN-NEXT: s_cmp_eq_u32 s21, 4 -; GCN-NEXT: s_cselect_b32 s4, s20, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_inreg_i32_5: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: s_mov_b32 m0, s21 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <5 x i32> %arg, i32 %idx - ret i32 %x -} - -define float @extract_dyn_float_5(<5 x float> inreg %arg, i32 %idx) { -; 
GCN-LABEL: extract_dyn_float_5: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_float_5: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v7, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v7, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v7, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v7, 3 -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v5 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v5 -; 
GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB28_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v7, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v7, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <5 x float> %arg, i32 %idx - ret float %x -} - -define float @extract_dyn_inreg_float_5(<5 x float> inreg %arg, i32 inreg %idx) { -; GCN-LABEL: extract_dyn_inreg_float_5: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s21, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 2 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_inreg_float_5: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: s_mov_b32 m0, s21 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <5 x float> %arg, i32 %idx - ret float %x -} - -define i32 @extract_dyn_i32_6(<6 x i32> inreg %arg, i32 %idx) { -; GCN-LABEL: extract_dyn_i32_6: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: 
v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_i32_6: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v8, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v8, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB30_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v8, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v8, 3 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; 
GCN-O0-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v8, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v8, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB30_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v8, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v8, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] -entry: - %x = extractelement <6 x i32> %arg, i32 %idx - ret i32 %x -} - -define i32 @extract_dyn_inreg_i32_6(<6 x i32> inreg %arg, i32 inreg %idx) { -; GCN-LABEL: extract_dyn_inreg_i32_6: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s22, 1 -; GCN-NEXT: s_cselect_b32 s4, s17, s16 -; GCN-NEXT: s_cmp_eq_u32 s22, 2 -; GCN-NEXT: s_cselect_b32 s4, s18, s4 -; GCN-NEXT: s_cmp_eq_u32 s22, 3 -; GCN-NEXT: s_cselect_b32 s4, s19, s4 -; GCN-NEXT: s_cmp_eq_u32 s22, 4 -; GCN-NEXT: s_cselect_b32 s4, s20, s4 -; GCN-NEXT: s_cmp_eq_u32 s22, 5 -; GCN-NEXT: s_cselect_b32 s4, s21, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_inreg_i32_6: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: s_mov_b32 m0, s22 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] -entry: - %x = extractelement <6 x i32> %arg, i32 %idx - ret i32 %x -} - -define float @extract_dyn_float_6(<6 x float> inreg %arg, i32 %idx) { -; GCN-LABEL: extract_dyn_float_6: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_float_6: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v8, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v8, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB32_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v8, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v8, 3 -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v6 -; 
GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v8, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v8, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB32_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v8, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v8, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] -entry: - %x = extractelement <6 x float> %arg, i32 %idx - ret float %x -} - -define float @extract_dyn_inreg_float_6(<6 x float> inreg %arg, i32 inreg %idx) { -; GCN-LABEL: extract_dyn_inreg_float_6: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s22, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 2 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 5 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_inreg_float_6: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: s_mov_b32 m0, s22 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] -entry: - %x = extractelement <6 x float> %arg, i32 %idx - ret float %x -} - -define i32 @extract_dyn_i32_7(<7 x i32> inreg %arg, i32 
%idx) { -; GCN-LABEL: extract_dyn_i32_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_i32_7: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: s_mov_b32 s10, s22 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v9, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB34_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3 -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB34_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <7 x i32> %arg, i32 %idx - ret i32 %x -} - -define i32 @extract_dyn_inreg_i32_7(<7 x i32> inreg %arg, i32 inreg %idx) { -; GCN-LABEL: extract_dyn_inreg_i32_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s23, 1 -; GCN-NEXT: s_cselect_b32 s4, s17, s16 -; GCN-NEXT: s_cmp_eq_u32 s23, 2 -; GCN-NEXT: s_cselect_b32 s4, s18, s4 -; GCN-NEXT: s_cmp_eq_u32 s23, 3 -; GCN-NEXT: s_cselect_b32 s4, s19, s4 -; GCN-NEXT: s_cmp_eq_u32 s23, 4 -; GCN-NEXT: s_cselect_b32 s4, s20, s4 -; GCN-NEXT: s_cmp_eq_u32 s23, 5 -; GCN-NEXT: s_cselect_b32 s4, s21, s4 -; GCN-NEXT: s_cmp_eq_u32 s23, 6 -; GCN-NEXT: s_cselect_b32 s4, s22, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_inreg_i32_7: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: s_mov_b32 s10, s22 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: 
v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: s_mov_b32 m0, s23 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <7 x i32> %arg, i32 %idx - ret i32 %x -} - -define float @extract_dyn_float_7(<7 x float> inreg %arg, i32 %idx) { -; GCN-LABEL: extract_dyn_float_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_float_7: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: s_mov_b32 s10, s22 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v9, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB36_1: ; 
=>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3 -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB36_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <7 x float> %arg, i32 %idx - ret float %x -} - -define float @extract_dyn_inreg_float_7(<7 x float> inreg %arg, i32 inreg %idx) { -; GCN-LABEL: extract_dyn_inreg_float_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s23, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 2 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 5 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, 
vcc -; GCN-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 6 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: extract_dyn_inreg_float_7: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: s_mov_b32 s10, s22 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: s_mov_b32 m0, s23 -; GCN-O0-NEXT: v_movrels_b32_e32 v0, v0 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = extractelement <7 x float> %arg, i32 %idx - ret float %x -} diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index beeeaa32cacfd..e1b4cad370f96 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s -; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN-O0 %s define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) { ; GCN-LABEL: float4_inselt: @@ -29,25 +28,6 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float4_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s7 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel store <4 x float> %v, ptr addrspace(1) %out @@ -67,24 +47,6 @@ define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float4_inselt_undef: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v3, 
s5 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s7 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x float> poison, float 1.000000e+00, i32 %sel store <4 x float> %v, ptr addrspace(1) %out @@ -114,25 +76,6 @@ define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i3 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: int4_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s7 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x i32> %vec, i32 1, i32 %sel store <4 x i32> %v, ptr addrspace(1) %out @@ -157,23 +100,6 @@ define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float2_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x34 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel store <2 x float> %v, ptr addrspace(1) %out @@ -207,57 +133,6 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float8_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-O0-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x64 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v14, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s4 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec 
-; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel store <8 x float> %v, ptr addrspace(1) %out @@ -311,105 +186,6 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float16_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-O0-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0xa4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v22, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s4 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v19 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, 
v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel store <16 x float> %v, ptr addrspace(1) %out @@ -504,267 +280,6 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: float32_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s2, s51 -; GCN-O0-NEXT: s_mov_b32 s3, s50 -; GCN-O0-NEXT: s_mov_b32 s6, s49 -; GCN-O0-NEXT: s_mov_b32 s7, s48 -; GCN-O0-NEXT: s_mov_b32 s8, s47 -; GCN-O0-NEXT: s_mov_b32 s9, s46 -; GCN-O0-NEXT: s_mov_b32 s10, s45 -; GCN-O0-NEXT: s_mov_b32 s11, s44 -; GCN-O0-NEXT: s_mov_b32 s12, s43 -; GCN-O0-NEXT: s_mov_b32 s13, s42 -; GCN-O0-NEXT: s_mov_b32 s14, s41 -; GCN-O0-NEXT: s_mov_b32 s15, s40 -; GCN-O0-NEXT: s_mov_b32 s16, s39 -; GCN-O0-NEXT: s_mov_b32 s17, s38 -; GCN-O0-NEXT: s_mov_b32 s18, s37 -; GCN-O0-NEXT: s_mov_b32 s19, s36 -; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s20, s51 -; GCN-O0-NEXT: s_mov_b32 s21, s50 -; GCN-O0-NEXT: s_mov_b32 s22, s49 -; GCN-O0-NEXT: s_mov_b32 s23, s48 -; GCN-O0-NEXT: s_mov_b32 s24, s47 -; GCN-O0-NEXT: s_mov_b32 s25, s46 -; GCN-O0-NEXT: s_mov_b32 s26, s45 -; GCN-O0-NEXT: s_mov_b32 s27, s44 -; GCN-O0-NEXT: s_mov_b32 s28, s43 -; GCN-O0-NEXT: s_mov_b32 s29, s42 -; GCN-O0-NEXT: s_mov_b32 s30, s41 -; GCN-O0-NEXT: s_mov_b32 s31, s40 -; GCN-O0-NEXT: s_mov_b32 s33, s39 -; GCN-O0-NEXT: s_mov_b32 s34, s38 -; GCN-O0-NEXT: s_mov_b32 s35, s37 -; GCN-O0-NEXT: ; kill: 
def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v62, s35 -; GCN-O0-NEXT: v_mov_b32_e32 v61, s34 -; GCN-O0-NEXT: v_mov_b32_e32 v60, s33 -; GCN-O0-NEXT: v_mov_b32_e32 v59, s31 -; GCN-O0-NEXT: v_mov_b32_e32 v58, s30 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s29 -; GCN-O0-NEXT: v_mov_b32_e32 v56, s28 -; GCN-O0-NEXT: v_mov_b32_e32 v55, s27 -; GCN-O0-NEXT: v_mov_b32_e32 v54, s26 -; GCN-O0-NEXT: v_mov_b32_e32 v53, s25 -; GCN-O0-NEXT: v_mov_b32_e32 v52, s24 -; GCN-O0-NEXT: v_mov_b32_e32 v51, s23 -; GCN-O0-NEXT: v_mov_b32_e32 v50, s22 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s21 -; GCN-O0-NEXT: v_mov_b32_e32 v48, s20 -; GCN-O0-NEXT: v_mov_b32_e32 v47, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v46, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v45, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v44, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v43, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v42, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v41, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v40, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v39, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v8, v62 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v61 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v60 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v59 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v58 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v57 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v56 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v55 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v54 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v53 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v52 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v51 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v50 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v49 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v48 -; GCN-O0-NEXT: v_mov_b32_e32 v23, v47 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v46 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v27, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v42 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v41 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v40 -; GCN-O0-NEXT: v_mov_b32_e32 v31, v39 -; GCN-O0-NEXT: v_mov_b32_e32 v32, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v33, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v34, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v35, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v36, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v37, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v38, v0 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x124 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v38 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v37 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v36 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v35 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, 
s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x60 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v27 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v23 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v19 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: 
v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel store <32 x float> %v, ptr addrspace(1) %out @@ -790,30 +305,6 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: half4_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-O0-NEXT: s_mov_b32 s7, 0x3c003c00 -; GCN-O0-NEXT: s_mov_b32 s0, s7 -; GCN-O0-NEXT: s_mov_b32 s1, s7 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GCN-O0-NEXT: s_mov_b32 s7, 4 -; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xffff -; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel store <4 x half> %v, ptr 
addrspace(1) %out @@ -835,26 +326,6 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: half2_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s1, s[4:5], 0x2c -; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x30 -; GCN-O0-NEXT: s_mov_b32 s0, 0x3c003c00 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b32 s0, s1, s0 -; GCN-O0-NEXT: s_mov_b32 s5, 4 -; GCN-O0-NEXT: s_lshl_b32 s5, s4, s5 -; GCN-O0-NEXT: s_mov_b32 s4, 0xffff -; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 -; GCN-O0-NEXT: s_and_b32 s0, s0, s4 -; GCN-O0-NEXT: s_xor_b32 s0, s0, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel store <2 x half> %v, ptr addrspace(1) %out @@ -916,56 +387,6 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: half8_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s14, -1 -; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s11 -; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44 -; GCN-O0-NEXT: s_mov_b32 s3, 7 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_and_b32 s2, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_lshl_b32 s3, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_or_b32 s2, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 -; GCN-O0-NEXT: buffer_store_short v0, v1, s[12:15], 0 offen -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 -; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel store <8 x half> %v, ptr addrspace(1) %out @@ -987,26 +408,6 @@ define amdgpu_kernel void 
@short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: short2_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s1, s[4:5], 0x2c -; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x30 -; GCN-O0-NEXT: s_mov_b32 s0, 0x10001 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b32 s0, s1, s0 -; GCN-O0-NEXT: s_mov_b32 s5, 4 -; GCN-O0-NEXT: s_lshl_b32 s5, s4, s5 -; GCN-O0-NEXT: s_mov_b32 s4, 0xffff -; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5 -; GCN-O0-NEXT: s_and_b32 s0, s0, s4 -; GCN-O0-NEXT: s_xor_b32 s0, s0, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_store_dword v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <2 x i16> %vec, i16 1, i32 %sel store <2 x i16> %v, ptr addrspace(1) %out @@ -1032,30 +433,6 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: short4_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-O0-NEXT: s_mov_b32 s7, 0x10001 -; GCN-O0-NEXT: s_mov_b32 s0, s7 -; GCN-O0-NEXT: s_mov_b32 s1, s7 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GCN-O0-NEXT: s_mov_b32 s7, 4 -; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xffff -; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x i16> %vec, i16 1, i32 %sel store <4 x i16> %v, ptr addrspace(1) %out @@ -1080,140 +457,6 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: byte8_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-O0-NEXT: s_mov_b32 s7, 0x1010101 -; GCN-O0-NEXT: s_mov_b32 s0, s7 -; GCN-O0-NEXT: s_mov_b32 s1, s7 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-O0-NEXT: s_mov_b32 s7, 3 -; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xff -; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GCN-O0-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b32 s3, s10 -; GCN-O0-NEXT: s_mov_b32 s0, 8 -; GCN-O0-NEXT: s_lshr_b32 s0, s3, s0 -; GCN-O0-NEXT: s_mov_b32 s1, s10 -; GCN-O0-NEXT: s_mov_b32 s2, 16 -; GCN-O0-NEXT: s_lshr_b32 s2, s3, s2 -; GCN-O0-NEXT: s_mov_b32 s6, 24 -; GCN-O0-NEXT: s_lshr_b32 s3, s3, s6 -; GCN-O0-NEXT: s_mov_b32 s6, 32 -; GCN-O0-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 -; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 
s7, 40 -; GCN-O0-NEXT: s_lshr_b64 s[8:9], s[10:11], s7 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s8, 48 -; GCN-O0-NEXT: s_lshr_b64 s[8:9], s[10:11], s8 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, 56 -; GCN-O0-NEXT: s_lshr_b64 s[10:11], s[10:11], s9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: s_mov_b64 s[14:15], 7 -; GCN-O0-NEXT: s_mov_b32 s10, s4 -; GCN-O0-NEXT: s_mov_b32 s11, s5 -; GCN-O0-NEXT: s_mov_b32 s13, s14 -; GCN-O0-NEXT: s_mov_b32 s12, s15 -; GCN-O0-NEXT: s_add_u32 s10, s10, s13 -; GCN-O0-NEXT: s_addc_u32 s12, s11, s12 -; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[14:15], 6 -; GCN-O0-NEXT: s_mov_b32 s10, s4 -; GCN-O0-NEXT: s_mov_b32 s9, s5 -; GCN-O0-NEXT: s_mov_b32 s12, s14 -; GCN-O0-NEXT: s_mov_b32 s11, s15 -; GCN-O0-NEXT: s_add_u32 s10, s10, s12 -; GCN-O0-NEXT: s_addc_u32 s9, s9, s11 -; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 -; GCN-O0-NEXT: s_mov_b32 s11, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 5 -; GCN-O0-NEXT: s_mov_b32 s8, s4 -; GCN-O0-NEXT: s_mov_b32 s9, s5 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 4 -; GCN-O0-NEXT: s_mov_b32 s8, s4 -; GCN-O0-NEXT: s_mov_b32 s7, s5 -; GCN-O0-NEXT: s_mov_b32 s10, s12 -; GCN-O0-NEXT: s_mov_b32 s9, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s10 -; GCN-O0-NEXT: s_addc_u32 s7, s7, s9 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[10:11], 3 -; GCN-O0-NEXT: s_mov_b32 s6, s4 -; GCN-O0-NEXT: s_mov_b32 s7, s5 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: s_mov_b32 s8, s11 -; GCN-O0-NEXT: s_add_u32 s6, s6, s9 -; GCN-O0-NEXT: s_addc_u32 s8, s7, s8 -; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_mov_b64 s[10:11], 2 -; GCN-O0-NEXT: s_mov_b32 s6, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s8, s10 -; GCN-O0-NEXT: s_mov_b32 s7, s11 -; GCN-O0-NEXT: s_add_u32 s6, s6, s8 -; GCN-O0-NEXT: s_addc_u32 s3, s3, s7 -; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s7, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s2 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; 
GCN-O0-NEXT: s_mov_b64 s[6:7], 1 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s4, s6 -; GCN-O0-NEXT: s_mov_b32 s3, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s4 -; GCN-O0-NEXT: s_addc_u32 s1, s1, s3 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <8 x i8> %vec, i8 1, i32 %sel store <8 x i8> %v, ptr addrspace(1) %out @@ -1315,435 +558,6 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: byte16_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s14, -1 -; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s11 -; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 52 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 53 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s0 -; GCN-O0-NEXT: flat_load_ubyte v1, v[1:2] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 54 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s0 -; GCN-O0-NEXT: flat_load_ubyte v2, v[2:3] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 55 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 -; GCN-O0-NEXT: flat_load_ubyte v3, v[3:4] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 56 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s0 -; GCN-O0-NEXT: flat_load_ubyte v4, v[4:5] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 57 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; 
GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s0 -; GCN-O0-NEXT: flat_load_ubyte v5, v[5:6] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 58 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s0 -; GCN-O0-NEXT: flat_load_ubyte v6, v[6:7] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 59 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s0 -; GCN-O0-NEXT: flat_load_ubyte v7, v[7:8] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 60 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s0 -; GCN-O0-NEXT: flat_load_ubyte v8, v[8:9] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 61 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s0 -; GCN-O0-NEXT: flat_load_ubyte v9, v[9:10] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 62 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s0 -; GCN-O0-NEXT: flat_load_ubyte v10, v[10:11] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 63 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s0 -; GCN-O0-NEXT: flat_load_ubyte v11, v[11:12] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s0 -; 
GCN-O0-NEXT: flat_load_ubyte v12, v[12:13] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x41 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s0 -; GCN-O0-NEXT: flat_load_ubyte v13, v[13:14] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x42 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s0 -; GCN-O0-NEXT: flat_load_ubyte v14, v[14:15] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x43 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s0 -; GCN-O0-NEXT: flat_load_ubyte v15, v[15:16] -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44 -; GCN-O0-NEXT: s_mov_b32 s3, 15 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_and_b32 s3, s2, s3 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_or_b32 s2, s2, s3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v15, off, s[12:15], 0 offset:15 -; GCN-O0-NEXT: buffer_store_byte v14, off, s[12:15], 0 offset:14 -; GCN-O0-NEXT: buffer_store_byte v13, off, s[12:15], 0 offset:13 -; GCN-O0-NEXT: buffer_store_byte v12, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: buffer_store_byte v11, off, s[12:15], 0 offset:11 -; GCN-O0-NEXT: buffer_store_byte v10, off, s[12:15], 0 offset:10 -; GCN-O0-NEXT: buffer_store_byte v9, off, s[12:15], 0 offset:9 -; GCN-O0-NEXT: buffer_store_byte v8, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: buffer_store_byte v7, off, s[12:15], 0 offset:7 -; GCN-O0-NEXT: buffer_store_byte v6, off, s[12:15], 0 offset:6 -; GCN-O0-NEXT: buffer_store_byte v5, off, s[12:15], 0 offset:5 -; GCN-O0-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s2 -; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen -; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 -; GCN-O0-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: buffer_load_ubyte v5, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[12:15], 0 offset:4 -; GCN-O0-NEXT: buffer_load_ubyte v7, off, s[12:15], 0 offset:5 -; GCN-O0-NEXT: buffer_load_ubyte v8, off, s[12:15], 0 offset:6 -; GCN-O0-NEXT: buffer_load_ubyte v9, off, s[12:15], 0 offset:7 -; GCN-O0-NEXT: buffer_load_ubyte v10, off, s[12:15], 0 offset:8 -; GCN-O0-NEXT: buffer_load_ubyte v11, off, s[12:15], 0 offset:9 -; 
GCN-O0-NEXT: buffer_load_ubyte v12, off, s[12:15], 0 offset:10 -; GCN-O0-NEXT: buffer_load_ubyte v13, off, s[12:15], 0 offset:11 -; GCN-O0-NEXT: buffer_load_ubyte v14, off, s[12:15], 0 offset:12 -; GCN-O0-NEXT: buffer_load_ubyte v15, off, s[12:15], 0 offset:13 -; GCN-O0-NEXT: buffer_load_ubyte v16, off, s[12:15], 0 offset:14 -; GCN-O0-NEXT: buffer_load_ubyte v17, off, s[12:15], 0 offset:15 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 15 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: flat_store_byte v[0:1], v17 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 14 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v16 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 13 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v15 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 12 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v14 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 11 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v13 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 10 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v12 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 9 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: 
v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v11 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 8 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v10 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 7 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v9 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 6 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v8 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 5 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v7 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 4 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v6 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 3 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v5 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 2 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v4 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 1 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, 
s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_byte v[0:1], v3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <16 x i8> %vec, i8 1, i32 %sel store <16 x i8> %v, ptr addrspace(1) %out @@ -1771,32 +585,6 @@ define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %v ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double2_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v3, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel store <2 x double> %v, ptr addrspace(1) %out @@ -1851,129 +639,6 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double5_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x84 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s10, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s0 -; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x64 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s12, s27 -; GCN-O0-NEXT: s_mov_b32 s13, s26 -; GCN-O0-NEXT: s_mov_b32 s14, s25 -; GCN-O0-NEXT: s_mov_b32 s15, s24 -; GCN-O0-NEXT: s_mov_b32 s16, s23 -; GCN-O0-NEXT: s_mov_b32 s17, s22 -; GCN-O0-NEXT: s_mov_b32 s18, s21 -; GCN-O0-NEXT: s_mov_b32 s19, s20 -; GCN-O0-NEXT: ; implicit-def: $sgpr9 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr8 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr7 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr6 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr1 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr20 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v19, 
s7 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v2, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v27 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v8, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v23 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v19 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v0 -; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4 -; GCN-O0-NEXT: s_mov_b32 s1, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 -; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v1, v0 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 -; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v5 -; GCN-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23_vgpr24_vgpr25 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v23, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v20 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[20:21], v[22:25] -; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20_vgpr21_vgpr22 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v20, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s2 -; GCN-O0-NEXT: flat_store_dwordx4 v[17:18], v[19:22] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v9 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 32 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: s_mov_b32 s2, s5 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel store <5 x double> %v, ptr addrspace(1) %out @@ -2029,112 +694,6 @@ define amdgpu_kernel void @double8_inselt(ptr 
addrspace(1) %out, <8 x double> %v ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double8_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0xa4 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v13, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v14, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v15, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v19, s20 -; GCN-O0-NEXT: v_mov_b32_e32 v20, s21 -; GCN-O0-NEXT: v_mov_b32_e32 v21, s22 -; GCN-O0-NEXT: v_mov_b32_e32 v22, s23 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v19 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: 
s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel store <8 x double> %v, ptr addrspace(1) %out @@ -2188,147 +747,6 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double7_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x94 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s6, s1 -; GCN-O0-NEXT: s_mov_b32 s7, s0 -; GCN-O0-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x84 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s8, s15 -; GCN-O0-NEXT: s_mov_b32 s9, s14 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x64 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s12, s27 -; GCN-O0-NEXT: s_mov_b32 s13, s26 -; GCN-O0-NEXT: s_mov_b32 s14, s25 -; GCN-O0-NEXT: s_mov_b32 s15, s24 -; GCN-O0-NEXT: s_mov_b32 s16, s23 -; GCN-O0-NEXT: s_mov_b32 s17, s22 -; GCN-O0-NEXT: s_mov_b32 s18, s21 -; GCN-O0-NEXT: s_mov_b32 s19, s20 -; GCN-O0-NEXT: ; implicit-def: $sgpr1 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr20 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v30, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v29, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v28, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v27, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v23, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v8, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v27 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v23 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v0 -; GCN-O0-NEXT: 
s_load_dword s0, s[4:5], 0xa4 -; GCN-O0-NEXT: s_mov_b32 s1, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 -; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 -; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v23, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v11 -; GCN-O0-NEXT: ; kill: def $vgpr26 killed $vgpr26 def $vgpr26_vgpr27_vgpr28_vgpr29 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v27, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v24 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v24, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[24:25], v[26:29] -; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v24, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s2 -; GCN-O0-NEXT: flat_store_dwordx4 v[3:4], v[23:26] -; GCN-O0-NEXT: v_mov_b32_e32 v3, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v19 -; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v8, v3 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 -; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8] -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 32 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: s_mov_b32 s2, s5 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel store <7 x double> %v, ptr addrspace(1) %out @@ -2426,275 +844,6 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v4, s0 
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double16_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s2, s51 -; GCN-O0-NEXT: s_mov_b32 s3, s50 -; GCN-O0-NEXT: s_mov_b32 s6, s49 -; GCN-O0-NEXT: s_mov_b32 s7, s48 -; GCN-O0-NEXT: s_mov_b32 s8, s47 -; GCN-O0-NEXT: s_mov_b32 s9, s46 -; GCN-O0-NEXT: s_mov_b32 s10, s45 -; GCN-O0-NEXT: s_mov_b32 s11, s44 -; GCN-O0-NEXT: s_mov_b32 s12, s43 -; GCN-O0-NEXT: s_mov_b32 s13, s42 -; GCN-O0-NEXT: s_mov_b32 s14, s41 -; GCN-O0-NEXT: s_mov_b32 s15, s40 -; GCN-O0-NEXT: s_mov_b32 s16, s39 -; GCN-O0-NEXT: s_mov_b32 s17, s38 -; GCN-O0-NEXT: s_mov_b32 s18, s37 -; GCN-O0-NEXT: s_mov_b32 s19, s36 -; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s20, s51 -; GCN-O0-NEXT: s_mov_b32 s21, s50 -; GCN-O0-NEXT: s_mov_b32 s22, s49 -; GCN-O0-NEXT: s_mov_b32 s23, s48 -; GCN-O0-NEXT: s_mov_b32 s24, s47 -; GCN-O0-NEXT: s_mov_b32 s25, s46 -; GCN-O0-NEXT: s_mov_b32 s26, s45 -; GCN-O0-NEXT: s_mov_b32 s27, s44 -; GCN-O0-NEXT: s_mov_b32 s28, s43 -; GCN-O0-NEXT: s_mov_b32 s29, s42 -; GCN-O0-NEXT: s_mov_b32 s30, s41 -; GCN-O0-NEXT: s_mov_b32 s31, s40 -; GCN-O0-NEXT: s_mov_b32 s33, s39 -; GCN-O0-NEXT: s_mov_b32 s34, s38 -; GCN-O0-NEXT: s_mov_b32 s35, s37 -; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v62, s35 -; GCN-O0-NEXT: v_mov_b32_e32 v61, s34 -; GCN-O0-NEXT: v_mov_b32_e32 v60, s33 -; GCN-O0-NEXT: v_mov_b32_e32 v59, s31 -; GCN-O0-NEXT: v_mov_b32_e32 v58, s30 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s29 -; GCN-O0-NEXT: v_mov_b32_e32 v56, s28 -; GCN-O0-NEXT: v_mov_b32_e32 v55, s27 -; GCN-O0-NEXT: v_mov_b32_e32 v54, s26 -; GCN-O0-NEXT: v_mov_b32_e32 v53, s25 -; GCN-O0-NEXT: v_mov_b32_e32 v52, s24 -; GCN-O0-NEXT: v_mov_b32_e32 v51, s23 -; GCN-O0-NEXT: v_mov_b32_e32 v50, s22 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s21 -; GCN-O0-NEXT: v_mov_b32_e32 v48, s20 -; GCN-O0-NEXT: v_mov_b32_e32 v47, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v46, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v45, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v44, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v43, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v42, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v41, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v40, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v39, s11 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v8, v62 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v61 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v60 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v59 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v58 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v57 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v56 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v55 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v54 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v53 -; GCN-O0-NEXT: 
v_mov_b32_e32 v18, v52 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v51 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v50 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v49 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v48 -; GCN-O0-NEXT: v_mov_b32_e32 v23, v47 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v46 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v27, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v42 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v41 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v40 -; GCN-O0-NEXT: v_mov_b32_e32 v31, v39 -; GCN-O0-NEXT: v_mov_b32_e32 v32, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v33, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v34, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v35, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v36, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v37, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v38, v0 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x124 -; GCN-O0-NEXT: s_mov_b32 s3, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v38 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v37 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v36 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v35 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x60 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v27 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def 
$sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v23 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v19 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 48 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v15 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v11 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 -; GCN-O0-NEXT: s_mov_b32 s2, s0 -; GCN-O0-NEXT: s_mov_b32 s3, s1 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s2, s2, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s3, s4 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: v_mov_b32_e32 v0, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v9 
-; GCN-O0-NEXT: v_mov_b32_e32 v6, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel store <16 x double> %v, ptr addrspace(1) %out @@ -2790,277 +939,6 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: double15_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x114 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s6, s1 -; GCN-O0-NEXT: s_mov_b32 s7, s0 -; GCN-O0-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x104 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s8, s15 -; GCN-O0-NEXT: s_mov_b32 s9, s14 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0xe4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s12, s27 -; GCN-O0-NEXT: s_mov_b32 s13, s26 -; GCN-O0-NEXT: s_mov_b32 s14, s25 -; GCN-O0-NEXT: s_mov_b32 s15, s24 -; GCN-O0-NEXT: s_mov_b32 s16, s23 -; GCN-O0-NEXT: s_mov_b32 s17, s22 -; GCN-O0-NEXT: s_mov_b32 s18, s21 -; GCN-O0-NEXT: s_mov_b32 s19, s20 -; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s20, s51 -; GCN-O0-NEXT: s_mov_b32 s21, s50 -; GCN-O0-NEXT: s_mov_b32 s22, s49 -; GCN-O0-NEXT: s_mov_b32 s23, s48 -; GCN-O0-NEXT: s_mov_b32 s24, s47 -; GCN-O0-NEXT: s_mov_b32 s25, s46 -; GCN-O0-NEXT: s_mov_b32 s26, s45 -; GCN-O0-NEXT: s_mov_b32 s27, s44 -; GCN-O0-NEXT: s_mov_b32 s28, s43 -; GCN-O0-NEXT: s_mov_b32 s29, s42 -; GCN-O0-NEXT: s_mov_b32 s30, s41 -; GCN-O0-NEXT: s_mov_b32 s31, s40 -; GCN-O0-NEXT: s_mov_b32 s33, s39 -; GCN-O0-NEXT: s_mov_b32 s34, s38 -; GCN-O0-NEXT: s_mov_b32 s35, s37 -; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 -; GCN-O0-NEXT: ; implicit-def: $sgpr1 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: ; implicit-def: $sgpr37 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s36 -; GCN-O0-NEXT: v_mov_b32_e32 v62, s35 -; GCN-O0-NEXT: v_mov_b32_e32 v61, s34 -; GCN-O0-NEXT: v_mov_b32_e32 v60, s33 -; GCN-O0-NEXT: v_mov_b32_e32 v59, s31 -; GCN-O0-NEXT: v_mov_b32_e32 v58, s30 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s29 -; GCN-O0-NEXT: v_mov_b32_e32 v56, s28 -; GCN-O0-NEXT: v_mov_b32_e32 v55, s27 -; GCN-O0-NEXT: v_mov_b32_e32 v54, s26 -; GCN-O0-NEXT: v_mov_b32_e32 v53, s25 -; GCN-O0-NEXT: v_mov_b32_e32 v52, s24 -; GCN-O0-NEXT: v_mov_b32_e32 v51, s23 -; GCN-O0-NEXT: v_mov_b32_e32 v50, s22 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s21 -; GCN-O0-NEXT: v_mov_b32_e32 v48, s20 -; GCN-O0-NEXT: v_mov_b32_e32 v47, s19 -; GCN-O0-NEXT: v_mov_b32_e32 v46, s18 -; GCN-O0-NEXT: v_mov_b32_e32 v45, s17 -; GCN-O0-NEXT: v_mov_b32_e32 v44, s16 -; GCN-O0-NEXT: v_mov_b32_e32 v43, s15 -; GCN-O0-NEXT: v_mov_b32_e32 v42, s14 -; GCN-O0-NEXT: v_mov_b32_e32 v41, s13 -; GCN-O0-NEXT: v_mov_b32_e32 v40, s12 -; GCN-O0-NEXT: v_mov_b32_e32 v39, s11 -; 
GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v8, v62 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v61 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v60 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v59 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v58 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v57 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v56 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v55 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v54 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v53 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v52 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v51 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v50 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v49 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v48 -; GCN-O0-NEXT: v_mov_b32_e32 v23, v47 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v46 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v27, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v42 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v41 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v40 -; GCN-O0-NEXT: v_mov_b32_e32 v31, v39 -; GCN-O0-NEXT: v_mov_b32_e32 v32, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v33, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v34, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v35, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v36, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v37, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v38, v0 -; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x124 -; GCN-O0-NEXT: s_mov_b32 s1, 1 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 -; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s1 -; GCN-O0-NEXT: s_mov_b32 m0, s0 -; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v32 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v31 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v39, v23 -; GCN-O0-NEXT: v_mov_b32_e32 v40, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v41, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v46, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v42, v27 -; GCN-O0-NEXT: v_mov_b32_e32 v43, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v44, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v45, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v47, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v48, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v49, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v54, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v50, v11 -; GCN-O0-NEXT: v_mov_b32_e32 v51, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v52, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v53, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v55, v15 -; GCN-O0-NEXT: v_mov_b32_e32 v56, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v57, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v62, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v58, v19 -; GCN-O0-NEXT: ; kill: def $vgpr58 killed $vgpr58 def $vgpr58_vgpr59_vgpr60_vgpr61 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v59, v62 -; GCN-O0-NEXT: v_mov_b32_e32 v60, v57 -; GCN-O0-NEXT: v_mov_b32_e32 v61, v56 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 
48 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v56, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[56:57], v[58:61] -; GCN-O0-NEXT: ; kill: def $vgpr55 killed $vgpr55 def $vgpr55_vgpr56_vgpr57_vgpr58 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v56, v53 -; GCN-O0-NEXT: v_mov_b32_e32 v57, v52 -; GCN-O0-NEXT: v_mov_b32_e32 v58, v51 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 32 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v52, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v51, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[51:52], v[55:58] -; GCN-O0-NEXT: ; kill: def $vgpr50 killed $vgpr50 def $vgpr50_vgpr51_vgpr52_vgpr53 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v51, v54 -; GCN-O0-NEXT: v_mov_b32_e32 v52, v49 -; GCN-O0-NEXT: v_mov_b32_e32 v53, v48 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 16 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v48, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[48:49], v[50:53] -; GCN-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 def $vgpr47_vgpr48_vgpr49_vgpr50 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v48, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v49, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v50, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v44, s3 -; GCN-O0-NEXT: v_mov_b32_e32 v43, s2 -; GCN-O0-NEXT: flat_store_dwordx4 v[43:44], v[47:50] -; GCN-O0-NEXT: ; kill: def $vgpr42 killed $vgpr42 def $vgpr42_vgpr43_vgpr44_vgpr45 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v43, v46 -; GCN-O0-NEXT: v_mov_b32_e32 v44, v41 -; GCN-O0-NEXT: v_mov_b32_e32 v45, v40 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v41, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v40, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[40:41], v[42:45] -; GCN-O0-NEXT: ; kill: def $vgpr39 killed $vgpr39 def $vgpr39_vgpr40_vgpr41_vgpr42 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v40, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v41, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v42, v3 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 64 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 -; GCN-O0-NEXT: flat_store_dwordx4 v[3:4], v[39:42] -; GCN-O0-NEXT: 
v_mov_b32_e32 v3, v36 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v35 -; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v8, v3 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s5, s6 -; GCN-O0-NEXT: s_mov_b32 s4, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s5 -; GCN-O0-NEXT: s_addc_u32 s4, s1, s4 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s0 -; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8] -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v0 -; GCN-O0-NEXT: s_mov_b64 s[4:5], 0x60 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s3 -; GCN-O0-NEXT: s_mov_b32 s3, s4 -; GCN-O0-NEXT: s_mov_b32 s2, s5 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel store <15 x double> %v, ptr addrspace(1) %out @@ -3116,63 +994,6 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: bit4_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s14, -1 -; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s11 -; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-O0-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x30 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_bfe_u32 s3, s4, 0x10001 -; GCN-O0-NEXT: s_bfe_u32 s5, s4, 0x20002 -; GCN-O0-NEXT: s_bfe_u32 s6, s4, 0x10003 -; GCN-O0-NEXT: s_mov_b32 s7, 3 -; GCN-O0-NEXT: s_and_b32 s7, s2, s7 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_or_b32 s2, s2, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s6 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s5 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s3 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: buffer_store_byte v3, v0, s[12:15], 0 offen -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 -; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[12:15], 0 offset:1 -; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 -; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:3 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_and_b32_e64 v4, v4, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v4, v3, v4 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v4 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; 
GCN-O0-NEXT: s_mov_b32 s2, 2 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_mov_b32 s2, 3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: s_mov_b32 s2, 15 -; GCN-O0-NEXT: v_and_b32_e64 v2, v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <4 x i1> %vec, i1 1, i32 %sel store <4 x i1> %v, ptr addrspace(1) %out @@ -4015,1599 +1836,6 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; -; GCN-O0-LABEL: bit128_inselt: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s18, -1 -; GCN-O0-NEXT: s_mov_b32 s19, 0xe80000 -; GCN-O0-NEXT: s_add_u32 s16, s16, s11 -; GCN-O0-NEXT: s_addc_u32 s17, s17, 0 -; GCN-O0-NEXT: s_mov_b64 s[6:7], 52 -; GCN-O0-NEXT: s_mov_b32 s0, s4 -; GCN-O0-NEXT: s_mov_b32 s1, s5 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: s_mov_b32 s2, s7 -; GCN-O0-NEXT: s_add_u32 s0, s0, s3 -; GCN-O0-NEXT: s_addc_u32 s2, s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_mov_b32 s1, 1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:388 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:648 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v2, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v3, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v4, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v5, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v6, v0, 6, 1 -; GCN-O0-NEXT: s_mov_b32 s0, 7 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v7, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 53 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v8, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v9, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v10, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v11, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v12, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v13, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v14, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v15, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 54 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v16, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v17, v0, 
1, 1 -; GCN-O0-NEXT: v_bfe_u32 v18, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v19, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v20, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v21, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v22, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v23, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 55 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v24, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v25, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v26, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v27, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v28, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v29, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v30, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v31, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 56 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v32, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v33, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v34, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v35, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v36, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v37, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v38, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v39, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 57 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v40, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v41, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v42, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v43, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v44, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v45, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v46, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v47, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 58 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v48, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v49, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v50, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v51, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v52, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v53, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v54, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v55, s0, v0 -; 
GCN-O0-NEXT: s_mov_b64 s[8:9], 59 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v56, v0, s1 -; GCN-O0-NEXT: v_bfe_u32 v57, v0, 1, 1 -; GCN-O0-NEXT: v_bfe_u32 v58, v0, 2, 1 -; GCN-O0-NEXT: v_bfe_u32 v59, v0, 3, 1 -; GCN-O0-NEXT: v_bfe_u32 v60, v0, 4, 1 -; GCN-O0-NEXT: v_bfe_u32 v61, v0, 5, 1 -; GCN-O0-NEXT: v_bfe_u32 v62, v0, 6, 1 -; GCN-O0-NEXT: v_lshrrev_b32_e64 v63, s0, v0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], 60 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:392 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:396 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:400 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:404 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:408 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:412 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:416 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:420 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 61 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:424 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:428 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:432 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:436 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:440 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: 
buffer_store_dword v1, off, s[16:19], 0 offset:444 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:448 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:452 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 62 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:456 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:460 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:464 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:468 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:472 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:476 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:480 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:484 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 63 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:488 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:492 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:496 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:500 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:504 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:508 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:512 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:516 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 64 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, 
s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:520 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:524 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:528 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:532 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:536 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:540 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:544 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:548 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x41 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:552 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:556 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:560 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:564 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:568 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:572 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:576 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:580 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x42 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:584 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 
1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:588 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:592 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:596 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:600 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:604 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:608 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:612 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x43 -; GCN-O0-NEXT: s_mov_b32 s2, s4 -; GCN-O0-NEXT: s_mov_b32 s3, s5 -; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_add_u32 s2, s2, s7 -; GCN-O0-NEXT: s_addc_u32 s6, s3, s6 -; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b32 s3, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s3 -; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:648 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, s1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:616 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 1, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:620 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 2, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:624 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 3, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:628 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 4, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:632 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 5, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:636 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_bfe_u32 v0, v0, 6, 1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:640 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44 -; GCN-O0-NEXT: s_mov_b32 s3, 0x7f -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_and_b32 s3, s2, 
s3 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_add_i32 s2, s2, s3 -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:127 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:640 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:126 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:636 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:125 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:632 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:124 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:628 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:123 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:624 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:122 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:620 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:121 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:616 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:120 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:612 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:119 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:608 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:118 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:604 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:117 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:600 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:116 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:596 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:115 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:592 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:114 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:588 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:113 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:584 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:112 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:580 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:111 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:576 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:110 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:572 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 
offset:109 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:568 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:108 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:564 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:107 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:560 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:106 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:556 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:105 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:552 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:104 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:548 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:103 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:544 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:102 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:540 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:101 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:536 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:100 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:532 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:99 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:528 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:98 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:524 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:97 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:520 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:96 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:516 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:95 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:512 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:94 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:508 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:93 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:504 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:92 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:500 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:91 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:496 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; 
GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:90 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:492 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:89 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:488 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:88 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:484 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:87 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:480 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:86 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:476 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:85 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:472 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:84 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:468 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:83 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:464 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:82 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:460 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:81 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:456 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:80 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:452 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:79 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:448 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:78 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:444 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:77 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:440 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:76 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:436 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:75 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:432 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:74 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:428 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:73 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:424 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:72 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:420 ; 4-byte Folded Reload 
-; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:71 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:416 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:70 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:412 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:69 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:408 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:68 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:404 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:67 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:400 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:66 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:396 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:65 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:392 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:64 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:388 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_store_byte v63, off, s[16:19], 0 offset:63 -; GCN-O0-NEXT: buffer_store_byte v62, off, s[16:19], 0 offset:62 -; GCN-O0-NEXT: buffer_store_byte v61, off, s[16:19], 0 offset:61 -; GCN-O0-NEXT: buffer_store_byte v60, off, s[16:19], 0 offset:60 -; GCN-O0-NEXT: buffer_store_byte v59, off, s[16:19], 0 offset:59 -; GCN-O0-NEXT: buffer_store_byte v58, off, s[16:19], 0 offset:58 -; GCN-O0-NEXT: buffer_store_byte v57, off, s[16:19], 0 offset:57 -; GCN-O0-NEXT: buffer_store_byte v56, off, s[16:19], 0 offset:56 -; GCN-O0-NEXT: buffer_store_byte v55, off, s[16:19], 0 offset:55 -; GCN-O0-NEXT: buffer_store_byte v54, off, s[16:19], 0 offset:54 -; GCN-O0-NEXT: buffer_store_byte v53, off, s[16:19], 0 offset:53 -; GCN-O0-NEXT: buffer_store_byte v52, off, s[16:19], 0 offset:52 -; GCN-O0-NEXT: buffer_store_byte v51, off, s[16:19], 0 offset:51 -; GCN-O0-NEXT: buffer_store_byte v50, off, s[16:19], 0 offset:50 -; GCN-O0-NEXT: buffer_store_byte v49, off, s[16:19], 0 offset:49 -; GCN-O0-NEXT: buffer_store_byte v48, off, s[16:19], 0 offset:48 -; GCN-O0-NEXT: buffer_store_byte v47, off, s[16:19], 0 offset:47 -; GCN-O0-NEXT: buffer_store_byte v46, off, s[16:19], 0 offset:46 -; GCN-O0-NEXT: buffer_store_byte v45, off, s[16:19], 0 offset:45 -; GCN-O0-NEXT: buffer_store_byte v44, off, s[16:19], 0 offset:44 -; GCN-O0-NEXT: buffer_store_byte v43, off, s[16:19], 0 offset:43 -; GCN-O0-NEXT: buffer_store_byte v42, off, s[16:19], 0 offset:42 -; GCN-O0-NEXT: buffer_store_byte v41, off, s[16:19], 0 offset:41 -; GCN-O0-NEXT: buffer_store_byte v40, off, s[16:19], 0 offset:40 -; GCN-O0-NEXT: buffer_store_byte v39, off, s[16:19], 0 offset:39 -; GCN-O0-NEXT: buffer_store_byte v38, off, s[16:19], 0 offset:38 -; GCN-O0-NEXT: buffer_store_byte v37, off, s[16:19], 0 offset:37 -; GCN-O0-NEXT: buffer_store_byte v36, off, s[16:19], 0 offset:36 -; GCN-O0-NEXT: buffer_store_byte v35, off, s[16:19], 0 offset:35 -; GCN-O0-NEXT: buffer_store_byte v34, off, s[16:19], 0 offset:34 -; GCN-O0-NEXT: buffer_store_byte v33, off, s[16:19], 0 
offset:33 -; GCN-O0-NEXT: buffer_store_byte v32, off, s[16:19], 0 offset:32 -; GCN-O0-NEXT: buffer_store_byte v31, off, s[16:19], 0 offset:31 -; GCN-O0-NEXT: buffer_store_byte v30, off, s[16:19], 0 offset:30 -; GCN-O0-NEXT: buffer_store_byte v29, off, s[16:19], 0 offset:29 -; GCN-O0-NEXT: buffer_store_byte v28, off, s[16:19], 0 offset:28 -; GCN-O0-NEXT: buffer_store_byte v27, off, s[16:19], 0 offset:27 -; GCN-O0-NEXT: buffer_store_byte v26, off, s[16:19], 0 offset:26 -; GCN-O0-NEXT: buffer_store_byte v25, off, s[16:19], 0 offset:25 -; GCN-O0-NEXT: buffer_store_byte v24, off, s[16:19], 0 offset:24 -; GCN-O0-NEXT: buffer_store_byte v23, off, s[16:19], 0 offset:23 -; GCN-O0-NEXT: buffer_store_byte v22, off, s[16:19], 0 offset:22 -; GCN-O0-NEXT: buffer_store_byte v21, off, s[16:19], 0 offset:21 -; GCN-O0-NEXT: buffer_store_byte v20, off, s[16:19], 0 offset:20 -; GCN-O0-NEXT: buffer_store_byte v19, off, s[16:19], 0 offset:19 -; GCN-O0-NEXT: buffer_store_byte v18, off, s[16:19], 0 offset:18 -; GCN-O0-NEXT: buffer_store_byte v17, off, s[16:19], 0 offset:17 -; GCN-O0-NEXT: buffer_store_byte v16, off, s[16:19], 0 offset:16 -; GCN-O0-NEXT: buffer_store_byte v15, off, s[16:19], 0 offset:15 -; GCN-O0-NEXT: buffer_store_byte v14, off, s[16:19], 0 offset:14 -; GCN-O0-NEXT: buffer_store_byte v13, off, s[16:19], 0 offset:13 -; GCN-O0-NEXT: buffer_store_byte v12, off, s[16:19], 0 offset:12 -; GCN-O0-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11 -; GCN-O0-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10 -; GCN-O0-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9 -; GCN-O0-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8 -; GCN-O0-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7 -; GCN-O0-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6 -; GCN-O0-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5 -; GCN-O0-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4 -; GCN-O0-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3 -; GCN-O0-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2 -; GCN-O0-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1 -; GCN-O0-NEXT: s_waitcnt vmcnt(14) -; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 -; GCN-O0-NEXT: v_mov_b32_e32 v3, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s2 -; GCN-O0-NEXT: buffer_store_byte v3, v0, s[16:19], 0 offen -; GCN-O0-NEXT: buffer_load_ubyte v18, off, s[16:19], 0 offset:23 -; GCN-O0-NEXT: buffer_load_ubyte v19, off, s[16:19], 0 offset:22 -; GCN-O0-NEXT: buffer_load_ubyte v20, off, s[16:19], 0 offset:21 -; GCN-O0-NEXT: buffer_load_ubyte v21, off, s[16:19], 0 offset:20 -; GCN-O0-NEXT: buffer_load_ubyte v22, off, s[16:19], 0 offset:19 -; GCN-O0-NEXT: buffer_load_ubyte v23, off, s[16:19], 0 offset:18 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:1 -; GCN-O0-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:2 -; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:3 -; GCN-O0-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:4 -; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:5 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:6 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:7 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, 
s[16:19], 0 offset:136 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:8 -; GCN-O0-NEXT: buffer_load_ubyte v16, off, s[16:19], 0 offset:9 -; GCN-O0-NEXT: buffer_load_ubyte v15, off, s[16:19], 0 offset:10 -; GCN-O0-NEXT: buffer_load_ubyte v14, off, s[16:19], 0 offset:11 -; GCN-O0-NEXT: buffer_load_ubyte v13, off, s[16:19], 0 offset:12 -; GCN-O0-NEXT: buffer_load_ubyte v12, off, s[16:19], 0 offset:13 -; GCN-O0-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:14 -; GCN-O0-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:15 -; GCN-O0-NEXT: buffer_load_ubyte v17, off, s[16:19], 0 offset:16 -; GCN-O0-NEXT: buffer_load_ubyte v24, off, s[16:19], 0 offset:17 -; GCN-O0-NEXT: buffer_load_ubyte v26, off, s[16:19], 0 offset:31 -; GCN-O0-NEXT: buffer_load_ubyte v27, off, s[16:19], 0 offset:30 -; GCN-O0-NEXT: buffer_load_ubyte v28, off, s[16:19], 0 offset:29 -; GCN-O0-NEXT: buffer_load_ubyte v29, off, s[16:19], 0 offset:28 -; GCN-O0-NEXT: buffer_load_ubyte v30, off, s[16:19], 0 offset:27 -; GCN-O0-NEXT: buffer_load_ubyte v31, off, s[16:19], 0 offset:26 -; GCN-O0-NEXT: buffer_load_ubyte v32, off, s[16:19], 0 offset:25 -; GCN-O0-NEXT: buffer_load_ubyte v25, off, s[16:19], 0 offset:24 -; GCN-O0-NEXT: buffer_load_ubyte v34, off, s[16:19], 0 offset:39 -; GCN-O0-NEXT: buffer_load_ubyte v35, off, s[16:19], 0 offset:38 -; GCN-O0-NEXT: buffer_load_ubyte v36, off, s[16:19], 0 offset:37 -; GCN-O0-NEXT: buffer_load_ubyte v37, off, s[16:19], 0 offset:36 -; GCN-O0-NEXT: buffer_load_ubyte v38, off, s[16:19], 0 offset:35 -; GCN-O0-NEXT: buffer_load_ubyte v39, off, s[16:19], 0 offset:34 -; GCN-O0-NEXT: buffer_load_ubyte v40, off, s[16:19], 0 offset:33 -; GCN-O0-NEXT: buffer_load_ubyte v33, off, s[16:19], 0 offset:32 -; GCN-O0-NEXT: buffer_load_ubyte v42, off, s[16:19], 0 offset:47 -; GCN-O0-NEXT: buffer_load_ubyte v43, off, s[16:19], 0 offset:46 -; GCN-O0-NEXT: buffer_load_ubyte v44, off, s[16:19], 0 offset:45 -; GCN-O0-NEXT: buffer_load_ubyte v45, off, s[16:19], 0 offset:44 -; GCN-O0-NEXT: buffer_load_ubyte v46, off, s[16:19], 0 offset:43 -; GCN-O0-NEXT: buffer_load_ubyte v47, off, s[16:19], 0 offset:42 -; GCN-O0-NEXT: buffer_load_ubyte v48, off, s[16:19], 0 offset:41 -; GCN-O0-NEXT: buffer_load_ubyte v41, off, s[16:19], 0 offset:40 -; GCN-O0-NEXT: buffer_load_ubyte v50, off, s[16:19], 0 offset:55 -; GCN-O0-NEXT: buffer_load_ubyte v51, off, s[16:19], 0 offset:54 -; GCN-O0-NEXT: buffer_load_ubyte v52, off, s[16:19], 0 offset:53 -; GCN-O0-NEXT: buffer_load_ubyte v53, off, s[16:19], 0 offset:52 -; GCN-O0-NEXT: buffer_load_ubyte v54, off, s[16:19], 0 offset:51 -; GCN-O0-NEXT: buffer_load_ubyte v55, off, s[16:19], 0 offset:50 -; GCN-O0-NEXT: buffer_load_ubyte v56, off, s[16:19], 0 offset:49 -; GCN-O0-NEXT: buffer_load_ubyte v49, off, s[16:19], 0 offset:48 -; GCN-O0-NEXT: buffer_load_ubyte v58, off, s[16:19], 0 offset:63 -; GCN-O0-NEXT: buffer_load_ubyte v59, off, s[16:19], 0 offset:62 -; GCN-O0-NEXT: buffer_load_ubyte v60, off, s[16:19], 0 offset:61 -; GCN-O0-NEXT: buffer_load_ubyte v61, off, s[16:19], 0 offset:60 -; GCN-O0-NEXT: buffer_load_ubyte v62, off, s[16:19], 0 offset:59 -; GCN-O0-NEXT: buffer_load_ubyte v63, off, s[16:19], 0 offset:58 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:57 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v57, off, s[16:19], 0 offset:56 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:71 -; GCN-O0-NEXT: 
s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:70 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:69 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:68 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:67 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:66 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:65 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:64 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:164 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:79 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:78 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:77 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:76 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:75 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:74 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:73 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:72 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:196 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:87 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:86 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:236 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:85 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:212 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, 
s[16:19], 0 offset:84 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:216 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:83 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:220 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:82 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:224 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:81 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:232 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:80 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:228 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:95 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:240 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:94 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:268 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:93 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:244 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:92 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:248 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:91 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:252 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:90 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:256 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:89 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:264 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:88 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:260 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:103 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:272 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:102 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:300 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:101 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:276 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:100 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:280 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:99 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:284 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:98 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:288 ; 4-byte Folded Spill -; 
GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:97 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:296 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:96 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:292 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:111 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:304 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:110 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:332 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:109 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:308 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:108 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:312 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:107 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:316 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:106 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:320 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:105 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:328 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:104 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:324 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:119 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:336 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:118 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:364 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:117 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:340 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:116 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:344 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:115 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:348 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:114 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:352 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:113 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:360 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:112 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:356 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:127 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, 
off, s[16:19], 0 offset:368 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:126 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:125 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:372 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:124 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:376 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:123 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:380 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:122 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:384 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:121 -; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:120 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:384 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: s_mov_b32 s7, 2 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:380 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: s_mov_b32 s6, 3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:376 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: s_mov_b32 s5, 4 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:372 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: s_mov_b32 s4, 5 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:368 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: s_mov_b32 s3, 6 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_mov_b32 s2, 7 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 15 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:364 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:360 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:356 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; 
GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:352 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:348 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:344 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:340 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:336 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 14 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:332 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:328 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:324 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:320 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:316 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:312 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:308 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:304 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: 
v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 13 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:300 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:296 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:292 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:288 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:284 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:280 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:276 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:272 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 12 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:268 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:264 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:260 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:256 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: 
v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:252 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:248 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:244 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:240 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 11 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:236 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:232 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:228 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:224 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:220 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:216 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:212 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 10 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; 
GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 9 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; 
GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 8 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v57, v57, v3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v0, v3, v0 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v0 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:128 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_and_b32_e64 v63, v63, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v63, s7, v63 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v63 -; GCN-O0-NEXT: v_and_b32_e64 v62, v62, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v62, s6, v62 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v62 -; GCN-O0-NEXT: v_and_b32_e64 v61, v61, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v61, s5, v61 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v61 -; GCN-O0-NEXT: v_and_b32_e64 v60, v60, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v60, s4, v60 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v60 -; GCN-O0-NEXT: v_and_b32_e64 v59, v59, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v59, s3, v59 -; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v59 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v58, s2, v58 -; GCN-O0-NEXT: v_or_b32_e64 v59, v57, v58 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 7 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v58, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v57, s8 -; GCN-O0-NEXT: flat_store_byte v[57:58], v59 -; GCN-O0-NEXT: v_and_b32_e64 v49, v49, v3 -; GCN-O0-NEXT: v_and_b32_e64 v56, v56, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v56, v3, v56 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v56 -; GCN-O0-NEXT: v_and_b32_e64 v55, v55, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v55, s7, v55 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v55 -; GCN-O0-NEXT: v_and_b32_e64 v54, v54, v3 -; GCN-O0-NEXT: 
v_lshlrev_b16_e64 v54, s6, v54 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v54 -; GCN-O0-NEXT: v_and_b32_e64 v53, v53, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v53, s5, v53 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v53 -; GCN-O0-NEXT: v_and_b32_e64 v52, v52, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v52, s4, v52 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v52 -; GCN-O0-NEXT: v_and_b32_e64 v51, v51, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v51, s3, v51 -; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v51 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v50, s2, v50 -; GCN-O0-NEXT: v_or_b32_e64 v51, v49, v50 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 6 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v50, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v49, s8 -; GCN-O0-NEXT: flat_store_byte v[49:50], v51 -; GCN-O0-NEXT: v_and_b32_e64 v41, v41, v3 -; GCN-O0-NEXT: v_and_b32_e64 v48, v48, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v48, v3, v48 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v48 -; GCN-O0-NEXT: v_and_b32_e64 v47, v47, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v47, s7, v47 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v47 -; GCN-O0-NEXT: v_and_b32_e64 v46, v46, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v46, s6, v46 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v46 -; GCN-O0-NEXT: v_and_b32_e64 v45, v45, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v45, s5, v45 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v45 -; GCN-O0-NEXT: v_and_b32_e64 v44, v44, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v44, s4, v44 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v44 -; GCN-O0-NEXT: v_and_b32_e64 v43, v43, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v43, s3, v43 -; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v43 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v42, s2, v42 -; GCN-O0-NEXT: v_or_b32_e64 v43, v41, v42 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 5 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v42, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v41, s8 -; GCN-O0-NEXT: flat_store_byte v[41:42], v43 -; GCN-O0-NEXT: v_and_b32_e64 v33, v33, v3 -; GCN-O0-NEXT: v_and_b32_e64 v40, v40, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v40, v3, v40 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v40 -; GCN-O0-NEXT: v_and_b32_e64 v39, v39, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v39, s7, v39 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v39 -; GCN-O0-NEXT: v_and_b32_e64 v38, v38, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v38, s6, v38 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v38 -; GCN-O0-NEXT: v_and_b32_e64 v37, v37, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v37, s5, v37 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v37 -; GCN-O0-NEXT: v_and_b32_e64 v36, v36, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v36, s4, v36 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v36 -; GCN-O0-NEXT: v_and_b32_e64 v35, v35, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v35, s3, v35 -; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v35 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v34, s2, v34 -; GCN-O0-NEXT: v_or_b32_e64 v35, v33, v34 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 4 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; 
GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v34, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v33, s8 -; GCN-O0-NEXT: flat_store_byte v[33:34], v35 -; GCN-O0-NEXT: v_and_b32_e64 v25, v25, v3 -; GCN-O0-NEXT: v_and_b32_e64 v32, v32, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v32, v3, v32 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v32 -; GCN-O0-NEXT: v_and_b32_e64 v31, v31, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v31, s7, v31 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v31 -; GCN-O0-NEXT: v_and_b32_e64 v30, v30, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v30, s6, v30 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v30 -; GCN-O0-NEXT: v_and_b32_e64 v29, v29, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v29, s5, v29 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v29 -; GCN-O0-NEXT: v_and_b32_e64 v28, v28, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v28, s4, v28 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v28 -; GCN-O0-NEXT: v_and_b32_e64 v27, v27, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v27, s3, v27 -; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v27 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v26, s2, v26 -; GCN-O0-NEXT: v_or_b32_e64 v27, v25, v26 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 3 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v26, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v25, s8 -; GCN-O0-NEXT: flat_store_byte v[25:26], v27 -; GCN-O0-NEXT: v_and_b32_e64 v17, v17, v3 -; GCN-O0-NEXT: v_and_b32_e64 v24, v24, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v24, v3, v24 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v24 -; GCN-O0-NEXT: v_and_b32_e64 v23, v23, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v23, s7, v23 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v23 -; GCN-O0-NEXT: v_and_b32_e64 v22, v22, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v22, s6, v22 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v22 -; GCN-O0-NEXT: v_and_b32_e64 v21, v21, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v21, s5, v21 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v21 -; GCN-O0-NEXT: v_and_b32_e64 v20, v20, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v20, s4, v20 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v20 -; GCN-O0-NEXT: v_and_b32_e64 v19, v19, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v19, s3, v19 -; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v19 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v18, s2, v18 -; GCN-O0-NEXT: v_or_b32_e64 v19, v17, v18 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 2 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v18, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v17, s8 -; GCN-O0-NEXT: flat_store_byte v[17:18], v19 -; GCN-O0-NEXT: v_and_b32_e64 v9, v9, v3 -; GCN-O0-NEXT: v_and_b32_e64 v16, v16, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v16, v3, v16 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v16 -; GCN-O0-NEXT: v_and_b32_e64 v15, v15, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v15, s7, v15 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v15 -; GCN-O0-NEXT: v_and_b32_e64 v14, v14, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v14, s6, v14 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v14 -; 
GCN-O0-NEXT: v_and_b32_e64 v13, v13, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v13, s5, v13 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v13 -; GCN-O0-NEXT: v_and_b32_e64 v12, v12, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v12, s4, v12 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v12 -; GCN-O0-NEXT: v_and_b32_e64 v11, v11, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v11, s3, v11 -; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v11 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v10, s2, v10 -; GCN-O0-NEXT: v_or_b32_e64 v11, v9, v10 -; GCN-O0-NEXT: s_mov_b64 s[12:13], 1 -; GCN-O0-NEXT: s_mov_b32 s8, s0 -; GCN-O0-NEXT: s_mov_b32 s9, s1 -; GCN-O0-NEXT: s_mov_b32 s11, s12 -; GCN-O0-NEXT: s_mov_b32 s10, s13 -; GCN-O0-NEXT: s_add_u32 s8, s8, s11 -; GCN-O0-NEXT: s_addc_u32 s10, s9, s10 -; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s9, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s8 -; GCN-O0-NEXT: flat_store_byte v[9:10], v11 -; GCN-O0-NEXT: s_waitcnt vmcnt(7) -; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3 -; GCN-O0-NEXT: v_and_b32_e64 v8, v8, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v8, v3, v8 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v8 -; GCN-O0-NEXT: v_and_b32_e64 v7, v7, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v7, s7, v7 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v7 -; GCN-O0-NEXT: v_and_b32_e64 v6, v6, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v6, s6, v6 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v6 -; GCN-O0-NEXT: v_and_b32_e64 v5, v5, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v5, s5, v5 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v5 -; GCN-O0-NEXT: v_and_b32_e64 v4, v4, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v4, s4, v4 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v4 -; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2 -; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2 -; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1 -; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s0 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s1 -; GCN-O0-NEXT: flat_store_byte v[0:1], v2 -; GCN-O0-NEXT: s_endpgm entry: %v = insertelement <128 x i1> %vec, i1 1, i32 %sel store <128 x i1> %v, ptr addrspace(1) %out @@ -5682,361 +1910,6 @@ define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) { ; GCN-NEXT: v_cndmask_b32_e64 v30, 1.0, v30, s[58:59] ; GCN-NEXT: v_cndmask_b32_e64 v31, 1.0, v31, s[60:61] ; GCN-NEXT: ; return to shader part epilog -; -; GCN-O0-LABEL: float32_inselt_vec: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN-O0-NEXT: s_mov_b32 s10, -1 -; GCN-O0-NEXT: s_mov_b32 s11, 0xe80000 -; GCN-O0-NEXT: s_add_u32 s8, s8, s0 -; GCN-O0-NEXT: s_addc_u32 s9, s9, 0 -; GCN-O0-NEXT: buffer_store_dword v32, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_mov_b32_e32 v32, v31 -; GCN-O0-NEXT: v_mov_b32_e32 v33, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v34, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v35, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v36, v27 -; GCN-O0-NEXT: v_mov_b32_e32 v37, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v38, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v39, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v40, v23 -; GCN-O0-NEXT: v_mov_b32_e32 v41, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v42, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v43, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v44, v19 -; GCN-O0-NEXT: v_mov_b32_e32 v45, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v46, v17 -; GCN-O0-NEXT: v_mov_b32_e32 v47, v16 -; GCN-O0-NEXT: v_mov_b32_e32 v48, v15 -; GCN-O0-NEXT: v_mov_b32_e32 v49, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v50, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v51, v12 
-; GCN-O0-NEXT: v_mov_b32_e32 v52, v11 -; GCN-O0-NEXT: v_mov_b32_e32 v53, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v54, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v55, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v56, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v57, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v58, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v59, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v60, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v61, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v62, v1 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v62 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v61 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v60 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v59 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v58 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v57 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v56 -; GCN-O0-NEXT: v_mov_b32_e32 v8, v55 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v54 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v53 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v52 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v51 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v50 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v49 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v48 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v47 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v46 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v42 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v41 -; GCN-O0-NEXT: v_mov_b32_e32 v23, v40 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v39 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v38 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v37 -; GCN-O0-NEXT: v_mov_b32_e32 v27, v36 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v35 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v31, v32 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 
offset:208 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill -; GCN-O0-NEXT: v_mov_b32_e32 v32, 1.0 -; GCN-O0-NEXT: buffer_store_dword v32, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr64 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v64, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v64, s1, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v64, off, s[8:11], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v21, off, 
s[8:11], 0 offset:88 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 -; GCN-O0-NEXT: .LBB22_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v64, off, s[8:11], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v64, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v64, 3 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v25, off, 
s[8:11], 0 offset:104 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s2, v33 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v33 -; GCN-O0-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GCN-O0-NEXT: s_mov_b32 m0, s2 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v32 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 
offset:380 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v64, s2, 2 -; GCN-O0-NEXT: v_writelane_b32 v64, s3, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v64, off, s[8:11], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execnz .LBB22_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], 
-1 -; GCN-O0-NEXT: buffer_load_dword v64, off, s[8:11], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v64, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v64, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v36, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v40, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v44, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v48, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v52, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v56, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v60, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(14) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v31 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v32 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v33 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v34 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v35 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v36 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v37 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v38 -; GCN-O0-NEXT: v_mov_b32_e32 v8, v39 -; GCN-O0-NEXT: 
v_mov_b32_e32 v9, v40 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v41 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v42 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v43 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v44 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v45 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v46 -; GCN-O0-NEXT: v_mov_b32_e32 v16, v47 -; GCN-O0-NEXT: v_mov_b32_e32 v17, v48 -; GCN-O0-NEXT: s_waitcnt vmcnt(13) -; GCN-O0-NEXT: v_mov_b32_e32 v18, v49 -; GCN-O0-NEXT: s_waitcnt vmcnt(12) -; GCN-O0-NEXT: v_mov_b32_e32 v19, v50 -; GCN-O0-NEXT: s_waitcnt vmcnt(11) -; GCN-O0-NEXT: v_mov_b32_e32 v20, v51 -; GCN-O0-NEXT: s_waitcnt vmcnt(10) -; GCN-O0-NEXT: v_mov_b32_e32 v21, v52 -; GCN-O0-NEXT: s_waitcnt vmcnt(9) -; GCN-O0-NEXT: v_mov_b32_e32 v22, v53 -; GCN-O0-NEXT: s_waitcnt vmcnt(8) -; GCN-O0-NEXT: v_mov_b32_e32 v23, v54 -; GCN-O0-NEXT: s_waitcnt vmcnt(7) -; GCN-O0-NEXT: v_mov_b32_e32 v24, v55 -; GCN-O0-NEXT: s_waitcnt vmcnt(6) -; GCN-O0-NEXT: v_mov_b32_e32 v25, v56 -; GCN-O0-NEXT: s_waitcnt vmcnt(5) -; GCN-O0-NEXT: v_mov_b32_e32 v26, v57 -; GCN-O0-NEXT: s_waitcnt vmcnt(4) -; GCN-O0-NEXT: v_mov_b32_e32 v27, v58 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_mov_b32_e32 v28, v59 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v29, v60 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v30, v61 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v31, v62 -; GCN-O0-NEXT: ; return to shader part epilog entry: %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel ret <32 x float> %v @@ -6072,1843 +1945,7 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { ; GCN-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc ; GCN-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: double8_inselt_vec: -; GCN-O0: ; %bb.0: ; %entry -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: v_mov_b32_e32 v17, v15 -; GCN-O0-NEXT: v_mov_b32_e32 v18, v14 -; GCN-O0-NEXT: v_mov_b32_e32 v19, v13 -; GCN-O0-NEXT: v_mov_b32_e32 v20, v12 -; GCN-O0-NEXT: v_mov_b32_e32 v21, v11 -; GCN-O0-NEXT: v_mov_b32_e32 v22, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v23, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v24, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v25, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v26, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v27, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v28, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v29, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v30, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v31, v1 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v31 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v30 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v29 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v28 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v27 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v26 -; GCN-O0-NEXT: v_mov_b32_e32 v7, v25 -; GCN-O0-NEXT: v_mov_b32_e32 v8, v24 -; GCN-O0-NEXT: v_mov_b32_e32 v9, v23 -; GCN-O0-NEXT: v_mov_b32_e32 v10, v22 -; GCN-O0-NEXT: v_mov_b32_e32 v11, v21 -; GCN-O0-NEXT: v_mov_b32_e32 v12, v20 -; GCN-O0-NEXT: v_mov_b32_e32 v13, v19 -; GCN-O0-NEXT: v_mov_b32_e32 v14, v18 -; GCN-O0-NEXT: v_mov_b32_e32 v15, v17 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-O0-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, 1 -; GCN-O0-NEXT: v_lshlrev_b32_e64 v16, s4, v16 -; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0 -; GCN-O0-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v33, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v33, s5, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s4 -; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v33, s4, 2 -; GCN-O0-NEXT: v_writelane_b32 v33, s5, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB23_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: 
s_or_saveexec_b64 s[10:11], -1 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v33, 4 -; GCN-O0-NEXT: v_readlane_b32 s5, v33, 5 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v17 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v17 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v16 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; 
GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v33, s6, 4 -; GCN-O0-NEXT: v_writelane_b32 v33, s7, 5 -; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB23_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v33, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v33, 3 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v33, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v33, 1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword 
v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s4, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v16, s4 -; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v33, s4, 6 -; GCN-O0-NEXT: v_writelane_b32 v33, s5, 7 -; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: s_waitcnt vmcnt(14) -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(14) -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(14) -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(14) -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB23_4: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v33, 8 -; GCN-O0-NEXT: v_readlane_b32 s5, v33, 9 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; 
GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v17 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v17 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded 
Spill -; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v33, s6, 8 -; GCN-O0-NEXT: v_writelane_b32 v33, s7, 9 -; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-O0-NEXT: ; %bb.5: -; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[10:11] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v33, 6 -; GCN-O0-NEXT: v_readlane_b32 s5, v33, 7 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.6: -; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(14) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v15 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v16 -; GCN-O0-NEXT: s_waitcnt vmcnt(13) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v17 -; GCN-O0-NEXT: s_waitcnt vmcnt(12) -; GCN-O0-NEXT: v_mov_b32_e32 v3, v18 -; GCN-O0-NEXT: s_waitcnt vmcnt(11) -; GCN-O0-NEXT: v_mov_b32_e32 v4, v19 -; GCN-O0-NEXT: s_waitcnt vmcnt(10) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v20 -; GCN-O0-NEXT: s_waitcnt vmcnt(9) -; GCN-O0-NEXT: v_mov_b32_e32 v6, v21 -; GCN-O0-NEXT: s_waitcnt vmcnt(8) -; GCN-O0-NEXT: v_mov_b32_e32 v7, v22 -; GCN-O0-NEXT: s_waitcnt vmcnt(7) -; GCN-O0-NEXT: v_mov_b32_e32 v8, v23 -; GCN-O0-NEXT: s_waitcnt vmcnt(6) -; GCN-O0-NEXT: v_mov_b32_e32 v9, v24 -; GCN-O0-NEXT: s_waitcnt vmcnt(5) -; GCN-O0-NEXT: v_mov_b32_e32 v10, v25 -; GCN-O0-NEXT: s_waitcnt vmcnt(4) -; GCN-O0-NEXT: v_mov_b32_e32 v11, v26 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_mov_b32_e32 v12, v27 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v13, 
v28 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v14, v29 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v15, v30 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] entry: %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel ret <8 x double> %v } - -define <3 x i32> @insert_dyn_i32_3(<3 x i32> inreg %arg, i32 %idx, i32 %val) { -; GCN-LABEL: insert_dyn_i32_3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v4 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_i32_3: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB24_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v4 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v4 -; 
GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v3 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB24_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v1, v3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v4 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <3 x i32> %arg, i32 %val, i32 %idx - ret <3 x i32> %x -} - -define <3 x i32> @insert_dyn_inreg_i32_3(<3 x i32> inreg %arg, i32 inreg %idx, i32 %val) { -; GCN-LABEL: insert_dyn_inreg_i32_3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s19, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s19, 1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s19, 2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v3 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_inreg_i32_3: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 -; GCN-O0-NEXT: s_mov_b32 m0, s19 -; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 
v3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v4 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <3 x i32> %arg, i32 %val, i32 %idx - ret <3 x i32> %x -} - -define <3 x float> @insert_dyn_float_3(<3 x float> inreg %arg, i32 %idx, float %val) { -; GCN-LABEL: insert_dyn_float_3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v4 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_float_3: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB26_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v4 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v4 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v3 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB26_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v2 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v1, v3 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v4 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <3 x float> %arg, float %val, i32 %idx - ret <3 x float> %x -} - -define <3 x float> @insert_dyn_inreg_float_3(<3 x float> inreg %arg, i32 inreg %idx, float %val) { -; GCN-LABEL: insert_dyn_inreg_float_3: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s19, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s19, 1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s19, 2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v3 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_inreg_float_3: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 -; GCN-O0-NEXT: s_mov_b32 m0, s19 -; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v2 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v3 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v4 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <3 x float> %arg, float %val, i32 %idx - ret <3 x float> %x -} - -define <5 x i32> @insert_dyn_i32_5(<5 x i32> inreg %arg, i32 %idx, i32 %val) { -; GCN-LABEL: insert_dyn_i32_5: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v6 -; GCN-NEXT: v_mov_b32_e32 v1, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_i32_5: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v9, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; 
GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v5 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB28_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(4) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_mov_b32_e32 v1, v5 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v6 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v3, v7 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v4, v8 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <5 x i32> %arg, i32 %val, i32 %idx - ret <5 x i32> %x -} - -define <5 x i32> @insert_dyn_inreg_i32_5(<5 x i32> inreg %arg, i32 inreg %idx, i32 %val) { -; GCN-LABEL: insert_dyn_inreg_i32_5: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s21, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 1 -; GCN-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: 
v_mov_b32_e32 v2, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 4 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_inreg_i32_5: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s8 -; GCN-O0-NEXT: s_mov_b32 m0, s21 -; GCN-O0-NEXT: v_movreld_b32_e32 v4, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v8 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <5 x i32> %arg, i32 %val, i32 %idx - ret <5 x i32> %x -} - -define <5 x float> @insert_dyn_float_5(<5 x float> inreg %arg, i32 %idx, float %val) { -; GCN-LABEL: insert_dyn_float_5: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v6 -; GCN-NEXT: v_mov_b32_e32 v1, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_float_5: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane -; GCN-O0-NEXT: 
v_writelane_b32 v9, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB30_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v5 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB30_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: 
-; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(4) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_mov_b32_e32 v1, v5 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v6 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v3, v7 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v4, v8 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <5 x float> %arg, float %val, i32 %idx - ret <5 x float> %x -} - -define <5 x float> @insert_dyn_inreg_float_5(<5 x float> inreg %arg, i32 inreg %idx, float %val) { -; GCN-LABEL: insert_dyn_inreg_float_5: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s21, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 1 -; GCN-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s21, 4 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_inreg_float_5: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s8 -; GCN-O0-NEXT: s_mov_b32 m0, s21 -; GCN-O0-NEXT: v_movreld_b32_e32 v4, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v8 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <5 x float> %arg, float %val, i32 %idx - ret <5 x float> %x -} - -define <6 x i32> @insert_dyn_i32_6(<6 x i32> inreg %arg, i32 %idx, i32 %val) { -; GCN-LABEL: insert_dyn_i32_6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 
v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v6 -; GCN-NEXT: v_mov_b32_e32 v1, v7 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_i32_6: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr11 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v11, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v11, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB32_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v11, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v11, 3 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v6 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v11, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v11, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB32_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v11, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v11, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(5) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 -; GCN-O0-NEXT: s_waitcnt vmcnt(4) -; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v3, v8 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <6 x i32> %arg, i32 %val, i32 %idx - ret <6 x i32> %x -} 
- -define <6 x i32> @insert_dyn_inreg_i32_6(<6 x i32> inreg %arg, i32 inreg %idx, i32 %val) { -; GCN-LABEL: insert_dyn_inreg_i32_6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s22, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 1 -; GCN-NEXT: v_cndmask_b32_e32 v6, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 4 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 5 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v6 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_inreg_i32_6: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 -; GCN-O0-NEXT: s_mov_b32 m0, s22 -; GCN-O0-NEXT: v_movreld_b32_e32 v5, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <6 x i32> %arg, i32 %val, i32 %idx - ret <6 x i32> %x -} - -define <6 x float> @insert_dyn_float_6(<6 x float> inreg %arg, i32 %idx, float %val) { -; GCN-LABEL: insert_dyn_float_6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v6 -; GCN-NEXT: v_mov_b32_e32 v1, v7 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_float_6: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; 
GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr11 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v11, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v11, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB34_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v11, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v11, 3 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v6 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, 
off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v11, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v11, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB34_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[22:23] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v11, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v11, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(5) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 -; GCN-O0-NEXT: s_waitcnt vmcnt(4) -; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v3, v8 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <6 x float> %arg, float %val, i32 %idx - ret <6 x float> %x -} - -define <6 x float> @insert_dyn_inreg_float_6(<6 x float> inreg %arg, i32 inreg %idx, float %val) { -; GCN-LABEL: insert_dyn_inreg_float_6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s22, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 1 -; GCN-NEXT: v_cndmask_b32_e32 v6, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 4 -; GCN-NEXT: v_cndmask_b32_e32 
v3, v3, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s22, 5 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v6 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_inreg_float_6: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s4 -; GCN-O0-NEXT: s_mov_b32 m0, s22 -; GCN-O0-NEXT: v_movreld_b32_e32 v5, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v10 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <6 x float> %arg, float %val, i32 %idx - ret <6 x float> %x -} - -define <7 x i32> @insert_dyn_i32_7(<7 x i32> inreg %arg, i32 %idx, i32 %val) { -; GCN-LABEL: insert_dyn_i32_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v8, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v8 -; GCN-NEXT: v_mov_b32_e32 v1, v7 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_i32_7: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: s_mov_b32 s10, s22 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 
killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v13, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v13, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB36_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v13, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v13, 3 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v8 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v8 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v7 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v13, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v13, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB36_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v13, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v13, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(6) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v6 -; GCN-O0-NEXT: s_waitcnt vmcnt(5) -; GCN-O0-NEXT: v_mov_b32_e32 v1, v7 -; GCN-O0-NEXT: s_waitcnt vmcnt(4) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v8 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_mov_b32_e32 v3, v9 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v4, v10 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v11 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <7 x i32> %arg, i32 %val, i32 %idx - ret <7 x i32> %x -} - -define <7 x i32> @insert_dyn_inreg_i32_7(<7 x i32> inreg %arg, i32 inreg %idx, i32 %val) { -; GCN-LABEL: insert_dyn_inreg_i32_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s23, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 1 -; GCN-NEXT: v_cndmask_b32_e32 v7, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 4 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 5 
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 6 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v7 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_inreg_i32_7: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: s_mov_b32 s10, s22 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: s_mov_b32 m0, s23 -; GCN-O0-NEXT: v_movreld_b32_e32 v6, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v11 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <7 x i32> %arg, i32 %val, i32 %idx - ret <7 x i32> %x -} - -define <7 x float> @insert_dyn_float_7(<7 x float> inreg %arg, i32 %idx, float %val) { -; GCN-LABEL: insert_dyn_float_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v8, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v8 -; GCN-NEXT: v_mov_b32_e32 v1, v7 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_float_7: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; 
GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: s_mov_b32 s10, s22 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s10 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane -; GCN-O0-NEXT: v_writelane_b32 v13, s4, 0 -; GCN-O0-NEXT: v_writelane_b32 v13, s5, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-O0-NEXT: .LBB38_1: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v13, 2 -; GCN-O0-NEXT: v_readlane_b32 s5, v13, 3 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readfirstlane_b32 s6, v8 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v8 -; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GCN-O0-NEXT: s_mov_b32 m0, s6 -; GCN-O0-NEXT: v_movreld_b32_e32 v0, v7 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v13, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v13, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB38_1 -; GCN-O0-NEXT: ; %bb.2: -; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[26:27] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v13, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v13, 1 -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; %bb.3: -; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(6) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v6 -; GCN-O0-NEXT: s_waitcnt vmcnt(5) -; GCN-O0-NEXT: v_mov_b32_e32 v1, v7 -; GCN-O0-NEXT: s_waitcnt vmcnt(4) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v8 -; GCN-O0-NEXT: s_waitcnt vmcnt(3) -; GCN-O0-NEXT: v_mov_b32_e32 v3, v9 -; GCN-O0-NEXT: s_waitcnt vmcnt(2) -; GCN-O0-NEXT: v_mov_b32_e32 v4, v10 -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v11 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <7 x float> %arg, float %val, i32 %idx - ret <7 x float> %x -} - -define <7 x float> @insert_dyn_inreg_float_7(<7 x float> inreg %arg, i32 inreg %idx, float %val) { -; GCN-LABEL: insert_dyn_inreg_float_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s23, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 1 -; GCN-NEXT: v_cndmask_b32_e32 v7, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 2 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: s_cselect_b64 vcc, 
-1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 4 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 5 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s23, 6 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, v7 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: insert_dyn_inreg_float_7: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s4, s16 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: s_mov_b32 s5, s17 -; GCN-O0-NEXT: s_mov_b32 s6, s18 -; GCN-O0-NEXT: s_mov_b32 s7, s19 -; GCN-O0-NEXT: s_mov_b32 s8, s20 -; GCN-O0-NEXT: s_mov_b32 s9, s21 -; GCN-O0-NEXT: s_mov_b32 s10, s22 -; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 -; GCN-O0-NEXT: v_mov_b32_e32 v12, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v11, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v10, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v9, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v8, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v7, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: s_mov_b32 m0, s23 -; GCN-O0-NEXT: v_movreld_b32_e32 v6, v0 -; GCN-O0-NEXT: v_mov_b32_e32 v0, v6 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v7 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, v10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, v11 -; GCN-O0-NEXT: v_mov_b32_e32 v6, v12 -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %x = insertelement <7 x float> %arg, float %val, i32 %idx - ret <7 x float> %x -} From 0b522d9e8e450a38fea1ac14b7ec67f682f43725 Mon Sep 17 00:00:00 2001 From: Charles Zablit Date: Thu, 11 Dec 2025 10:16:01 +0000 Subject: [PATCH 42/49] Revert "[lldb] fix failing tests due to CI diagnostics rendering (#171791) --- lldb/include/lldb/Host/Terminal.h | 12 ------------ .../lldb/Host/common/DiagnosticsRendering.h | 19 +------------------ .../Host/common/DiagnosticsRendering.cpp | 11 ++++++----- lldb/source/Host/common/Terminal.cpp | 16 ---------------- .../Shell/Commands/command-dwim-print.test | 6 +++--- .../Commands/command-expr-diagnostics.test | 10 +++++----- lldb/test/Shell/Commands/command-options.test | 6 +++--- .../Host/common/DiagnosticsRenderingTest.cpp | 2 +- 8 files changed, 19 insertions(+), 63 deletions(-) diff --git a/lldb/include/lldb/Host/Terminal.h b/lldb/include/lldb/Host/Terminal.h index 3d66515c18812..da0d05e8bd265 100644 --- a/lldb/include/lldb/Host/Terminal.h +++ b/lldb/include/lldb/Host/Terminal.h @@ -68,18 +68,6 @@ class Terminal { llvm::Error SetHardwareFlowControl(bool enabled); - /// Returns whether or not the current terminal supports Unicode rendering. - /// - /// The value is cached after the first computation. - /// - /// On POSIX systems, we check if the LANG environment variable contains the - /// substring "UTF-8", case insensitive. - /// - /// On Windows, we always return true since we use the `WriteConsoleW` API - /// internally. Note that the default Windows codepage (437) does not support - /// all Unicode characters. This function does not check the codepage. 
- static bool SupportsUnicode(); - protected: struct Data; diff --git a/lldb/include/lldb/Host/common/DiagnosticsRendering.h b/lldb/include/lldb/Host/common/DiagnosticsRendering.h index 3eea0647da37e..dd33d671c24a5 100644 --- a/lldb/include/lldb/Host/common/DiagnosticsRendering.h +++ b/lldb/include/lldb/Host/common/DiagnosticsRendering.h @@ -59,27 +59,10 @@ struct DiagnosticDetail { StructuredData::ObjectSP Serialize(llvm::ArrayRef details); -/// Renders an array of DiagnosticDetail instances. -/// -/// \param[in] stream -/// The stream to render the diagnostics to. -/// \param offset_in_command -/// An optional offset to the column position of the diagnostic in the -/// source. -/// \param show_inline -/// Whether to show the diagnostics inline. -/// \param details -/// The array of DiagnosticsDetail to render. -/// \param force_ascii -/// Whether to force ascii rendering. If false, Unicode characters will be -/// used if the output file supports them. -/// -/// \see lldb_private::Terminal::SupportsUnicode void RenderDiagnosticDetails(Stream &stream, std::optional offset_in_command, bool show_inline, - llvm::ArrayRef details, - bool force_ascii = false); + llvm::ArrayRef details); class DiagnosticError : public llvm::ErrorInfo { diff --git a/lldb/source/Host/common/DiagnosticsRendering.cpp b/lldb/source/Host/common/DiagnosticsRendering.cpp index 2c9d33a6c325c..f2cd3968967fb 100644 --- a/lldb/source/Host/common/DiagnosticsRendering.cpp +++ b/lldb/source/Host/common/DiagnosticsRendering.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "lldb/Host/common/DiagnosticsRendering.h" -#include "lldb/Host/Terminal.h" - #include using namespace lldb_private; @@ -87,8 +85,7 @@ static llvm::raw_ostream &PrintSeverity(Stream &stream, void RenderDiagnosticDetails(Stream &stream, std::optional offset_in_command, bool show_inline, - llvm::ArrayRef details, - bool force_ascii) { + llvm::ArrayRef details) { if (details.empty()) return; @@ -100,8 +97,12 @@ void RenderDiagnosticDetails(Stream &stream, return; } + // Since there is no other way to find this out, use the color + // attribute as a proxy for whether the terminal supports Unicode + // characters. In the future it might make sense to move this into + // Host so it can be customized for a specific platform. 
llvm::StringRef cursor, underline, vbar, joint, hbar, spacer; - if (Terminal::SupportsUnicode() && !force_ascii) { + if (stream.AsRawOstream().colors_enabled()) { cursor = "˄"; underline = "˜"; vbar = "│"; diff --git a/lldb/source/Host/common/Terminal.cpp b/lldb/source/Host/common/Terminal.cpp index d3647835e3937..436dfd8130d9b 100644 --- a/lldb/source/Host/common/Terminal.cpp +++ b/lldb/source/Host/common/Terminal.cpp @@ -400,22 +400,6 @@ llvm::Error Terminal::SetHardwareFlowControl(bool enabled) { #endif // LLDB_ENABLE_TERMIOS } -bool Terminal::SupportsUnicode() { - static std::optional g_result; - if (g_result) - return g_result.value(); -#ifdef _WIN32 - return true; -#else - const char *lang_var = std::getenv("LANG"); - if (!lang_var) - return false; - g_result = - llvm::StringRef(lang_var).lower().find("utf-8") != std::string::npos; -#endif - return g_result.value(); -} - TerminalState::TerminalState(Terminal term, bool save_process_group) : m_tty(term) { Save(term, save_process_group); diff --git a/lldb/test/Shell/Commands/command-dwim-print.test b/lldb/test/Shell/Commands/command-dwim-print.test index 88e7314976ad8..9153edbd21791 100644 --- a/lldb/test/Shell/Commands/command-dwim-print.test +++ b/lldb/test/Shell/Commands/command-dwim-print.test @@ -1,16 +1,16 @@ # RUN: echo quit | %lldb -o "dwim-print a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK1 # (lldb) dwim-print a -# CHECK1:{{^ (\^|˄)}} +# CHECK1:{{^ \^}} # CHECK1: {{^ error: use of undeclared identifier 'a'}} # RUN: echo quit | %lldb -o "p a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK2 # (lldb) p a -# CHECK2:{{^ (\^|˄)}} +# CHECK2:{{^ \^}} # RUN: echo quit | %lldb -o "dwim-print -- a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK3 # (lldb) dwim-print -- a -# CHECK3:{{^ (\^|˄)}} +# CHECK3:{{^ \^}} # RUN: echo quit | %lldb -o "settings set show-inline-diagnostics false" \ # RUN: -o "dwim-print a" 2>&1 | FileCheck %s --check-prefix=CHECK4 # CHECK4: error: :1:1: use of undeclared identifier diff --git a/lldb/test/Shell/Commands/command-expr-diagnostics.test b/lldb/test/Shell/Commands/command-expr-diagnostics.test index cde0e6c6768f7..3c827fb4516ec 100644 --- a/lldb/test/Shell/Commands/command-expr-diagnostics.test +++ b/lldb/test/Shell/Commands/command-expr-diagnostics.test @@ -2,19 +2,19 @@ # RUN: echo quit | %lldb -o "expression a+b" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK1 # (lldb) expression a+b -# CHECK1:{{^ (\^|˄) (\^|˄)}} -# CHECK1: {{^ (\||│) error: use of undeclared identifier 'b'}} +# CHECK1:{{^ \^ \^}} +# CHECK1: {{^ | error: use of undeclared identifier 'b'}} # CHECK1: {{^ error: use of undeclared identifier 'a'}} # RUN: echo quit | %lldb -o "expr a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK2 # (lldb) expr a -# CHECK2:{{^ (\^|˄)}} +# CHECK2:{{^ \^}} # RUN: echo quit | %lldb -o "expr -i 0 -o 0 -- a" \ # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK3 # (lldb) expr -i 0 -o 0 -- a -# CHECK3:{{^ (\^|˄)}} +# CHECK3:{{^ \^}} # CHECK3: {{^ error: use of undeclared identifier 'a'}} # RUN: echo "int main(){return 0;}">%t.c @@ -23,7 +23,7 @@ # RUN: "expr --top-level -- template T FOO(T x) { return x/2;}" -o \ # RUN: "expression -- FOO(\"\")" 2>&1 | FileCheck %s --check-prefix=CHECK4 # (lldb) expression -- FOO("") -# CHECK4:{{^ (\^|˄)}} +# CHECK4:{{^ \^}} # CHECK4: {{^ note: in instantiation of function template}} # CHECK4: error: details) { StreamString stream; - RenderDiagnosticDetails(stream, 0, true, 
details, /*force_ascii=*/true); + RenderDiagnosticDetails(stream, 0, true, details); return stream.GetData(); } } // namespace From 4f5071ffe3772c4e04df81b2ee474548ea430815 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 11 Dec 2025 11:37:22 +0100 Subject: [PATCH 43/49] InstCombine: Add baseline test for #64697 fmul reassociation (#171725) Currently fmul is not reassociated unless it has nsz, although this should be unnecessary. --- .../InstCombine/issue64967-reassoc-fmul.ll | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll diff --git a/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll b/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll new file mode 100644 index 0000000000000..16f9cf2dd64c5 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +; Show that unlike fadd, fmul does not require nsz to be reassociated. + +; Can't reassociate anyway +define float @fmul(float %x) { +; CHECK-LABEL: define float @fmul( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL0:%.*]] = fmul float [[X]], 2.000000e+00 +; CHECK-NEXT: [[FMUL1:%.*]] = fmul float [[FMUL0]], 4.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul float %x, 2.0 + %fmul1 = fmul float %fmul0, 4.0 + ret float %fmul1 +} + +; Should be able to reassociate without nsz +; (+0 * 2) * 4 = +0 +; (-0 * 2) * 4 = -0 + +; (+0 * 8) = +0 +; (-0 * 8) = -0 +define float @fmul_reassoc(float %x) { +; CHECK-LABEL: define float @fmul_reassoc( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], 2.000000e+00 +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], 4.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul reassoc float %x, 2.0 + %fmul1 = fmul reassoc float %fmul0, 4.0 + ret float %fmul1 +} + +define <2 x float> @fmul_reassoc_v2(<2 x float> %x) { +; CHECK-LABEL: define <2 x float> @fmul_reassoc_v2( +; CHECK-SAME: <2 x float> [[X:%.*]]) { +; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc <2 x float> [[X]], splat (float 2.000000e+00) +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc <2 x float> [[FMUL0]], splat (float 4.000000e+00) +; CHECK-NEXT: ret <2 x float> [[FMUL1]] +; + %fmul0 = fmul reassoc <2 x float> %x, splat (float 2.0) + %fmul1 = fmul reassoc <2 x float> %fmul0, splat (float 4.0) + ret <2 x float> %fmul1 +} + +; (+0 * 2) * -4 = -0 +; (-0 * 2) * -4 = +0 + +; (+0 * -8) = -0 +; (-0 * -8) = +0 +define float @fmul_reassoc_negative_0(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_negative_0( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], 2.000000e+00 +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], -4.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul reassoc float %x, 2.0 + %fmul1 = fmul reassoc float %fmul0, -4.0 + ret float %fmul1 +} + +; (+0 * -2) * 4 = -0 +; (-0 * -2) * 4 = +0 + +; (+0 * -8) = -0 +; (-0 * -8) = +0 +define float @fmul_reassoc_negative_1(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_negative_1( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], -2.000000e+00 +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], 4.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul reassoc float %x, -2.0 + %fmul1 = fmul 
reassoc float %fmul0, 4.0 + ret float %fmul1 +} + +; Does reassociate already, unnecessarily requires nsz on both multiplies. +define float @fmul_reassoc_nsz(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_nsz( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc nsz float [[X]], 8.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul nsz reassoc float %x, 2.0 + %fmul1 = fmul nsz reassoc float %fmul0, 4.0 + ret float %fmul1 +} + +define float @fmul_reassoc_posk_neg0(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_posk_neg0( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], 4.000000e+00 +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], -0.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul reassoc float %x, 4.0 + %fmul1 = fmul reassoc float %fmul0, -0.0 + ret float %fmul1 +} + +define float @fmul_reassoc_neg0_posk(float %x) { +; CHECK-LABEL: define float @fmul_reassoc_neg0_posk( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], -0.000000e+00 +; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], 4.000000e+00 +; CHECK-NEXT: ret float [[FMUL1]] +; + %fmul0 = fmul reassoc float %x, -0.0 + %fmul1 = fmul reassoc float %fmul0, 4.0 + ret float %fmul1 +} From f8d1f53bb68327c4beba83379385a25e3f1152f3 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Thu, 11 Dec 2025 10:49:13 +0000 Subject: [PATCH 44/49] [mlir][scf] Add value bound for computed upper bound of forall loop (#171158) Add additional bound for the induction variable of the scf.forall such that: %iv <= %lower_bound + (%trip_count - 1) * step Same as https://github.com/llvm/llvm-project/pull/126426 but for scf.forall loop --- .../SCF/IR/ValueBoundsOpInterfaceImpl.cpp | 63 ++++++++++--------- .../SCF/value-bounds-op-interface-impl.mlir | 9 +++ 2 files changed, 43 insertions(+), 29 deletions(-) diff --git a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp index 410a6bffd345e..496a7b036e65d 100644 --- a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp @@ -17,19 +17,36 @@ namespace mlir { namespace scf { namespace { +static AffineExpr getTripCountExpr(OpFoldResult lb, OpFoldResult ub, + OpFoldResult step, + ValueBoundsConstraintSet &cstr) { + AffineExpr lbExpr = cstr.getExpr(lb); + AffineExpr ubExpr = cstr.getExpr(ub); + AffineExpr stepExpr = cstr.getExpr(step); + AffineExpr tripCountExpr = + AffineExpr(ubExpr - lbExpr).ceilDiv(stepExpr); // (ub - lb) / step + return tripCountExpr; +} + +static void populateIVBounds(OpFoldResult lb, OpFoldResult ub, + OpFoldResult step, Value iv, + ValueBoundsConstraintSet &cstr) { + cstr.bound(iv) >= cstr.getExpr(lb); + cstr.bound(iv) < cstr.getExpr(ub); + // iv <= lb + ((ub-lb)/step - 1) * step + // This bound does not replace the `iv < ub` constraint mentioned above, + // since constraints involving the multiplication of two constraint set + // dimensions are not supported. 
+ AffineExpr tripCountMinusOne = + getTripCountExpr(lb, ub, step, cstr) - cstr.getExpr(1); + AffineExpr computedUpperBound = + cstr.getExpr(lb) + AffineExpr(tripCountMinusOne * cstr.getExpr(step)); + cstr.bound(iv) <= computedUpperBound; +} + struct ForOpInterface : public ValueBoundsOpInterface::ExternalModel { - static AffineExpr getTripCountExpr(scf::ForOp forOp, - ValueBoundsConstraintSet &cstr) { - AffineExpr lbExpr = cstr.getExpr(forOp.getLowerBound()); - AffineExpr ubExpr = cstr.getExpr(forOp.getUpperBound()); - AffineExpr stepExpr = cstr.getExpr(forOp.getStep()); - AffineExpr tripCountExpr = - AffineExpr(ubExpr - lbExpr).ceilDiv(stepExpr); // (ub - lb) / step - return tripCountExpr; - } - /// Populate bounds of values/dimensions for iter_args/OpResults. If the /// value/dimension size does not change in an iteration, we can deduce that /// it the same as the initial value/dimension. @@ -87,7 +104,8 @@ struct ForOpInterface // `value` is result of `forOp`, we can prove that: // %result == %init_arg + trip_count * (%yielded_value - %iter_arg). // Where trip_count is (ub - lb) / step. - AffineExpr tripCountExpr = getTripCountExpr(forOp, cstr); + AffineExpr tripCountExpr = getTripCountExpr( + forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep(), cstr); AffineExpr oneIterAdvanceExpr = cstr.getExpr(yieldedValue) - cstr.getExpr(iterArg); cstr.bound(value) == @@ -99,19 +117,8 @@ struct ForOpInterface auto forOp = cast(op); if (value == forOp.getInductionVar()) { - cstr.bound(value) >= forOp.getLowerBound(); - cstr.bound(value) < forOp.getUpperBound(); - // iv <= lb + ((ub-lb)/step - 1) * step - // This bound does not replace the `iv < ub` constraint mentioned above, - // since constraints involving the multiplication of two constraint set - // dimensions are not supported. - AffineExpr tripCountMinusOne = - getTripCountExpr(forOp, cstr) - cstr.getExpr(1); - AffineExpr computedUpperBound = - cstr.getExpr(forOp.getLowerBound()) + - AffineExpr(tripCountMinusOne * cstr.getExpr(forOp.getStep())); - cstr.bound(value) <= computedUpperBound; - return; + return populateIVBounds(forOp.getLowerBound(), forOp.getUpperBound(), + forOp.getStep(), value, cstr); } // Handle iter_args and OpResults. @@ -141,11 +148,9 @@ struct ForallOpInterface assert(blockArg.getArgNumber() < forallOp.getInductionVars().size() && "expected index value to be an induction var"); int64_t idx = blockArg.getArgNumber(); - // TODO: Take into account step size. 
- AffineExpr lb = cstr.getExpr(forallOp.getMixedLowerBound()[idx]); - AffineExpr ub = cstr.getExpr(forallOp.getMixedUpperBound()[idx]); - cstr.bound(value) >= lb; - cstr.bound(value) < ub; + return populateIVBounds(forallOp.getMixedLowerBound()[idx], + forallOp.getMixedUpperBound()[idx], + forallOp.getMixedStep()[idx], value, cstr); } void populateBoundsForShapedValueDim(Operation *op, Value value, int64_t dim, diff --git a/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir index 339d97df001c5..60fe96d52d20b 100644 --- a/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir +++ b/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir @@ -379,3 +379,12 @@ func.func @scf_for_result_infer_dynamic_init_big_step(%i : index) { "test.compare"(%0, %7) {cmp = "LE"} : (index, index) -> () return } + +func.func @scf_forall_computed_upper_bound(%x: index) { + %c6 = arith.constant 6 : index + scf.forall (%iv) = (0) to (8) step (3) { + // expected-remark @below{{true}} + "test.compare"(%iv, %c6) {cmp = "LE"} : (index, index) -> () + } + return +} From 8af88a45ca2720b0d32da19721d6a6e80a30d197 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Thu, 11 Dec 2025 16:28:12 +0530 Subject: [PATCH 45/49] [MLIR][NVVM] Update PMEvent lowering to intrinsics (#171649) The patch updates the lowering of `id` based pmevent also to intrinsics. The mask is simply (1 << event-id). Signed-off-by: Durgadoss R --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 20 ++++++++-------- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 19 +++++++++++++++ .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 15 ------------ mlir/test/Target/LLVMIR/nvvm/pm_event.mlir | 23 +++++++++++++++++++ .../Target/LLVMIR/nvvm/pm_event_invalid.mlir | 21 +++++++++++++++++ mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 21 ----------------- mlir/test/Target/LLVMIR/nvvmir.mlir | 11 --------- 7 files changed, 73 insertions(+), 57 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/nvvm/pm_event.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/pm_event_invalid.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index a0a00513d7da5..51d310970fda9 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -543,7 +543,7 @@ def NVVM_NanosleepOp : NVVM_Op<"nanosleep">, // NVVM Performance Monitor events //===----------------------------------------------------------------------===// -def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">, +def NVVM_PMEventOp : NVVM_Op<"pmevent">, Arguments<(ins OptionalAttr:$maskedEventId, OptionalAttr:$eventId)> { let summary = "Trigger one or more Performance Monitor events."; @@ -561,20 +561,20 @@ def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">, [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent) }]; - string llvmBuilder = [{ - llvm::Value *mId = builder.getInt16(* $maskedEventId); - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_pm_event_mask, {mId}); - }]; - let assemblyFormat = "attr-dict (`id` `=` $eventId^)? 
(`mask` `=` $maskedEventId^)?"; + let hasVerifier = 1; let extraClassDeclaration = [{ - bool hasIntrinsic() { return !getEventId(); } + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { return std::string("pmevent %0;"); } + + string llvmBuilder = [{ + auto [id, args] = NVVM::PMEventOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; - let hasVerifier = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 89197ec2f50b6..fd84ed6399d5d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -2476,6 +2476,25 @@ mlir::NVVM::IDArgPair NVVM::BarrierOp::getIntrinsicIDAndArgs( return {id, std::move(args)}; } +mlir::NVVM::IDArgPair +PMEventOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + llvm::Type *i16Ty = llvm::Type::getInt16Ty(mt.getLLVMContext()); + + // With event-id, mask is generated as (1 << event-id) + llvm::Value *maskVal; + if (auto eventAttr = thisOp.getEventIdAttr()) { + uint16_t mask = static_cast(1u << eventAttr.getInt()); + maskVal = llvm::ConstantInt::get(i16Ty, mask); + } else { + maskVal = + llvm::ConstantInt::get(i16Ty, thisOp.getMaskedEventIdAttr().getValue()); + } + + return {llvm::Intrinsic::nvvm_pm_event_mask, {maskVal}}; +} + mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast(op); diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 8fb36ace2c463..c4b8e93b6a9f9 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -678,21 +678,6 @@ llvm.func @inline_ptx_multi_rw_r(%a : i32, %b : i32, %rw_c : f32, %rw_d : f32) llvm.return %r5 : f32 } - -// ----- - -// CHECK-LABEL: @nvvm_pmevent -llvm.func @nvvm_pmevent() { - // CHECK: %[[S0:.+]] = llvm.mlir.constant(10 : i32) : i32 - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "pmevent $0;", "n" %[[S0]] : (i32) -> () - - nvvm.pmevent id = 10 - // CHECK: %[[S1:.+]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "pmevent $0;", "n" %[[S1]] : (i32) -> () - nvvm.pmevent id = 4 - llvm.return -} - // ----- llvm.func @inline_ptx_pack_4i8(%src : vector<4xi8>, %mask : i32, %zero: i32) { diff --git a/mlir/test/Target/LLVMIR/nvvm/pm_event.mlir b/mlir/test/Target/LLVMIR/nvvm/pm_event.mlir new file mode 100644 index 0000000000000..0092d32319a83 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/pm_event.mlir @@ -0,0 +1,23 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @nvvm_pmevent_mask() { + // CHECK-LABEL: define void @nvvm_pmevent_mask() { + // CHECK-NEXT: call void @llvm.nvvm.pm.event.mask(i16 15000) + // CHECK-NEXT: call void @llvm.nvvm.pm.event.mask(i16 4) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.pmevent mask = 15000 + nvvm.pmevent mask = 4 + llvm.return +} + +llvm.func @nvvm_pmevent_id() { + // CHECK-LABEL: define void @nvvm_pmevent_id() { + // CHECK-NEXT: call void @llvm.nvvm.pm.event.mask(i16 1024) + // CHECK-NEXT: call void 
@llvm.nvvm.pm.event.mask(i16 16) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.pmevent id = 10 + nvvm.pmevent id = 4 + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/pm_event_invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/pm_event_invalid.mlir new file mode 100644 index 0000000000000..783988fb36368 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/pm_event_invalid.mlir @@ -0,0 +1,21 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +llvm.func @pmevent_no_id() { + // expected-error @below {{either `id` or `mask` must be set}} + nvvm.pmevent +} + +// ----- + +llvm.func @pmevent_bigger15() { + // expected-error @below {{`id` must be between 0 and 15}} + nvvm.pmevent id = 16 +} + +// ----- + +llvm.func @pmevent_many_ids() { + // expected-error @below {{`id` and `mask` cannot be set at the same time}} + nvvm.pmevent id = 1 mask = 1 +} + diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir index d5868ee73cc50..c0fe0fa11f497 100644 --- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir @@ -1,26 +1,5 @@ // RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s -llvm.func @pmevent_no_id() { - // expected-error @below {{either `id` or `mask` must be set}} - nvvm.pmevent -} - -// ----- - -llvm.func @pmevent_bigger15() { - // expected-error @below {{`id` must be between 0 and 15}} - nvvm.pmevent id = 141 -} - -// ----- - -llvm.func @pmevent_many_ids() { - // expected-error @below {{`id` and `mask` cannot be set at the same time}} - nvvm.pmevent id = 1 mask = 1 -} - -// ----- - llvm.func @kernel_func(%numberOfThreads : i32) { // expected-error @below {{'nvvm.barrier' op barrier id is missing, it should be set between 0 to 15}} nvvm.barrier number_of_threads = %numberOfThreads diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index c4a69097692cb..9e4aadac69896 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -903,17 +903,6 @@ llvm.func @nvvm_dot_accumulate_2way(%a: vector<2xi16>, %b: vector<4xi8>, %c: i32 // ----- -// CHECK-LABEL: @nvvm_pmevent -llvm.func @nvvm_pmevent() { - // CHECK: call void @llvm.nvvm.pm.event.mask(i16 15000) - nvvm.pmevent mask = 15000 - // CHECK: call void @llvm.nvvm.pm.event.mask(i16 4) - nvvm.pmevent mask = 4 - llvm.return -} - -// ----- - // CHECK-LABEL: @nanosleep llvm.func @nanosleep(%duration: i32) { // CHECK: call void @llvm.nvvm.nanosleep(i32 %{{.*}}) From fbc121ce1ee398176bdfeb482b0549e03a2bb5c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gergely=20B=C3=A1lint?= Date: Thu, 11 Dec 2025 11:58:24 +0100 Subject: [PATCH 46/49] [BOLT][BTI] Add MCPlusBuilder::insertBTI (#167329) This function contains most of the logic for BTI: - it takes the BasicBlock and the instruction used to jump to it. - Then it checks if the first non-pseudo instruction is a sufficient landing pad for the used call. - if not, it generates the correct BTI instruction. Also introduce the isCallCoveredByBTI helper to simplify the logic. 
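As a rough illustration only (not part of this patch), a CFG-rewriting pass could drive the two new hooks as in the sketch below. BinaryContext, BinaryFunction, BinaryBasicBlock and MCPlusBuilder are the existing BOLT types; collectIndirectTargets is a made-up placeholder for whatever target resolution the calling pass already performs.

    // Hypothetical driver: make every known target of an indirect call or
    // branch start with a landing pad that accepts that transfer.
    #include "bolt/Core/BinaryBasicBlock.h"
    #include "bolt/Core/BinaryContext.h"
    #include "bolt/Core/BinaryFunction.h"
    #include "bolt/Core/MCPlusBuilder.h"
    #include "llvm/MC/MCInst.h"
    #include <vector>

    using namespace llvm;
    using namespace llvm::bolt;

    // Placeholder, not a real BOLT API: resolve the targets of \p Inst.
    std::vector<BinaryBasicBlock *> collectIndirectTargets(BinaryContext &BC,
                                                           const MCInst &Inst);

    static void hardenIndirectTargets(BinaryContext &BC, BinaryFunction &BF) {
      MCPlusBuilder &MIB = *BC.MIB;
      for (BinaryBasicBlock &BB : BF) {
        for (MCInst &Inst : BB) {
          if (!MIB.isIndirectCall(Inst) && !MIB.isIndirectBranch(Inst))
            continue;
          // insertBTI is a no-op when isCallCoveredByBTI already holds for
          // the first non-pseudo instruction of the target; otherwise it
          // inserts a BTI c / BTI j or widens an existing pad to BTI jc.
          for (BinaryBasicBlock *Target : collectIndirectTargets(BC, Inst))
            MIB.insertBTI(*Target, Inst);
        }
      }
    }

The unit tests added by this patch exercise the same decision table directly on single blocks (BR x16 vs. BR x5 vs. BLR, with and without an existing BTI or PACIASP).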
--- bolt/include/bolt/Core/MCPlusBuilder.h | 13 ++ .../Target/AArch64/AArch64MCPlusBuilder.cpp | 75 +++++++++++ bolt/unittests/Core/MCPlusBuilder.cpp | 116 ++++++++++++++++++ 3 files changed, 204 insertions(+) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index a318ef0b6bd68..6d0ba466347c1 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1894,6 +1894,19 @@ class MCPlusBuilder { llvm_unreachable("not implemented"); } + /// Checks if the indirect call / jump is accepted by the landing pad at the + /// start of the target BasicBlock. + virtual bool isCallCoveredByBTI(MCInst &Call, MCInst &Pad) const { + llvm_unreachable("not implemented"); + return false; + } + + /// Inserts a BTI landing pad to the start of the BB, that matches the + /// indirect call inst used to call the BB. + virtual void insertBTI(BinaryBasicBlock &BB, MCInst &Call) const { + llvm_unreachable("not implemented"); + } + /// Store \p Target absolute address to \p RegName virtual InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx, diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 5881d3fba70f6..729f854736f6c 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2806,6 +2806,81 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { Inst.addOperand(MCOperand::createImm(HintNum)); } + bool isCallCoveredByBTI(MCInst &Call, MCInst &Pad) const override { + assert((isIndirectCall(Call) || isIndirectBranch(Call)) && + "Not an indirect call or branch."); + + // A BLR can be accepted by a BTI c. + if (isIndirectCall(Call)) + return isBTILandingPad(Pad, true, false) || + isBTILandingPad(Pad, true, true); + + // A BR can be accepted by a BTI j or BTI c (and BTI jc) IF the operand is + // x16 or x17. If the operand is not x16 or x17, it can be accepted by a BTI + // j or BTI jc (and not BTI c). + if (isIndirectBranch(Call)) { + assert(Call.getNumOperands() == 1 && + "Indirect branch needs to have 1 operand."); + assert(Call.getOperand(0).isReg() && + "Indirect branch does not have a register operand."); + MCPhysReg Reg = Call.getOperand(0).getReg(); + if (Reg == AArch64::X16 || Reg == AArch64::X17) + return isBTILandingPad(Pad, true, false) || + isBTILandingPad(Pad, false, true) || + isBTILandingPad(Pad, true, true); + return isBTILandingPad(Pad, false, true) || + isBTILandingPad(Pad, true, true); + } + return false; + } + + void insertBTI(BinaryBasicBlock &BB, MCInst &Call) const override { + auto II = BB.getFirstNonPseudo(); + // Only check the first instruction for non-empty BasicBlocks + bool Empty = (II == BB.end()); + if (!Empty && isCallCoveredByBTI(Call, *II)) + return; + // A BLR can be accepted by a BTI c. + if (isIndirectCall(Call)) { + // if we have a BTI j at the start, extend it to a BTI jc, + // otherwise insert a new BTI c. + if (!Empty && isBTILandingPad(*II, false, true)) { + updateBTIVariant(*II, true, true); + } else { + MCInst BTIInst; + createBTI(BTIInst, true, false); + BB.insertInstruction(II, BTIInst); + } + } + + // A BR can be accepted by a BTI j or BTI c (and BTI jc) IF the operand is + // x16 or x17. If the operand is not x16 or x17, it can be accepted by a + // BTI j or BTI jc (and not BTI c). 
+    if (isIndirectBranch(Call)) {
+      assert(Call.getNumOperands() == 1 &&
+             "Indirect branch needs to have 1 operand.");
+      assert(Call.getOperand(0).isReg() &&
+             "Indirect branch does not have a register operand.");
+      MCPhysReg Reg = Call.getOperand(0).getReg();
+      if (Reg == AArch64::X16 || Reg == AArch64::X17) {
+        // Add a new BTI c
+        MCInst BTIInst;
+        createBTI(BTIInst, true, false);
+        BB.insertInstruction(II, BTIInst);
+      } else {
+        // If BB starts with a BTI c, extend it to BTI jc,
+        // otherwise insert a new BTI j.
+        if (!Empty && isBTILandingPad(*II, true, false)) {
+          updateBTIVariant(*II, true, true);
+        } else {
+          MCInst BTIInst;
+          createBTI(BTIInst, false, true);
+          BB.insertInstruction(II, BTIInst);
+        }
+      }
+    }
+  }
+
   InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx,
                                          MCPhysReg RegName,
                                          int64_t Addend = 0) const override {
diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp
index 7b6f1620a3f2c..e8323e87fe148 100644
--- a/bolt/unittests/Core/MCPlusBuilder.cpp
+++ b/bolt/unittests/Core/MCPlusBuilder.cpp
@@ -198,6 +198,122 @@ TEST_P(MCPlusBuilderTester, AArch64_BTI) {
   ASSERT_TRUE(BC->MIB->isImplicitBTIC(*II));
 }
 
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_empty) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X16);
+  BC->MIB->insertBTI(*BB, CallInst);
+  // Check that BTI c is added to the empty block.
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+}
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_0) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst Inst = MCInstBuilder(AArch64::RET).addReg(AArch64::LR);
+  BB->addInstruction(Inst);
+  // BR x16 needs BTI c or BTI j. We prefer adding a BTI c.
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X16);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_1) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst BTIc;
+  BC->MIB->createBTI(BTIc, true, false);
+  BB->addInstruction(BTIc);
+  // BR x16 needs BTI c or BTI j. We have a BTI c, no change is needed.
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X16);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_2) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst BTIc;
+  BC->MIB->createBTI(BTIc, true, false);
+  BB->addInstruction(BTIc);
+  // BR x5 needs BTI j
+  // we have BTI c -> extend it to BTI jc.
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X5);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, true));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_3) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst Inst = MCInstBuilder(AArch64::RET).addReg(AArch64::LR);
+  BB->addInstruction(Inst);
+  // BR x5 needs BTI j
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X5);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, false, true));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_4) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst Inst = MCInstBuilder(AArch64::RET).addReg(AArch64::LR);
+  BB->addInstruction(Inst);
+  // BLR needs BTI c, regardless of the register used.
+  MCInst CallInst = MCInstBuilder(AArch64::BLR).addReg(AArch64::X5);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_5) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst BTIj;
+  BC->MIB->createBTI(BTIj, false, true);
+  BB->addInstruction(BTIj);
+  // BLR needs BTI c, regardless of the register used.
+  // We have a BTI j -> extend it to BTI jc.
+  MCInst CallInst = MCInstBuilder(AArch64::BLR).addReg(AArch64::X5);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, true));
+}
+
+TEST_P(MCPlusBuilderTester, AArch64_insertBTI_6) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true);
+  std::unique_ptr<BinaryBasicBlock> BB = BF->createBasicBlock();
+  MCInst Paciasp =
+      MCInstBuilder(AArch64::PACIASP).addReg(AArch64::LR).addReg(AArch64::SP);
+  BB->addInstruction(Paciasp);
+  // PACI(AB)SP are implicit BTI c, no change needed.
+  MCInst CallInst = MCInstBuilder(AArch64::BR).addReg(AArch64::X17);
+  BC->MIB->insertBTI(*BB, CallInst);
+  auto II = BB->begin();
+  ASSERT_TRUE(BC->MIB->isBTILandingPad(*II, true, false));
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(*II));
+}
+
 TEST_P(MCPlusBuilderTester, AArch64_CmpJNE) {
   if (GetParam() != Triple::aarch64)
     GTEST_SKIP();

From 481ce8169539485fd506792f86c03329c50d4a06 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 11 Dec 2025 12:11:00 +0100
Subject: [PATCH 47/49] IR: Stop requiring nsz to reassociate fmul (#171726)

nsz can only change the behavior of the sign bit. The sign bit for fmul
can be implemented as xor, which is associative. DAGCombiner already
reassociates multiplies by two constants without nsz.
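
To see why only the sign bit is at stake, note that for non-NaN IEEE-754
values the sign of a product is just the XOR of the operand sign bits, and
XOR is associative. A small illustration (not from the patch; the helper name
is made up):

    #include <cmath>
    // For non-NaN floats, the sign bit of x*c1*c2 is signbit(x) ^ signbit(c1)
    // ^ signbit(c2) however the multiplies are grouped, so, unlike fadd,
    // reassociating fmul cannot be blocked by signed-zero concerns and the
    // 'reassoc' flag alone is sufficient.
    inline bool productSignBit(float x, float c1, float c2) {
      return std::signbit(x) ^ std::signbit(c1) ^ std::signbit(c2);
    }
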
Fixes #64967
---
 llvm/lib/IR/Instruction.cpp                      |  1 +
 .../InstCombine/2006-10-26-VectorReassoc.ll      |  6 ++----
 llvm/test/Transforms/InstCombine/fdiv.ll         |  3 +--
 .../InstCombine/issue64967-reassoc-fmul.ll       | 18 ++++++------------
 4 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 33ca46ca1c2c6..b95c1466871bc 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1271,6 +1271,7 @@ bool Instruction::isAssociative() const {
   switch (Opcode) {
   case FMul:
+    return cast<FPMathOperator>(this)->hasAllowReassoc();
   case FAdd:
     return cast<FPMathOperator>(this)->hasAllowReassoc() &&
            cast<FPMathOperator>(this)->hasNoSignedZeros();
diff --git a/llvm/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll b/llvm/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll
index fb860a5e7bdf3..6509797e0d3dc 100644
--- a/llvm/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll
+++ b/llvm/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll
@@ -35,12 +35,10 @@ define <4 x float> @test_fmul_reassoc_nsz(<4 x float> %V) {
 }
 
 ; (V * C1) * C2 => V * (C1 * C2)
-; TODO: This doesn't require 'nsz'. It should fold to V * { 1.0, 4.0e+05, -9.0, 16.0 }
 define <4 x float> @test_fmul_reassoc(<4 x float> %V) {
 ; CHECK-LABEL: @test_fmul_reassoc(
-; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc <4 x float> [[V:%.*]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>
-; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc <4 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+05, float -3.000000e+00, float 4.000000e+00>
-; CHECK-NEXT: ret <4 x float> [[TMP2]]
+; CHECK: [[TMP1:%.*]] = fmul reassoc <4 x float> %V, <float 1.000000e+00, float 4.000000e+05, float -9.000000e+00, float 1.600000e+01>
+; CHECK-NEXT: ret <4 x float> [[TMP1]]
 %Y = fmul reassoc <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
 %Z = fmul reassoc <4 x float> %Y, < float 1.000000e+00, float 2.000000e+05, float -3.000000e+00, float 4.000000e+00 >
 ret <4 x float> %Z
diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll
index 54b0bf8c50ac7..3465781e3af9d 100644
--- a/llvm/test/Transforms/InstCombine/fdiv.ll
+++ b/llvm/test/Transforms/InstCombine/fdiv.ll
@@ -525,8 +525,7 @@ define <2 x float> @div_constant_dividend2_reassoc_only(<2 x float> %x) {
 define <2 x float> @div_constant_dividend3(<2 x float> %x) {
 ; CHECK-LABEL: @div_constant_dividend3(
-; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc arcp <2 x float> [[X:%.*]],
-; CHECK-NEXT: [[T2:%.*]] = fmul reassoc arcp <2 x float> [[TMP1]],
+; CHECK-NEXT: [[T2:%.*]] = fmul reassoc arcp <2 x float> [[X:%.*]],
 ; CHECK-NEXT: ret <2 x float> [[T2]]
 ;
 %t1 = fdiv <2 x float> , %x
diff --git a/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll b/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll
index 16f9cf2dd64c5..5d064234bf609 100644
--- a/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll
+++ b/llvm/test/Transforms/InstCombine/issue64967-reassoc-fmul.ll
@@ -25,8 +25,7 @@ define float @fmul(float %x) {
 define float @fmul_reassoc(float %x) {
 ; CHECK-LABEL: define float @fmul_reassoc(
 ; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], 2.000000e+00
-; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], 4.000000e+00
+; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[X]], 8.000000e+00
 ; CHECK-NEXT: ret float [[FMUL1]]
 ;
 %fmul0 = fmul reassoc float %x, 2.0
@@ -37,8 +36,7 @@ define <2 x float> @fmul_reassoc_v2(<2 x float> %x) {
 ; CHECK-LABEL: define <2 x float> @fmul_reassoc_v2(
 ; CHECK-SAME: <2 x float> [[X:%.*]]) {
-; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc <2 x float> [[X]], splat (float 2.000000e+00)
-; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc <2 x float> [[FMUL0]], splat (float 4.000000e+00)
+; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc <2 x float> [[X]], splat (float 8.000000e+00)
 ; CHECK-NEXT: ret <2 x float> [[FMUL1]]
 ;
 %fmul0 = fmul reassoc <2 x float> %x, splat (float 2.0)
@@ -54,8 +52,7 @@ define <2 x float> @fmul_reassoc_v2(<2 x float> %x) {
 define float @fmul_reassoc_negative_0(float %x) {
 ; CHECK-LABEL: define float @fmul_reassoc_negative_0(
 ; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], 2.000000e+00
-; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], -4.000000e+00
+; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[X]], -8.000000e+00
 ; CHECK-NEXT: ret float [[FMUL1]]
 ;
 %fmul0 = fmul reassoc float %x, 2.0
@@ -71,8 +68,7 @@ define float @fmul_reassoc_negative_0(float %x) {
 define float @fmul_reassoc_negative_1(float %x) {
 ; CHECK-LABEL: define float @fmul_reassoc_negative_1(
 ; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], -2.000000e+00
-; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], 4.000000e+00
+; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[X]], -8.000000e+00
 ; CHECK-NEXT: ret float [[FMUL1]]
 ;
 %fmul0 = fmul reassoc float %x, -2.0
@@ -95,8 +91,7 @@ define float @fmul_reassoc_nsz(float %x) {
 define float @fmul_reassoc_posk_neg0(float %x) {
 ; CHECK-LABEL: define float @fmul_reassoc_posk_neg0(
 ; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], 4.000000e+00
-; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], -0.000000e+00
+; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[X]], -0.000000e+00
 ; CHECK-NEXT: ret float [[FMUL1]]
 ;
 %fmul0 = fmul reassoc float %x, 4.0
@@ -108,8 +103,7 @@ define float @fmul_reassoc_neg0_posk(float %x) {
 ; CHECK-LABEL: define float @fmul_reassoc_neg0_posk(
 ; CHECK-SAME: float [[X:%.*]]) {
 ; CHECK-NEXT: [[FMUL0:%.*]] = fmul reassoc float [[X]], -0.000000e+00
-; CHECK-NEXT: [[FMUL1:%.*]] = fmul reassoc float [[FMUL0]], 4.000000e+00
-; CHECK-NEXT: ret float [[FMUL1]]
+; CHECK-NEXT: ret float [[FMUL0]]
 ;
 %fmul0 = fmul reassoc float %x, -0.0
 %fmul1 = fmul reassoc float %fmul0, 4.0

From d0767e96f9c1f46de60ae6c592459f830af05cca Mon Sep 17 00:00:00 2001
From: anoopkg6
Date: Thu, 11 Dec 2025 05:11:50 -0600
Subject: [PATCH 48/49] [JITLink] Add TLS support for SystemZ (#171559)

This patch adds TLS support for SystemZ on top of orc-runtime support.
A separate orc-runtime support patch (#171062) has been split out from the
earlier TLS support patch #[170706](https://github.com/llvm/llvm-project/pull/170706).
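
For context, the new edge kinds correspond to the general-dynamic TLS sequence
the compiler emits on s390x. A hypothetical C++ source (an assumption, not part
of the patch) that produces the same x/y/z accesses as the hand-written
trivial-tls.S test when built as position-independent code:

    // Zero-initialised TLS goes to .tbss, non-zero TLS to .tdata; in the
    // general-dynamic model each access is expected to lower to
    // "brasl %r14, __tls_get_offset@PLT:tls_gdcall:<var>", which the ORC
    // runtime now resolves via ___orc_rt_elfnix_tls_get_offset.
    thread_local int x;       // .tbss
    thread_local int y = 1;   // .tdata
    thread_local int z = -1;  // .tdata
    int main() { return x + y + z; }  // expected to return 0
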
See conversations in [#170706](https://github.com/llvm/llvm-project/pull/170706) --------- Co-authored-by: anoopkg6 --- compiler-rt/lib/orc/CMakeLists.txt | 1 + compiler-rt/lib/orc/elfnix_tls.systemz.S | 42 ++++++++++++ .../orc/TestCases/Linux/systemz/trivial-tls.S | 67 +++++++++++++++++++ .../llvm/ExecutionEngine/JITLink/systemz.h | 15 +++++ .../ExecutionEngine/JITLink/ELF_systemz.cpp | 66 +++++++++++++++++- llvm/lib/ExecutionEngine/JITLink/systemz.cpp | 2 + .../ExecutionEngine/Orc/ELFNixPlatform.cpp | 5 ++ 7 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 compiler-rt/lib/orc/elfnix_tls.systemz.S create mode 100644 compiler-rt/test/orc/TestCases/Linux/systemz/trivial-tls.S diff --git a/compiler-rt/lib/orc/CMakeLists.txt b/compiler-rt/lib/orc/CMakeLists.txt index b8d1b03b788c9..649d988d9d608 100644 --- a/compiler-rt/lib/orc/CMakeLists.txt +++ b/compiler-rt/lib/orc/CMakeLists.txt @@ -119,6 +119,7 @@ else() # not Apple elfnix_tls.x86-64.S elfnix_tls.aarch64.S elfnix_tls.ppc64.S + elfnix_tls.systemz.S sysv_reenter.arm64.S sysv_reenter.x86-64.S ) diff --git a/compiler-rt/lib/orc/elfnix_tls.systemz.S b/compiler-rt/lib/orc/elfnix_tls.systemz.S new file mode 100644 index 0000000000000..4e116c92a5a88 --- /dev/null +++ b/compiler-rt/lib/orc/elfnix_tls.systemz.S @@ -0,0 +1,42 @@ +//===-- orc_rt_elfnix_tls_systemz.s -------------------------------*- ASM -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of the ORC runtime support library. +// +//===----------------------------------------------------------------------===// + +// The special thing about the s390 TLS ABI is that we do not have the +// standard __tls_get_addr function but the __tls_get_offset function +// which differs in two important aspects: +// 1) __tls_get_offset gets a got offset instead of a pointer to the +// tls_index structure +// 2) __tls_get_offset returns the offset of the requested variable to +// the thread descriptor instead of a pointer to the variable. + +// The content of this file is systemz-only + +#if defined(__s390x__) + + .text + // returns offset of TLV from TP in %r2. + .globl ___orc_rt_elfnix_tls_get_offset +___orc_rt_elfnix_tls_get_offset: + stmg %r14, %r15, 112(%r15) + aghi %r15, -160 + // Pass pointer to tls_index. + la %r2, 0(%r2, %r12) + brasl %r14, __orc_rt_elfnix_tls_get_addr_impl + // Return offset from TP. + ear %r0, %a0 + sllg %r0, %r0, 32 + ear %r0, %a1 + sgr %r2, %r0 + lmg %r14, %r15, 272(%r15) + br %r14 + +#endif // defined(__s390x__) diff --git a/compiler-rt/test/orc/TestCases/Linux/systemz/trivial-tls.S b/compiler-rt/test/orc/TestCases/Linux/systemz/trivial-tls.S new file mode 100644 index 0000000000000..4bf1c578bd1d7 --- /dev/null +++ b/compiler-rt/test/orc/TestCases/Linux/systemz/trivial-tls.S @@ -0,0 +1,67 @@ +// RUN: %clang -c -o %t %s +// RUN: %llvm_jitlink %t +// +// Test that basic ELF TLS work by adding together TLSs with values +// 0, 1, and -1, and returning the result (0 for success). This setup +// tests both zero-initialized (.tbss) and non-zero-initialized +// (.tdata) sections. 
+
+        .section .data.rel.ro,"aw",@progbits
+        .p2align 3, 0x0
+.LCPI0_0:
+        .quad x@TLSGD
+.LCPI0_1:
+        .quad y@TLSGD
+.LCPI0_2:
+        .quad z@TLSGD
+
+        .text
+        .globl main
+        .p2align 4
+        .type main,@function
+main:
+        stmg %r10, %r15, 80(%r15)
+        aghi %r15, -160
+        lgrl %r2, .LCPI0_0
+        larl %r12, _GLOBAL_OFFSET_TABLE_
+        brasl %r14, __tls_get_offset@PLT:tls_gdcall:x
+        lgr %r13, %r2
+        lgrl %r2, .LCPI0_1
+        brasl %r14, __tls_get_offset@PLT:tls_gdcall:y
+        ear %r0, %a0
+        sllg %r11, %r0, 32
+        ear %r11, %a1
+        l %r10, 0(%r2,%r11)
+        lgrl %r2, .LCPI0_2
+        a %r10, 0(%r13,%r11)
+        brasl %r14, __tls_get_offset@PLT:tls_gdcall:z
+        a %r10, 0(%r2,%r11)
+        lgfr %r2, %r10
+        lmg %r10, %r15, 240(%r15)
+        br %r14
+.Lfunc_end0:
+        .size main, .Lfunc_end0-main
+
+
+        .type x,@object                 # @x
+        .section .tbss,"awT",@nobits
+        .globl x
+        .p2align 2, 0x0
+x:
+        .long 0                         # 0x0
+        .size x, 4
+
+        .type y,@object                 # @y
+        .section .tdata,"awT",@progbits
+        .globl y
+        .p2align 2, 0x0
+y:
+        .long 1                         # 0x1
+        .size y, 4
+
+        .type z,@object                 # @z
+        .globl z
+        .p2align 2, 0x0
+z:
+        .long 4294967295                # 0xffffffff
+        .size z, 4
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h
index 09ec56db6826f..bfd22ec753074 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h
@@ -507,6 +507,21 @@ enum EdgeKind_systemz : Edge::Kind {
   ///
   RequestGOTAndTransformToDelta32dbl,
 
+  /// A TLSInfo entry getter/constructor, transformed to Delta64FromGOT.
+  ///
+  /// Indicates that this edge should be transformed into a Delta64FromGOT
+  /// targeting the TLSInfo entry for the edge's current target. A TLSInfo
+  /// entry for the target should be created if one does not already exist.
+  ///
+  /// Fixup expression:
+  ///   NONE
+  ///
+  /// Errors:
+  ///   - *ASSERTION* Failure to handle edges of this kind prior to the fixup
+  ///     phase will result in an assert/unreachable during the fixup phase.
+  ///
+  RequestTLSDescInGOTAndTransformToDelta64FromGOT,
+
   /// A 32-bit Delta to GOT base.
   ///
   /// Fixup expression:
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp
index 29eeecceea766..50acd6ea2e542 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp
@@ -27,12 +27,67 @@ using namespace llvm::jitlink;
 namespace {
 constexpr StringRef ELFGOTSymbolName = "_GLOBAL_OFFSET_TABLE_";
+constexpr StringRef ELFTLSInfoSectionName = "$__TLSINFO";
+
+// TLS Info Builder.
+class TLSInfoTableManager_ELF_systemz
+    : public TableManager<TLSInfoTableManager_ELF_systemz> {
+public:
+  static StringRef getSectionName() { return ELFTLSInfoSectionName; }
+
+  static const uint8_t TLSInfoEntryContent[16];
+
+  bool visitEdge(LinkGraph &G, Block *B, Edge &E) {
+    if (E.getKind() ==
+        systemz::RequestTLSDescInGOTAndTransformToDelta64FromGOT) {
+      LLVM_DEBUG({
+        dbgs() << "  Fixing " << G.getEdgeKindName(E.getKind()) << " edge at "
+               << formatv("{0:x}", B->getFixupAddress(E)) << " ("
+               << formatv("{0:x}", B->getAddress()) << " + "
+               << formatv("{0:x}", E.getOffset()) << ")\n";
+      });
+      E.setKind(systemz::Delta64FromGOT);
+      E.setTarget(getEntryForTarget(G, E.getTarget()));
+      return true;
+    }
+    return false;
+  }
+
+  Symbol &createEntry(LinkGraph &G, Symbol &Target) {
+    // The TLS Info entry's key value will be written by the fixTLVSectionByName
+    // pass, so create mutable content.
+    auto &TLSInfoEntry = G.createMutableContentBlock(
+        getTLSInfoSection(G), G.allocateContent(getTLSInfoEntryContent()),
+        orc::ExecutorAddr(), 8, 0);
+    TLSInfoEntry.addEdge(systemz::Pointer64, 8, Target, 0);
+    return G.addAnonymousSymbol(TLSInfoEntry, 0, 16, false, false);
+  }
+
+private:
+  Section &getTLSInfoSection(LinkGraph &G) {
+    if (!TLSInfoTable)
+      TLSInfoTable = &G.createSection(getSectionName(), orc::MemProt::Read);
+    return *TLSInfoTable;
+  }
+
+  ArrayRef<char> getTLSInfoEntryContent() const {
+    return {reinterpret_cast<const char *>(TLSInfoEntryContent),
+            sizeof(TLSInfoEntryContent)};
+  }
+
+  Section *TLSInfoTable = nullptr;
+};
+
+const uint8_t TLSInfoTableManager_ELF_systemz::TLSInfoEntryContent[16] = {
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
 
 Error buildTables_ELF_systemz(LinkGraph &G) {
   LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n");
   systemz::GOTTableManager GOT;
   systemz::PLTTableManager PLT(GOT);
-  visitExistingEdges(G, GOT, PLT);
+  TLSInfoTableManager_ELF_systemz TLSInfo;
+  visitExistingEdges(G, GOT, PLT, TLSInfo);
   return Error::success();
 }
@@ -329,6 +384,15 @@ class ELFLinkGraphBuilder_systemz
       Kind = systemz::Delta32dblGOTBase;
       break;
     }
+    // Tag for function call in general dynamic TLS code.
+    case ELF::R_390_TLS_GDCALL: {
+      break;
+    }
+    // Direct 64 bit for general dynamic thread local data.
+    case ELF::R_390_TLS_GD64: {
+      Kind = systemz::RequestTLSDescInGOTAndTransformToDelta64FromGOT;
+      break;
+    }
     default:
       return make_error<JITLinkError>(
           "In " + G->getName() + ": Unsupported systemz relocation type " +
diff --git a/llvm/lib/ExecutionEngine/JITLink/systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp
index f6cc29fa6e6a1..dbb924c3f9291 100644
--- a/llvm/lib/ExecutionEngine/JITLink/systemz.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp
@@ -104,6 +104,8 @@ const char *getEdgeKindName(Edge::Kind R) {
     return "RequestGOTAndTransformToDelta12FromGOT";
   case RequestGOTAndTransformToDelta32dbl:
     return "RequestGOTAndTransformToDelta32dbl";
+  case RequestTLSDescInGOTAndTransformToDelta64FromGOT:
+    return "RequestTLSDescInGOTAndTransformToDelta64FromGOT";
   default:
     return getGenericEdgeKindName(static_cast<Edge::Kind>(R));
   }
diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
index 7dc1ae520f132..0a761290373aa 100644
--- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
@@ -988,6 +988,7 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::fixTLVSectionsAndEdges(
     jitlink::LinkGraph &G, JITDylib &JD) {
   auto TLSGetAddrSymbolName = G.intern("__tls_get_addr");
   auto TLSDescResolveSymbolName = G.intern("__tlsdesc_resolver");
+  auto TLSGetOffsetSymbolName = G.intern("__tls_get_offset");
   for (auto *Sym : G.external_symbols()) {
     if (Sym->getName() == TLSGetAddrSymbolName) {
       auto TLSGetAddr =
@@ -997,6 +998,10 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::fixTLVSectionsAndEdges(
       auto TLSGetAddr =
           MP.getExecutionSession().intern("___orc_rt_elfnix_tlsdesc_resolver");
       Sym->setName(std::move(TLSGetAddr));
+    } else if (Sym->getName() == TLSGetOffsetSymbolName) {
+      auto TLSGetAddr =
+          MP.getExecutionSession().intern("___orc_rt_elfnix_tls_get_offset");
+      Sym->setName(std::move(TLSGetAddr));
     }
   }

From c89d87a512ed5b4bfeedb5d7ae22ae8cba1b123a Mon Sep 17 00:00:00 2001
From: Omair Javaid
Date: Thu, 11 Dec 2025 16:15:10 +0500
Subject: [PATCH 49/49] [lldb][test] Fix toolchain-msvc.test for native ARM64
 MSVC environment (#171797)

This patch fixes toolchain-msvc.test
on Windows ARM64 hosts running under native ARM64 environment via vcvarsarm64.bat. Our lab buildbot recently switched from using cross vcvarsamd64_arm64.bat environment to native vcvarsarm64.bat. This patch updates FileCheck patterns to also allow HostARM64 and arm64 PATH entries. Changes: -> Extend host regex to match HostARM64 (case-insensitive) -> Allow arm64 in PATH tail. -> Apply same fix in both 32-bit and 64-bit sections. --- lldb/test/Shell/BuildScript/toolchain-msvc.test | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lldb/test/Shell/BuildScript/toolchain-msvc.test b/lldb/test/Shell/BuildScript/toolchain-msvc.test index dce87d5aee2af..bde895fa6dbaf 100644 --- a/lldb/test/Shell/BuildScript/toolchain-msvc.test +++ b/lldb/test/Shell/BuildScript/toolchain-msvc.test @@ -23,15 +23,15 @@ RUN: | FileCheck --check-prefix=64BIT %s 32BIT: Cleaning {{.*}}toolchain-msvc.test.tmp\foo.pdb 32BIT: Cleaning {{.*}}toolchain-msvc.test.tmp\foo.exe 32BIT: compiling foobar.c -> foo.exe-foobar.obj -32BIT: Command Line: {{.*}}\{{[Hh]ost[Xx](64|86)}}\{{(x86|arm)}}\cl.{{EXE|exe}} +32BIT: Command Line: {{.*}}\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x86|arm)}}\cl.{{EXE|exe}} 32BIT: linking foo.exe-foobar.obj -> foo.exe -32BIT: Command Line: {{.*}}\{{[Hh]ost[Xx](64|86)}}\{{(x86|arm)}}\link.{{EXE|exe}} +32BIT: Command Line: {{.*}}\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x86|arm)}}\link.{{EXE|exe}} 32BIT: Env 32BIT: LIB = {{.*}}\ATLMFC\lib\{{(x86|arm)}} 32BIT: {{.*}}\lib\{{(x86|arm)}} 32BIT: {{.*}}\ucrt\{{(x86|arm)}} 32BIT: {{.*}}\um\{{(x86|arm)}} -32BIT: PATH = {{.*}}\bin\{{[Hh]ost[Xx](64|86)}}\{{(x86|x64)}} +32BIT: PATH = {{.*}}\bin\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x86|x64|arm64)}} 64BIT: Script Arguments: @@ -51,12 +51,12 @@ RUN: | FileCheck --check-prefix=64BIT %s 64BIT: Cleaning {{.*}}toolchain-msvc.test.tmp\foo.pdb 64BIT: Cleaning {{.*}}toolchain-msvc.test.tmp\foo.exe 64BIT: compiling foobar.c -> foo.exe-foobar.obj -64BIT: Command Line: {{.*}}\{{[Hh]ost[Xx](64|86)}}\{{(x64|arm64)}}\cl.{{EXE|exe}} +64BIT: Command Line: {{.*}}\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x64|arm64)}}\cl.{{EXE|exe}} 64BIT: linking foo.exe-foobar.obj -> foo.exe -64BIT: Command Line: {{.*}}\{{[Hh]ost[Xx](64|86)}}\{{(x64|arm64)}}\link.{{EXE|exe}} +64BIT: Command Line: {{.*}}\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x64|arm64)}}\link.{{EXE|exe}} 64BIT: Env 64BIT: LIB = {{.*}}\ATLMFC\lib\{{(x64|arm64)}} 64BIT: {{.*}}\lib\{{(x64|arm64)}} 64BIT: {{.*}}\ucrt\{{(x64|arm64)}} 64BIT: {{.*}}\um\{{(x64|arm64)}} -64BIT: PATH = {{.*}}\bin\{{[Hh]ost[Xx](64|86)}}\{{(x86|x64)}} +64BIT: PATH = {{.*}}\bin\{{[Hh]ost([Xx](64|86)|(arm64|ARM64))}}\{{(x86|x64|arm64)}}
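
For illustration, under a native ARM64 toolchain the compiler/linker command
lines and PATH entries take roughly the following shape (example paths, not
taken from the bot logs), which the old [Hh]ost[Xx](64|86) patterns could not
match:

    ...\VC\Tools\MSVC\<version>\bin\Hostarm64\arm64\cl.exe
    PATH = ...\VC\Tools\MSVC\<version>\bin\Hostarm64\arm64;...

The extended {{[Hh]ost([Xx](64|86)|(arm64|ARM64))}} and {{(x86|x64|arm64)}}
groups accept these while still matching the existing HostX64/HostX86 layouts.
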