From 10c69e3b10a98c4f370863daf482c84c7e7cbaed Mon Sep 17 00:00:00 2001 From: lalaniket8 Date: Mon, 8 Dec 2025 16:55:06 +0530 Subject: [PATCH 1/4] Wave Transform should generate non SSA Exec mask manipulation instrs --- .../llvm/CodeGen/MachineRegisterInfo.h | 5 + llvm/lib/CodeGen/MachineRegisterInfo.cpp | 14 +++ .../lib/Target/AMDGPU/AMDGPUWaveTransform.cpp | 43 ++++--- llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp | 113 ++++++++++-------- llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h | 17 +-- 5 files changed, 118 insertions(+), 74 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 737b74ef3f761..7195f75ddc69b 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -617,6 +617,11 @@ class MachineRegisterInfo { /// form, so there should only be one definition. LLVM_ABI MachineInstr *getVRegDef(Register Reg) const; + /// getDomVRegDefInBasicBlock - Return the last machine instr that defines + /// the specified virtual register in the basic block, searching backwards + /// from instruction I (exclusive). Returns MBB.end() if no definition is found. + LLVM_ABI MachineBasicBlock::iterator getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + /// getUniqueVRegDef - Return the unique machine instr that defines the /// specified virtual register or null if none is found. If there are /// multiple definitions or no definition, return null. 
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 094315b3903ea..93bddc7c33928 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -674,3 +674,17 @@ bool MachineRegisterInfo::isReservedRegUnit(MCRegUnit Unit) const { } return false; } + +/// getDomVRegDefInBasicBlock - Return the last machine instr that defines +/// the specified virtual register in the basic block, searching backwards +/// from instruction I (exclusive). Returns MBB.end() if no definition is found. +MachineBasicBlock::iterator MachineRegisterInfo::getDomVRegDefInBasicBlock( + Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + if(I == MBB.begin()) return MBB.end(); + // Iterate backwards from I (exclusive) to the beginning of the basic block + do { + --I; + if (I->modifiesRegister(Reg, getTargetRegisterInfo())) return I; + } while (I != MBB.begin()); + return MBB.end(); +} \ No newline at end of file diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp index ceda928f202f5..21da6f44fe8e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp @@ -1839,7 +1839,7 @@ void ControlFlowRewriter::rewrite() { Opcode = AMDGPU::S_CBRANCH_SCC1; } else { Register CondReg = Info.OrigCondition; - if (!LMA.isSubsetOfExec(CondReg, *Node->Block)) { + if (!LMA.isSubsetOfExec(CondReg, *Node->Block, Node->Block->end())) { CondReg = LMU.createLaneMaskReg(); BuildMI(*Node->Block, Node->Block->end(), {}, TII.get(LMC.AndOpc), CondReg) @@ -1937,7 +1937,7 @@ void ControlFlowRewriter::rewrite() { } } else { CondReg = LaneOrigin.CondReg; - if (!LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block)) { + if (!LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block, LaneOrigin.Node->Block->getFirstTerminator())) { Register Prev = CondReg; CondReg = 
LMU.createLaneMaskReg(); BuildMI(*LaneOrigin.Node->Block, @@ -2033,28 +2033,33 @@ void ControlFlowRewriter::rewrite() { CFGNodeInfo &PredInfo = NodeInfo.find(Pred)->second; Register PrimaryExec = PredInfo.PrimarySuccessorExec; - MachineInstr *PrimaryExecDef; - for (;;) { - PrimaryExecDef = MRI.getVRegDef(PrimaryExec); - if (PrimaryExecDef->getOpcode() != AMDGPU::COPY) - break; - PrimaryExec = PrimaryExecDef->getOperand(1).getReg(); - } + //Turning off this copy-chain optimization to retain the Accumulator as the PrimaryExec + + // MachineInstr *PrimaryExecDef; + // for (;;) { + // PrimaryExecDef = MRI.getVRegDef(PrimaryExec); + // if (PrimaryExecDef->getOpcode() != AMDGPU::COPY) + // break; + // PrimaryExec = PrimaryExecDef->getOperand(1).getReg(); + // } // Rejoin = EXEC ^ PrimaryExec // // Fold immediately if PrimaryExec was obtained via XOR as well. Register Rejoin; - if (PrimaryExecDef->getParent() == Pred->Block && - PrimaryExecDef->getOpcode() == LMC.XorOpc && - PrimaryExecDef->getOperand(1).isReg() && - PrimaryExecDef->getOperand(2).isReg()) { - if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg) - Rejoin = PrimaryExecDef->getOperand(2).getReg(); - else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg) - Rejoin = PrimaryExecDef->getOperand(1).getReg(); - } + //Turning off this XOR optimiztion since buildMergeLaneMasks() will not + // introduce XOR instruction for creating the PrimaryExec + + // if (PrimaryExecDef->getParent() == Pred->Block && + // PrimaryExecDef->getOpcode() == LMC.XorOpc && + // PrimaryExecDef->getOperand(1).isReg() && + // PrimaryExecDef->getOperand(2).isReg()) { + // if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg) + // Rejoin = PrimaryExecDef->getOperand(2).getReg(); + // else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg) + // Rejoin = PrimaryExecDef->getOperand(1).getReg(); + // } if (!Rejoin) { // Try to find a previously generated XOR (or merely masked) value @@ -2091,7 +2096,7 @@ void 
ControlFlowRewriter::rewrite() { LLVM_DEBUG(Function.dump()); } - + Updater.insertAccumulatorResets(); Updater.cleanup(); } diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp index d7b19cbe745a8..a31b6e361be07 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp @@ -31,13 +31,12 @@ bool GCNLaneMaskUtils::maybeLaneMask(Register Reg) const { /// Determine whether the lane-mask register \p Reg is a wave-wide constant. /// If so, the value is stored in \p Val. -bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val) const { +bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { MachineRegisterInfo &MRI = MF.getRegInfo(); - const MachineInstr *MI; for (;;) { - MI = MRI.getVRegDef(Reg); - if (!MI) { + MI = MRI.getDomVRegDefInBasicBlock(Reg, MBB, MI); + if (MI == MBB.end()) { // This can happen when called from GCNLaneMaskUpdater, where Reg can // be a placeholder that has not yet been filled in. return false; @@ -100,18 +99,20 @@ Register GCNLaneMaskUtils::createLaneMaskReg() const { /// properly masked, i.e. use PrevReg directly instead of /// (PrevReg & ~EXEC), and don't add extra 1-bits to DstReg /// beyond (CurReg & EXEC). +/// \param isPrevZeroReg Indicates that PrevReg is a zero register. 
void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA, - bool accumulating) const { + bool accumulating, + bool isPrevZeroReg) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); bool PrevVal = false; - bool PrevConstant = !PrevReg || isConstantLaneMask(PrevReg, PrevVal); + bool PrevConstant = !PrevReg || isPrevZeroReg; bool CurVal = false; - bool CurConstant = isConstantLaneMask(CurReg, CurVal); + bool CurConstant = isConstantLaneMask(CurReg, CurVal, MBB, I); assert(PrevReg || !accumulating); @@ -147,7 +148,7 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, } if (!CurConstant) { if ((PrevConstant && PrevVal) || - (LMA && LMA->isSubsetOfExec(CurReg, MBB))) { + (LMA && LMA->isSubsetOfExec(CurReg, MBB, I))) { CurMaskedReg = CurReg; } else { CurMaskedReg = createLaneMaskReg(); @@ -188,22 +189,26 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, /// (Reg & EXEC) == Reg when used in \p UseBlock. 
bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, + MachineBasicBlock::iterator I, unsigned RemainingDepth) { MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); - MachineInstr *DefInstr = nullptr; + MachineBasicBlock::iterator DefInstr = UseBlock.end(); const AMDGPU::LaneMaskConstants &LMC = LMU.getLaneMaskConsts(); for (;;) { if (!Register::isVirtualRegister(Reg)) { if (Reg == LMC.ExecReg && - (!DefInstr || DefInstr->getParent() == &UseBlock)) + (DefInstr == UseBlock.end() || DefInstr->getParent() == &UseBlock)) return true; return false; } - DefInstr = MRI.getVRegDef(Reg); + DefInstr = MRI.getDomVRegDefInBasicBlock(Reg, UseBlock, I); + if(DefInstr == UseBlock.end()) + return false; if (DefInstr->getOpcode() == AMDGPU::COPY) { Reg = DefInstr->getOperand(1).getReg(); + I = DefInstr; continue; } @@ -242,7 +247,7 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, if ((LikeOr || IsAnd || IsAndN2) && (DefInstr->getOperand(1).isReg() && DefInstr->getOperand(2).isReg())) { bool FirstIsSubset = isSubsetOfExec(DefInstr->getOperand(1).getReg(), - UseBlock, RemainingDepth); + UseBlock, DefInstr, RemainingDepth); if (!FirstIsSubset && (LikeOr || IsAndN2)) return SubsetOfExec.try_emplace(Reg, false).first->second; @@ -252,7 +257,7 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, } bool SecondIsSubset = isSubsetOfExec(DefInstr->getOperand(2).getReg(), - UseBlock, RemainingDepth); + UseBlock, DefInstr, RemainingDepth); if (!SecondIsSubset) return SubsetOfExec.try_emplace(Reg, false).first->second; @@ -268,14 +273,14 @@ void GCNLaneMaskUpdater::init(Register Reg) { Processed = false; Blocks.clear(); // SSAUpdater.Initialize(LMU.getLaneMaskConsts().LaneMaskRC); - SSAUpdater.Initialize(Reg); + Accumulator = {}; } /// Optional cleanup, may remove stray instructions. 
void GCNLaneMaskUpdater::cleanup() { Processed = false; Blocks.clear(); - + Accumulator = {}; MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); if (ZeroReg && MRI.use_empty(ZeroReg)) { @@ -330,7 +335,7 @@ void GCNLaneMaskUpdater::addAvailable(MachineBasicBlock &Block, Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) { if (!Processed) process(); - return SSAUpdater.GetValueInMiddleOfBlock(&Block); + return Accumulator; } /// Return the value at the end of the given block, i.e. after any change that @@ -342,7 +347,7 @@ Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) { Register GCNLaneMaskUpdater::getValueAtEndOfBlock(MachineBasicBlock &Block) { if (!Processed) process(); - return SSAUpdater.GetValueAtEndOfBlock(&Block); + return Accumulator; } /// Return the value in \p Block after the value merge (if any). @@ -352,15 +357,15 @@ Register GCNLaneMaskUpdater::getValueAfterMerge(MachineBasicBlock &Block) { auto BlockIt = findBlockInfo(Block); if (BlockIt != Blocks.end()) { - if (BlockIt->Merged) - return BlockIt->Merged; + if (BlockIt->Value) + return Accumulator; if (BlockIt->Flags & ResetInMiddle) return ZeroReg; } // We didn't merge anything in the block, but the block may still be // ResetAtEnd, in which case we need the pre-reset value. - return SSAUpdater.GetValueInMiddleOfBlock(&Block); + return Accumulator; } /// Determine whether \p MI defines and/or uses SCC. @@ -422,22 +427,22 @@ void GCNLaneMaskUpdater::process() { .addImm(0); } - // Add available values. + if (!Accumulator) { + Accumulator = LMU.createLaneMaskReg(); + BuildMI(Entry, Entry.getFirstTerminator(), {}, + TII->get(LMU.getLaneMaskConsts().MovOpc), Accumulator) + .addImm(0); + } + + // Reset accumulator. 
for (BlockInfo &Info : Blocks) { assert(Accumulating || !Info.Flags); assert(Info.Flags || Info.Value); - if (Info.Value) - Info.Merged = LMU.createLaneMaskReg(); - - SSAUpdater.AddAvailableValue( - Info.Block, - (Info.Value && !(Info.Flags & ResetAtEnd)) ? Info.Merged : ZeroReg); + if(!Info.Value || (Info.Flags & ResetAtEnd)) + AccumulatorResetBlocks[Info.Block].insert(Accumulator); } - if (Accumulating && !SSAUpdater.HasValueForBlock(&Entry)) - SSAUpdater.AddAvailableValue(&Entry, ZeroReg); - // Once the SSA updater is ready, we can fill in all merge code, relying // on the SSA updater to insert required PHIs. for (BlockInfo &Info : Blocks) { @@ -448,11 +453,8 @@ void GCNLaneMaskUpdater::process() { Register Previous; if (Info.Block != &LMU.function()->front() && !(Info.Flags & ResetInMiddle)) { - Previous = SSAUpdater.GetValueInMiddleOfBlock(Info.Block); - if (Accumulating) { - assert(!MRI.getVRegDef(Previous) || - MRI.getVRegDef(Previous)->getOpcode() != AMDGPU::IMPLICIT_DEF); - } else { + Previous = Accumulator; + if (!Accumulating) { MachineInstr *PrevInstr = MRI.getVRegDef(Previous); if (PrevInstr && PrevInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) { PotentiallyDead.insert(PrevInstr); @@ -466,18 +468,20 @@ void GCNLaneMaskUpdater::process() { // Insert merge logic. 
MachineBasicBlock::iterator insertPt = getSaluInsertionAtEnd(*Info.Block); - LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Info.Merged, Previous, - Info.Value, LMA, Accumulating); - - if (Info.Flags & ResetAtEnd) { - MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged); - if (mergeInstr->getOpcode() == AMDGPU::COPY && - mergeInstr->getOperand(1).getReg().isVirtual()) { - assert(MRI.use_empty(Info.Merged)); - Info.Merged = mergeInstr->getOperand(1).getReg(); - mergeInstr->eraseFromParent(); - } - } + LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Accumulator, Previous, + Info.Value, LMA, Accumulating, Previous == ZeroReg); + + + // Switching off this optimization, since Accumulator will always have a use + // if (Info.Flags & ResetAtEnd) { + // MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged); + // if (mergeInstr->getOpcode() == AMDGPU::COPY && + // mergeInstr->getOperand(1).getReg().isVirtual()) { + // assert(MRI.use_empty(Info.Merged)); + // Info.Merged = mergeInstr->getOperand(1).getReg(); + // mergeInstr->eraseFromParent(); + // } + // } } Processed = true; @@ -489,3 +493,18 @@ GCNLaneMaskUpdater::findBlockInfo(MachineBasicBlock &Block) { return llvm::find_if( Blocks, [&](const auto &Entry) { return Entry.Block == &Block; }); } + +void GCNLaneMaskUpdater::insertAccumulatorResets() { + const SIInstrInfo *TII = LMU.function()->getSubtarget().getInstrInfo(); + for (auto &Entry : AccumulatorResetBlocks) { + MachineBasicBlock *B = Entry.first; + DenseSet &Accumulators = Entry.second; + for (Register ACC : Accumulators) { + //get first branch instruction + MachineBasicBlock::iterator I = B->getFirstTerminator(); + while(I != B->end() && !I->isBranch()) I++; + if(I == B->end()) I--; + BuildMI(*B, I, {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ACC).addImm(0); + } + } +} diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h index f4419f139d92c..255dd8ff89080 100644 --- 
a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h @@ -43,14 +43,15 @@ class GCNLaneMaskUtils { const AMDGPU::LaneMaskConstants &getLaneMaskConsts() const { return LMC; } bool maybeLaneMask(Register Reg) const; - bool isConstantLaneMask(Register Reg, bool &Val) const; + bool isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; Register createLaneMaskReg() const; void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA = nullptr, - bool Accumulating = false) const; + bool Accumulating = false, + bool isPrevZeroReg = false) const; }; /// Lazy analyses of lane masks. @@ -63,7 +64,7 @@ class GCNLaneMaskAnalysis { public: GCNLaneMaskAnalysis(MachineFunction &MF) : LMU(MF) {} - bool isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, + bool isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, MachineBasicBlock::iterator I, unsigned RemainingDepth = 5); }; @@ -105,7 +106,6 @@ class GCNLaneMaskUpdater { private: GCNLaneMaskUtils LMU; GCNLaneMaskAnalysis *LMA = nullptr; - MachineSSAUpdater SSAUpdater; bool Accumulating = false; @@ -115,7 +115,6 @@ class GCNLaneMaskUpdater { MachineBasicBlock *Block; unsigned Flags = 0; // ResetFlags Register Value; - Register Merged; explicit BlockInfo(MachineBasicBlock *Block) : Block(Block) {} }; @@ -124,9 +123,11 @@ class GCNLaneMaskUpdater { Register ZeroReg; DenseSet PotentiallyDead; - + DenseMap> AccumulatorResetBlocks; public: - GCNLaneMaskUpdater(MachineFunction &MF) : LMU(MF), SSAUpdater(MF) {} + Register Accumulator; + + GCNLaneMaskUpdater(MachineFunction &MF) : LMU(MF) {} void setLaneMaskAnalysis(GCNLaneMaskAnalysis *Analysis) { LMA = Analysis; } @@ -141,7 +142,7 @@ class GCNLaneMaskUpdater { Register getValueInMiddleOfBlock(MachineBasicBlock &Block); Register getValueAtEndOfBlock(MachineBasicBlock &Block); 
Register getValueAfterMerge(MachineBasicBlock &Block); - + void insertAccumulatorResets(); private: void process(); SmallVectorImpl::iterator findBlockInfo(MachineBasicBlock &Block); From 54b5f3b1857989676755e0a0e6d757a31077c6f3 Mon Sep 17 00:00:00 2001 From: lalaniket8 Date: Tue, 9 Dec 2025 16:28:33 +0530 Subject: [PATCH 2/4] sanitized with git clang format and minor fixes --- .../llvm/CodeGen/MachineRegisterInfo.h | 7 ++-- llvm/lib/CodeGen/MachineRegisterInfo.cpp | 13 ++++--- .../lib/Target/AMDGPU/AMDGPUWaveTransform.cpp | 10 +++--- llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp | 36 ++++++++++--------- llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h | 12 ++++--- 5 files changed, 44 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 7195f75ddc69b..a7ac65287284b 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -619,8 +619,11 @@ class MachineRegisterInfo { /// getDomVRegDefInBasicBlock - Return the last machine instr that defines /// the specified virtual register in the basic block, searching backwards - /// from instruction I (exclusive). Returns MBB.end() if no definition is found. - LLVM_ABI MachineBasicBlock::iterator getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + /// from instruction I (exclusive). Returns MBB.end() if no definition is + /// found. + LLVM_ABI MachineBasicBlock::iterator + getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; /// getUniqueVRegDef - Return the unique machine instr that defines the /// specified virtual register or null if none is found. 
If there are diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 93bddc7c33928..d2dea61f138bb 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -675,16 +675,15 @@ bool MachineRegisterInfo::isReservedRegUnit(MCRegUnit Unit) const { return false; } -/// getDomVRegDefInBasicBlock - Return the last machine instr that defines -/// the specified virtual register in the basic block, searching backwards -/// from instruction I (exclusive). Returns MBB.end() if no definition is found. MachineBasicBlock::iterator MachineRegisterInfo::getDomVRegDefInBasicBlock( - Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - if(I == MBB.begin()) return MBB.end(); + Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + if (I == MBB.begin()) + return MBB.end(); // Iterate backwards from I (exclusive) to the beginning of the basic block do { --I; - if (I->modifiesRegister(Reg, getTargetRegisterInfo())) return I; + if (I->modifiesRegister(Reg, getTargetRegisterInfo())) + return I; } while (I != MBB.begin()); return MBB.end(); -} \ No newline at end of file +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp index 21da6f44fe8e7..7cbc11b1a5d85 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp @@ -1937,7 +1937,8 @@ void ControlFlowRewriter::rewrite() { } } else { CondReg = LaneOrigin.CondReg; - if (!LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block, LaneOrigin.Node->Block->getFirstTerminator())) { + if (!LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block, + LaneOrigin.Node->Block->getFirstTerminator())) { Register Prev = CondReg; CondReg = LMU.createLaneMaskReg(); BuildMI(*LaneOrigin.Node->Block, @@ -2033,7 +2034,8 @@ void ControlFlowRewriter::rewrite() { CFGNodeInfo &PredInfo = 
NodeInfo.find(Pred)->second; Register PrimaryExec = PredInfo.PrimarySuccessorExec; - //Turning off this copy-chain optimization to retain the Accumulator as the PrimaryExec + // Turning off this copy-chain optimization to retain the Accumulator as + // the PrimaryExec // MachineInstr *PrimaryExecDef; // for (;;) { @@ -2048,8 +2050,8 @@ void ControlFlowRewriter::rewrite() { // Fold immediately if PrimaryExec was obtained via XOR as well. Register Rejoin; - //Turning off this XOR optimiztion since buildMergeLaneMasks() will not - // introduce XOR instruction for creating the PrimaryExec + // Turning off this XOR optimization since buildMergeLaneMasks() will not + // introduce an XOR instruction for creating the PrimaryExec // if (PrimaryExecDef->getParent() == Pred->Block && // PrimaryExecDef->getOpcode() == LMC.XorOpc && diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp index a31b6e361be07..1aeeef609802c 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp @@ -31,7 +31,9 @@ bool GCNLaneMaskUtils::maybeLaneMask(Register Reg) const { /// Determine whether the lane-mask register \p Reg is a wave-wide constant. /// If so, the value is stored in \p Val. -bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { +bool GCNLaneMaskUtils::isConstantLaneMask( + Register Reg, bool &Val, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { MachineRegisterInfo &MRI = MF.getRegInfo(); for (;;) { @@ -100,13 +102,10 @@ Register GCNLaneMaskUtils::createLaneMaskReg() const { /// (PrevReg & ~EXEC), and don't add extra 1-bits to DstReg /// beyond (CurReg & EXEC). /// \param isPrevZeroReg Indicates that PrevReg is a zero register.
-void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const DebugLoc &DL, Register DstReg, - Register PrevReg, Register CurReg, - GCNLaneMaskAnalysis *LMA, - bool accumulating, - bool isPrevZeroReg) const { +void GCNLaneMaskUtils::buildMergeLaneMasks( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, + Register DstReg, Register PrevReg, Register CurReg, + GCNLaneMaskAnalysis *LMA, bool accumulating, bool isPrevZeroReg) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); bool PrevVal = false; @@ -198,13 +197,13 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, for (;;) { if (!Register::isVirtualRegister(Reg)) { if (Reg == LMC.ExecReg && - (DefInstr == UseBlock.end() || DefInstr->getParent() == &UseBlock)) + (DefInstr == UseBlock.end() || DefInstr->getParent() == &UseBlock)) return true; return false; } DefInstr = MRI.getDomVRegDefInBasicBlock(Reg, UseBlock, I); - if(DefInstr == UseBlock.end()) + if (DefInstr == UseBlock.end()) return false; if (DefInstr->getOpcode() == AMDGPU::COPY) { Reg = DefInstr->getOperand(1).getReg(); @@ -439,7 +438,7 @@ void GCNLaneMaskUpdater::process() { assert(Accumulating || !Info.Flags); assert(Info.Flags || Info.Value); - if(!Info.Value || (Info.Flags & ResetAtEnd)) + if (!Info.Value || (Info.Flags & ResetAtEnd)) AccumulatorResetBlocks[Info.Block].insert(Accumulator); } @@ -471,7 +470,6 @@ void GCNLaneMaskUpdater::process() { LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Accumulator, Previous, Info.Value, LMA, Accumulating, Previous == ZeroReg); - // Switching off this optimization, since Accumulator will always have a use // if (Info.Flags & ResetAtEnd) { // MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged); @@ -495,16 +493,20 @@ GCNLaneMaskUpdater::findBlockInfo(MachineBasicBlock &Block) { } void GCNLaneMaskUpdater::insertAccumulatorResets() { - const SIInstrInfo *TII = 
LMU.function()->getSubtarget().getInstrInfo(); + const SIInstrInfo *TII = + LMU.function()->getSubtarget().getInstrInfo(); for (auto &Entry : AccumulatorResetBlocks) { MachineBasicBlock *B = Entry.first; DenseSet &Accumulators = Entry.second; for (Register ACC : Accumulators) { - //get first branch instruction + // Get first branch instruction. MachineBasicBlock::iterator I = B->getFirstTerminator(); - while(I != B->end() && !I->isBranch()) I++; - if(I == B->end()) I--; - BuildMI(*B, I, {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ACC).addImm(0); + while (I != B->end() && !I->isBranch()) + I++; + if (I == B->end()) + I--; + BuildMI(*B, I, {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ACC) + .addImm(0); } } } diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h index 255dd8ff89080..3fdb3b277cf06 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h @@ -43,14 +43,15 @@ class GCNLaneMaskUtils { const AMDGPU::LaneMaskConstants &getLaneMaskConsts() const { return LMC; } bool maybeLaneMask(Register Reg) const; - bool isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + bool isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; Register createLaneMaskReg() const; void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA = nullptr, - bool Accumulating = false, + bool Accumulating = false, bool isPrevZeroReg = false) const; }; @@ -64,7 +65,8 @@ class GCNLaneMaskAnalysis { public: GCNLaneMaskAnalysis(MachineFunction &MF) : LMU(MF) {} - bool isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, MachineBasicBlock::iterator I, + bool isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, + MachineBasicBlock::iterator I, unsigned RemainingDepth = 5); }; 
@@ -123,7 +125,8 @@ class GCNLaneMaskUpdater { Register ZeroReg; DenseSet PotentiallyDead; - DenseMap> AccumulatorResetBlocks; + DenseMap> AccumulatorResetBlocks; + public: Register Accumulator; @@ -143,6 +146,7 @@ class GCNLaneMaskUpdater { Register getValueAtEndOfBlock(MachineBasicBlock &Block); Register getValueAfterMerge(MachineBasicBlock &Block); void insertAccumulatorResets(); + private: void process(); SmallVectorImpl::iterator findBlockInfo(MachineBasicBlock &Block); From 7a31f7f5373a7ea68cd9565d94840535034c7ef4 Mon Sep 17 00:00:00 2001 From: lalaniket8 Date: Wed, 10 Dec 2025 14:56:06 +0530 Subject: [PATCH 3/4] Removed default mode and cleanup --- .../lib/Target/AMDGPU/AMDGPUWaveTransform.cpp | 1 - llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp | 68 ++++++------------- llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h | 23 ++----- 3 files changed, 26 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp index 7cbc11b1a5d85..96d197c096aa9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp @@ -1876,7 +1876,6 @@ void ControlFlowRewriter::rewrite() { RegMap; GCNLaneMaskUpdater Updater(Function); Updater.setLaneMaskAnalysis(&LMA); - Updater.setAccumulating(true); for (WaveNode *LaneTarget : NodeOrder) { CFGNodeInfo &LaneTargetInfo = NodeInfo.find(LaneTarget)->second; diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp index 1aeeef609802c..8b5b915473570 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp @@ -84,28 +84,25 @@ Register GCNLaneMaskUtils::createLaneMaskReg() const { /// Insert the moral equivalent of /// -/// DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC) +/// DstReg = PrevReg | (CurReg & EXEC) /// /// before \p I in basic block \p MBB. 
Some simplifications are applied on the -/// fly based on constant inputs and analysis via \p LMA, and further -/// simplifications can be requested in "accumulating" mode. +/// fly based on constant inputs and analysis via \p LMA /// /// \param DstReg The virtual register into which the merged mask is written. /// \param PrevReg The virtual register with the "previous" lane mask value; -/// may be null to indicate an undef value. +/// may be ZeroReg or Accumulator. /// \param CurReg The virtual register with the "current" lane mask value to /// be merged into "previous". /// \param LMA If non-null, used to test whether CurReg may already be a subset /// of EXEC. -/// \param accumulating Indicates that we should assume PrevReg is already -/// properly masked, i.e. use PrevReg directly instead of -/// (PrevReg & ~EXEC), and don't add extra 1-bits to DstReg -/// beyond (CurReg & EXEC). /// \param isPrevZeroReg Indicates that PrevReg is a zero register. -void GCNLaneMaskUtils::buildMergeLaneMasks( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - Register DstReg, Register PrevReg, Register CurReg, - GCNLaneMaskAnalysis *LMA, bool accumulating, bool isPrevZeroReg) const { +void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, Register DstReg, + Register PrevReg, Register CurReg, + GCNLaneMaskAnalysis *LMA, + bool isPrevZeroReg) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); bool PrevVal = false; @@ -113,7 +110,7 @@ void GCNLaneMaskUtils::buildMergeLaneMasks( bool CurVal = false; bool CurConstant = isConstantLaneMask(CurReg, CurVal, MBB, I); - assert(PrevReg || !accumulating); + assert(PrevReg); if (PrevConstant && CurConstant) { if (PrevVal == CurVal) { @@ -135,15 +132,7 @@ void GCNLaneMaskUtils::buildMergeLaneMasks( Register PrevMaskedReg; Register CurMaskedReg; if (!PrevConstant) { - if (accumulating || (CurConstant && 
CurVal)) { - PrevMaskedReg = PrevReg; - } else { - PrevMaskedReg = createLaneMaskReg(); - PrevMaskedBuilt = - BuildMI(MBB, I, DL, TII->get(LMC.AndN2Opc), PrevMaskedReg) - .addReg(PrevReg) - .addReg(LMC.ExecReg); - } + PrevMaskedReg = PrevReg; } if (!CurConstant) { if ((PrevConstant && PrevVal) || @@ -157,8 +146,7 @@ void GCNLaneMaskUtils::buildMergeLaneMasks( } } - // TODO-NOW: reevaluate the masking logic in case of CurConstant && CurVal && - // accumulating + // TODO-NOW: reevaluate the masking logic in case of CurConstant && CurVal if (PrevConstant && !PrevVal) { if (CurMaskedBuilt) { @@ -272,19 +260,19 @@ void GCNLaneMaskUpdater::init(Register Reg) { Processed = false; Blocks.clear(); // SSAUpdater.Initialize(LMU.getLaneMaskConsts().LaneMaskRC); - Accumulator = {}; + Accumulator = AMDGPU::NoRegister; } /// Optional cleanup, may remove stray instructions. void GCNLaneMaskUpdater::cleanup() { Processed = false; Blocks.clear(); - Accumulator = {}; + Accumulator = AMDGPU::NoRegister; MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); if (ZeroReg && MRI.use_empty(ZeroReg)) { MRI.getVRegDef(ZeroReg)->eraseFromParent(); - ZeroReg = {}; + ZeroReg = AMDGPU::NoRegister; } for (MachineInstr *MI : PotentiallyDead) { @@ -340,7 +328,7 @@ Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) { /// Return the value at the end of the given block, i.e. after any change that /// was registered via \ref addAvailable. /// -/// Note: If \p Block is the reset block in accumulating mode with ResetAtEnd +/// Note: If \p Block is the reset block with ResetAtEnd /// reset mode, then this value will be 0. You likely want /// \ref getPreReset instead. Register GCNLaneMaskUpdater::getValueAtEndOfBlock(MachineBasicBlock &Block) { @@ -418,8 +406,7 @@ void GCNLaneMaskUpdater::process() { LMU.function()->getSubtarget().getInstrInfo(); MachineBasicBlock &Entry = LMU.function()->front(); - // Prepare an all-zero value for the default and reset in accumulating mode. 
- if (Accumulating && !ZeroReg) { + if (!ZeroReg) { ZeroReg = LMU.createLaneMaskReg(); BuildMI(Entry, Entry.getFirstTerminator(), {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ZeroReg) @@ -435,7 +422,6 @@ void GCNLaneMaskUpdater::process() { // Reset accumulator. for (BlockInfo &Info : Blocks) { - assert(Accumulating || !Info.Flags); assert(Info.Flags || Info.Value); if (!Info.Value || (Info.Flags & ResetAtEnd)) @@ -448,27 +434,17 @@ void GCNLaneMaskUpdater::process() { if (!Info.Value) continue; - // Determine the "previous" value, if any. + // Determine the "previous" value. Register Previous; - if (Info.Block != &LMU.function()->front() && - !(Info.Flags & ResetInMiddle)) { + if (Info.Block != &LMU.function()->front() && !(Info.Flags & ResetInMiddle)) Previous = Accumulator; - if (!Accumulating) { - MachineInstr *PrevInstr = MRI.getVRegDef(Previous); - if (PrevInstr && PrevInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) { - PotentiallyDead.insert(PrevInstr); - Previous = {}; - } - } - } else { - if (Accumulating) - Previous = ZeroReg; - } + else + Previous = ZeroReg; // Insert merge logic. 
MachineBasicBlock::iterator insertPt = getSaluInsertionAtEnd(*Info.Block); LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Accumulator, Previous, - Info.Value, LMA, Accumulating, Previous == ZeroReg); + Info.Value, LMA, Previous == ZeroReg); // Switching off this optimization, since Accumulator will always have a use // if (Info.Flags & ResetAtEnd) { diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h index 3fdb3b277cf06..1998801156ed6 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h @@ -51,7 +51,6 @@ class GCNLaneMaskUtils { MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA = nullptr, - bool Accumulating = false, bool isPrevZeroReg = false) const; }; @@ -72,23 +71,13 @@ class GCNLaneMaskAnalysis { /// \brief SSA-updater for lane masks. /// -/// The updater operates in one of two modes: "default" and "accumulating". -/// -/// Default mode is the analog to regular SSA construction and suitable for the -/// lowering of normal per-lane boolean values to lane masks: the mask can be -/// (re-)written multiple times for each lane. In each basic block, only the -/// lanes enabled by that block's EXEC mask are updated. Bits for lanes that -/// never contributed with an available value are undefined. -/// -/// Accumulating mode is used for some aspects of control flow lowering. In -/// this mode, each lane is assumed to provide a "true" available value only +/// Each lane is assumed to provide a "true" available value only /// once, and to never attempt to change the value back to "false" -- except /// that all lanes are reset to false in "reset blocks" as explained below. -/// In accumulating mode, the bits for lanes that never contributed with an -/// available value are 0. +/// The bits for lanes that never contributed with an available value are 0. 
/// -/// In accumulating mode, all lanes are reset to 0 at certain points in "reset -/// blocks" which are added via \ref addReset. The reset happens in one or both +/// All lanes are reset to 0 at certain points in "reset blocks" +/// which are added via \ref addReset. The reset happens in one or both /// of two modes: /// - ResetInMiddle: Reset logically happens after the point queried by /// \ref getValueInMiddleOfBlock and before the contribution of the block's @@ -109,8 +98,6 @@ class GCNLaneMaskUpdater { GCNLaneMaskUtils LMU; GCNLaneMaskAnalysis *LMA = nullptr; - bool Accumulating = false; - bool Processed = false; struct BlockInfo { @@ -137,8 +124,6 @@ class GCNLaneMaskUpdater { void init(Register Reg); void cleanup(); - void setAccumulating(bool Val) { Accumulating = Val; } - void addReset(MachineBasicBlock &Block, ResetFlags Flags); void addAvailable(MachineBasicBlock &Block, Register Value); From 35f7d2c85d002599756b84d4aad39932fbc2293d Mon Sep 17 00:00:00 2001 From: anikelal Date: Tue, 16 Dec 2025 18:14:19 +0530 Subject: [PATCH 4/4] Move getDomVRegDefInBasicBlock() into SIRegisterInfo.cpp, replace MovTermOpc operations with MovOpc --- llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 8 -------- llvm/lib/CodeGen/MachineRegisterInfo.cpp | 13 ------------- llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp | 8 ++++++++ llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp | 7 +++++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 15 +++++++++++++++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 9 +++++++++ 6 files changed, 37 insertions(+), 23 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index a7ac65287284b..737b74ef3f761 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -617,14 +617,6 @@ class MachineRegisterInfo { /// form, so there should only be one definition. 
LLVM_ABI MachineInstr *getVRegDef(Register Reg) const; - /// getDomVRegDefInBasicBlock - Return the last machine instr that defines - /// the specified virtual register in the basic block, searching backwards - /// from instruction I (exclusive). Returns MBB.end() if no definition is - /// found. - LLVM_ABI MachineBasicBlock::iterator - getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - /// getUniqueVRegDef - Return the unique machine instr that defines the /// specified virtual register or null if none is found. If there are /// multiple definitions or no definition, return null. diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index d2dea61f138bb..094315b3903ea 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -674,16 +674,3 @@ bool MachineRegisterInfo::isReservedRegUnit(MCRegUnit Unit) const { } return false; } - -MachineBasicBlock::iterator MachineRegisterInfo::getDomVRegDefInBasicBlock( - Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - if (I == MBB.begin()) - return MBB.end(); - // Iterate backwards from I (exclusive) to the beginning of the basic block - do { - --I; - if (I->modifiesRegister(Reg, getTargetRegisterInfo())) - return I; - } while (I != MBB.begin()); - return MBB.end(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp index 96d197c096aa9..27759c4a3ca3d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp @@ -2098,6 +2098,14 @@ void ControlFlowRewriter::rewrite() { LLVM_DEBUG(Function.dump()); } Updater.insertAccumulatorResets(); + // Replace MovTermOpc with MovOpc + for (MachineBasicBlock &MBB : Function) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == LMC.MovTermOpc) { + MI.setDesc(TII.get(LMC.MovOpc)); + } + } + } 
Updater.cleanup(); } diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp index 8b5b915473570..0b2cda411e91e 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp @@ -10,6 +10,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -37,7 +38,8 @@ bool GCNLaneMaskUtils::isConstantLaneMask( MachineRegisterInfo &MRI = MF.getRegInfo(); for (;;) { - MI = MRI.getDomVRegDefInBasicBlock(Reg, MBB, MI); + MI = SIRegisterInfo::getDomVRegDefInBasicBlock(Reg, MBB, MI, + MRI.getTargetRegisterInfo()); if (MI == MBB.end()) { // This can happen when called from GCNLaneMaskUpdater, where Reg can // be a placeholder that has not yet been filled in. @@ -190,7 +192,8 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, return false; } - DefInstr = MRI.getDomVRegDefInBasicBlock(Reg, UseBlock, I); + DefInstr = SIRegisterInfo::getDomVRegDefInBasicBlock( + Reg, UseBlock, I, MRI.getTargetRegisterInfo()); if (DefInstr == UseBlock.end()) return false; if (DefInstr->getOpcode() == AMDGPU::COPY) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 10af38c637a39..7c9a82291f4ae 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -4182,6 +4182,21 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { : &AMDGPU::VReg_64RegClass; } +MachineBasicBlock::iterator +SIRegisterInfo::getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const TargetRegisterInfo *TRI) { + if (I == MBB.begin()) + return MBB.end(); + // Iterate backwards from I (exclusive) to the beginning of the basic block + do { + --I; + if (I->definesRegister(Reg, TRI)) + return I; + } while 
(I != MBB.begin());
+  return MBB.end();
+}
+
 // Find reaching register definition
 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
                                               MachineInstr &Use,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index bbb32397bc5a5..cf4a2945393ed 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -417,6 +417,15 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
                            MachineRegisterInfo &MRI, LiveIntervals *LIS) const;
 
+  /// getDomVRegDefInBasicBlock - Return the last machine instr that defines
+  /// the specified virtual register in the basic block, searching backwards
+  /// from instruction I (exclusive). Returns MBB.end() if no definition is
+  /// found.
+  static MachineBasicBlock::iterator
+  getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator I,
+                            const TargetRegisterInfo *TRI);
+
   const uint32_t *getAllVGPRRegMask() const;
   const uint32_t *getAllAGPRRegMask() const;
   const uint32_t *getAllVectorRegMask() const;